// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tcp

import (
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"runtime"
	"strings"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/rand"
	"gvisor.dev/gvisor/pkg/sleep"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/ports"
	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/waiter"
)

// EndpointState represents the state of a TCP endpoint.
type EndpointState uint32

// Endpoint states. Note that are represented in a netstack-specific manner and
// may not be meaningful externally. Specifically, they need to be translated to
// Linux's representation for these states if presented to userspace.
const (
	// Endpoint states internal to netstack. These map to the TCP state CLOSED.
	StateInitial EndpointState = iota
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError

	// TCP protocol states.
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing
)

const (
	// rcvAdvWndScale is used to split the available socket buffer into
	// application buffer and the window to be advertised to the peer. This is
	// currently hard coded to split the available space equally.
	rcvAdvWndScale = 1

	// SegOverheadFactor is used to multiply the value provided by the
	// user on a SetSockOpt for setting the socket send/receive buffer sizes.
	SegOverheadFactor = 2
)

// connected returns true when s is one of the states representing an
// endpoint connected to a peer.
func (s EndpointState) connected() bool {
	switch s {
	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
		return true
	default:
		return false
	}
}

// connecting returns true when s is one of the states representing a
// connection in progress, but not yet fully established.
func (s EndpointState) connecting() bool {
	switch s {
	case StateConnecting, StateSynSent, StateSynRecv:
		return true
	default:
		return false
	}
}

// handshake returns true when s is one of the states representing an endpoint
// in the middle of a TCP handshake.
func (s EndpointState) handshake() bool {
	switch s {
	case StateSynSent, StateSynRecv:
		return true
	default:
		return false
	}
}

// closed returns true when s is one of the states an endpoint transitions to
// when closed or when it encounters an error. This is distinct from a newly
// initialized endpoint that was never connected.
func (s EndpointState) closed() bool {
	switch s {
	case StateClose, StateError:
		return true
	default:
		return false
	}
}

// String implements fmt.Stringer.String.
func (s EndpointState) String() string {
	switch s {
	case StateInitial:
		return "INITIAL"
	case StateBound:
		return "BOUND"
	case StateConnecting:
		return "CONNECTING"
	case StateError:
		return "ERROR"
	case StateEstablished:
		return "ESTABLISHED"
	case StateSynSent:
		return "SYN-SENT"
	case StateSynRecv:
		return "SYN-RCVD"
	case StateFinWait1:
		return "FIN-WAIT1"
	case StateFinWait2:
		return "FIN-WAIT2"
	case StateTimeWait:
		return "TIME-WAIT"
	case StateClose:
		return "CLOSED"
	case StateCloseWait:
		return "CLOSE-WAIT"
	case StateLastAck:
		return "LAST-ACK"
	case StateListen:
		return "LISTEN"
	case StateClosing:
		return "CLOSING"
	default:
		panic("unreachable")
	}
}

// Reasons for notifying the protocol goroutine.
const (
	notifyNonZeroReceiveWindow = 1 << iota
	notifyClose
	notifyMTUChanged
	notifyDrain
	notifyReset
	notifyResetByPeer
	// notifyAbort is a request for an expedited teardown.
	notifyAbort
	notifyKeepaliveChanged
	notifyMSSChanged
	// notifyTickleWorker is used to tickle the protocol main loop during a
	// restore after we update the endpoint state to the correct one. This
	// ensures the loop terminates if the final state of the endpoint is
	// say TIME_WAIT.
	notifyTickleWorker
	notifyError
)

// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks is the maximum number of SACK blocks we track
	// per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// blocks array above.
	NumBlocks int
}

// rcvBufAutoTuneParams are used to hold state variables to compute
// the auto tuned recv buffer size.
//
// +stateify savable
type rcvBufAutoTuneParams struct {
	// measureTime is the time at which the current measurement
	// was started.
	measureTime time.Time `state:".(unixTime)"`

	// copied is the number of bytes copied out of the receive
	// buffers since this measure began.
	copied int

	// prevCopied is the number of bytes copied out of the receive
	// buffers in the previous RTT period.
	prevCopied int

	// rtt is the non-smoothed minimum RTT as measured by observing the time
	// between when a byte is first acknowledged and the receipt of data
	// that is at least one window beyond the sequence number that was
	// acknowledged.
	rtt time.Duration

	// rttMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	rttMeasureSeqNumber seqnum.Value

	// rttMeasureTime is the absolute time at which the current rtt
	// measurement period began.
	rttMeasureTime time.Time `state:".(unixTime)"`

	// disabled is true if an explicit receive buffer is set for the
	// endpoint.
	disabled bool
}

// ReceiveErrors collect segment receive errors within transport layer.
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvList is full.
	ZeroRcvWindowState tcpip.StatCounter

	// WantZeroWindow is the number of times we wanted to advertise a
	// zero receive window but couldn't because it would have caused
	// the receive window's right edge to shrink.
	WantZeroRcvWindow tcpip.StatCounter
}

// SendErrors collect segment send errors within the transport layer.
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
	// to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}

// Stats holds statistics about the endpoint.
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}

// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}

// EndpointInfo holds useful information about a transport endpoint which
// can be queried by monitoring tools. This exists to allow tcp-only state to
// be exposed.
//
// +stateify savable
type EndpointInfo struct {
	stack.TransportEndpointInfo
}

// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
// marker interface.
func (*EndpointInfo) IsEndpointInfo() {}

// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// Each endpoint has a few mutexes:
//
// e.mu -> Primary mutex for an endpoint must be held for all operations except
// in e.Readiness where acquiring it will result in a deadlock in epoll
// implementation.
//
// The following three mutexes can be acquired independent of e.mu but if
// acquired with e.mu then e.mu must be acquired first.
//
// e.acceptMu -> protects acceptedChan.
// e.rcvListMu -> Protects the rcvList and associated fields.
// e.sndBufMu -> Protects the sndQueue and associated fields.
// e.lastErrorMu -> Protects the lastError field.
//
// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
// based on the context in which the lock is acquired. In the syscall context
// e.LockUser/e.UnlockUser should be used and when doing background processing
// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
// in brief.
//
// The reason for this locking behaviour is to avoid wakeups to handle packets.
// In cases where the endpoint is already locked the background processor can
// queue the packet up and go its merry way and the lock owner will eventually
// process the backlog when releasing the lock. Similarly when acquiring the
// lock from say a syscall goroutine we can implement a bit of spinning if we
// know that the lock is not held by another syscall goroutine. Background
// processors should never hold the lock for long and we can avoid an expensive
// sleep/wakeup by spinning for a shortwhile.
//
// For more details please see the detailed documentation on
// e.LockUser/e.UnlockUser methods.
//
// +stateify savable
type endpoint struct {
	EndpointInfo
	tcpip.DefaultSocketOptionsHandler

	// endpointEntry is used to queue endpoints for processing to the
	// a given tcp processor goroutine.
	//
	// Precondition: epQueue.mu must be held to read/write this field..
	endpointEntry `state:"nosave"`

	// pendingProcessing is true if this endpoint is queued for processing
	// to a TCP processor.
	//
	// Precondition: epQueue.mu must be held to read/write this field..
	pendingProcessing bool `state:"nosave"`

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack  `state:"manual"`
	waiterQueue *waiter.Queue `state:"wait"`
	uniqueID    uint64

	// hardError is meaningful only when state is stateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. hardError is protected by endpoint mu.
	hardError tcpip.Error

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex `state:"nosave"`
	lastError   tcpip.Error

	// rcvReadMu synchronizes calls to Read.
	//
	// mu and rcvListMu are temporarily released during data copying. rcvReadMu
	// must be held during each read to ensure atomicity, so that multiple reads
	// do not interleave.
	//
	// rcvReadMu should be held before holding mu.
	rcvReadMu sync.Mutex `state:"nosave"`

	// rcvListMu synchronizes access to rcvList.
	//
	// rcvListMu can be taken after the endpoint mu below.
	rcvListMu sync.Mutex `state:"nosave"`

	// rcvList is the queue for ready-for-delivery segments.
	//
	// rcvReadMu, mu and rcvListMu must be held, in the stated order, to read data
	// and removing segments from list. A range of segment can be determined, then
	// temporarily release mu and rcvListMu while processing the segment range.
	// This allows new segments to be appended to the list while processing.
	//
	// rcvListMu must be held to append segments to list.
	rcvList   segmentList `state:"wait"`
	rcvClosed bool
	// rcvBufSize is the total size of the receive buffer.
	rcvBufSize int
	// rcvBufUsed is the actual number of payload bytes held in the receive buffer
	// not counting any overheads of the segments itself. NOTE: This will always
	// be strictly <= rcvMemUsed below.
	rcvBufUsed    int
	rcvAutoParams rcvBufAutoTuneParams

	// rcvMemUsed tracks the total amount of memory in use by received segments
	// held in rcvList, pendingRcvdSegments and the segment queue. This is used to
	// compute the window and the actual available buffer space. This is distinct
	// from rcvBufUsed above which is the actual number of payload bytes held in
	// the buffer not including any segment overheads.
	//
	// rcvMemUsed must be accessed atomically.
	rcvMemUsed int32

	// mu protects all endpoint fields unless documented otherwise. mu must
	// be acquired before interacting with the endpoint fields.
	//
	// During handshake, mu is locked by the protocol listen goroutine and
	// released by the handshake completion goroutine.
	mu          sync.CrossGoroutineMutex `state:"nosave"`
	ownedByUser uint32

	// state must be read/set using the EndpointState()/setEndpointState()
	// methods.
	state EndpointState `state:".(EndpointState)"`

	// origEndpointState is only used during a restore phase to save the
	// endpoint state at restore time as the socket is moved to it's correct
	// state.
	origEndpointState EndpointState `state:"nosave"`

	isPortReserved    bool `state:"manual"`
	isRegistered      bool `state:"manual"`
	boundNICID        tcpip.NICID
	route             *stack.Route `state:"manual"`
	ttl               uint8
	isConnectNotified bool

	// h stores a reference to the current handshake state if the endpoint is in
	// the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep.
	// nil otherwise.
	h *handshake `state:"nosave"`

	// portFlags stores the current values of port related flags.
	portFlags ports.Flags

	// Values used to reserve a port or register a transport endpoint
	// (which ever happens first).
	boundBindToDevice tcpip.NICID
	boundPortFlags    ports.Flags
	boundDest         tcpip.FullAddress

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// workerRunning specifies if a worker goroutine is running.
	workerRunning bool

	// workerCleanup specifies if the worker goroutine must perform cleanup
	// before exiting. This can only be set to true when workerRunning is
	// also true, and they're both protected by the mutex.
	workerCleanup bool

	// sendTSOk is used to indicate when the TS Option has been negotiated.
	// When sendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1
	sendTSOk bool

	// recentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field is
	// updated if required when a new segment is received by this endpoint.
	recentTS uint32

	// recentTSTime is the unix time when we updated recentTS last.
	recentTSTime time.Time `state:".(unixTime)"`

	// tsOffset is a randomized offset added to the value of the
	// TSVal field in the timestamp option.
	tsOffset uint32

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// tcpRecovery is the loss deteoction algorithm used by TCP.
	tcpRecovery tcpip.TCPRecovery

	// sackPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	sackPermitted bool

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue `state:"wait"`

	// synRcvdCount is the number of connections for this endpoint that are
	// in SYN-RCVD state.
	synRcvdCount int

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS uint16

	// maxSynRetries is the maximum number of SYN retransmits that TCP should
	// send before aborting the attempt to connect. It cannot exceed 255.
	//
	// NOTE: This is currently a no-op and does not change the SYN
	// retransmissions.
	maxSynRetries uint8

	// windowClamp is used to bound the size of the advertised window to
	// this value.
	windowClamp uint32

	// The following fields are used to manage the send buffer. When
	// segments are ready to be sent, they are added to sndQueue and the
	// protocol goroutine is signaled via sndWaker.
	//
	// When the send side is closed, the protocol goroutine is notified via
	// sndCloseWaker, and sndClosed is set to true.
	sndBufMu      sync.Mutex `state:"nosave"`
	sndBufUsed    int
	sndClosed     bool
	sndBufInQueue seqnum.Size
	sndQueue      segmentList `state:"wait"`
	sndWaker      sleep.Waker `state:"manual"`
	sndCloseWaker sleep.Waker `state:"manual"`

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// The following are used when a "packet too big" control packet is
	// received. They are protected by sndBufMu. They are used to
	// communicate to the main protocol goroutine how many such control
	// messages have been received since the last notification was processed
	// and what was the smallest MTU seen.
	packetTooBigCount int
	sndMTU            int

	// newSegmentWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and handle new segments queued to it.
	newSegmentWaker sleep.Waker `state:"manual"`

	// notificationWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and check for notifications.
	notificationWaker sleep.Waker `state:"manual"`

	// notifyFlags is a bitmask of flags used to indicate to the protocol
	// goroutine what it was notified; this is only accessed atomically.
	notifyFlags uint32 `state:"nosave"`

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// userTimeout if non-zero specifies a user specified timeout for
	// a connection w/ pending data to send. A connection that has pending
	// unacked data will be forcibily aborted if the timeout is reached
	// without any data being acked.
	userTimeout time.Duration

	// deferAccept if non-zero specifies a user specified time during
	// which the final ACK of a handshake will be dropped provided the
	// ACK is a bare ACK and carries no data. If the timeout is crossed then
	// the bare ACK is accepted and the connection is delivered to the
	// listener.
	deferAccept time.Duration

	// pendingAccepted is a synchronization primitive used to track number
	// of connections that are queued up to be delivered to the accepted
	// channel. We use this to ensure that all goroutines blocked on writing
	// to the acceptedChan below terminate before we close acceptedChan.
	pendingAccepted sync.WaitGroup `state:"nosave"`

	// acceptMu protects acceptedChan.
	acceptMu sync.Mutex `state:"nosave"`

	// acceptCond is a condition variable that can be used to block on when
	// acceptedChan is full and an endpoint is ready to be delivered.
	//
	// This condition variable is required because just blocking on sending
	// to acceptedChan does not work in cases where endpoint.Listen is
	// called twice with different backlog values. In such cases the channel
	// is closed and a new one created. Any pending goroutines blocking on
	// the write to the channel will panic.
	//
	// We use this condition variable to block/unblock goroutines which
	// tried to deliver an endpoint but couldn't because accept backlog was
	// full ( See: endpoint.deliverAccepted ).
	acceptCond *sync.Cond `state:"nosave"`

	// acceptedChan is used by a listening endpoint protocol goroutine to
	// send newly accepted connections to the endpoint so that they can be
	// read by Accept() calls.
	acceptedChan chan *endpoint `state:".([]*endpoint)"`

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver `state:"wait"`
	snd *sender   `state:"wait"`

	// The goroutine drain completion notification channel.
	drainDone chan struct{} `state:"nosave"`

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{} `state:"nosave"`

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc `state:"nosave"`

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso *stack.GSO

	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
	stats Stats `state:"nosave"`

	// tcpLingerTimeout is the maximum amount of a time a socket
	// a socket stays in TIME_WAIT state before being marked
	// closed.
	tcpLingerTimeout time.Duration

	// closed indicates that the user has called closed on the
	// endpoint and at this point the endpoint is only around
	// to complete the TCP shutdown.
	closed bool

	// txHash is the transport layer hash to be set on outbound packets
	// emitted by this endpoint.
	txHash uint32

	// owner is used to get uid and gid of the packet.
	owner tcpip.PacketOwner

	// ops is used to get socket level options.
	ops tcpip.SocketOptions

	// lastOutOfWindowAckTime is the time at which the an ACK was sent in response
	// to an out of window segment being received by this endpoint.
	lastOutOfWindowAckTime time.Time `state:".(unixTime)"`
}

// UniqueID implements stack.TransportEndpoint.UniqueID.
func (e *endpoint) UniqueID() uint64 {
	return e.uniqueID
}

// calculateAdvertisedMSS calculates the MSS to advertise.
//
// If userMSS is non-zero and is not greater than the maximum possible MSS for
// r, it will be used; otherwise, the maximum possible MSS will be used.
func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
	// The maximum possible MSS is dependent on the route.
	// TODO(b/143359391): Respect TCP Min and Max size.
	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)

	if userMSS != 0 && userMSS < maxMSS {
		return userMSS
	}

	return maxMSS
}

// LockUser tries to lock e.mu and if it fails it will check if the lock is held
// by another syscall goroutine. If yes, then it will goto sleep waiting for the
// lock to be released, if not then it will spin till it acquires the lock or
// another syscall goroutine acquires it in which case it will goto sleep as
// described above.
//
// The assumption behind spinning here being that background packet processing
// should not be holding the lock for long and spinning reduces latency as we
// avoid an expensive sleep/wakeup of of the syscall goroutine).
func (e *endpoint) LockUser() {
	for {
		// Try first if the sock is locked then check if it's owned
		// by another user goroutine if not then we spin, otherwise
		// we just go to sleep on the Lock() and wait.
		if !e.mu.TryLock() {
			// If socket is owned by the user then just go to sleep
			// as the lock could be held for a reasonably long time.
			if atomic.LoadUint32(&e.ownedByUser) == 1 {
				e.mu.Lock()
				atomic.StoreUint32(&e.ownedByUser, 1)
				return
			}
			// Spin but yield the processor since the lower half
			// should yield the lock soon.
			runtime.Gosched()
			continue
		}
		atomic.StoreUint32(&e.ownedByUser, 1)
		return
	}
}

// UnlockUser will check if there are any segments already queued for processing
// and process any such segments before unlocking e.mu. This is required because
// we when packets arrive and endpoint lock is already held then such packets
// are queued up to be processed. If the lock is held by the endpoint goroutine
// then it will process these packets but if the lock is instead held by the
// syscall goroutine then we can have the syscall goroutine process the backlog
// before unlocking.
//
// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
// endpoint. It's also required eventually when we get rid of the endpoint
// protocol goroutine altogether.
//
// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
func (e *endpoint) UnlockUser() {
	// Lock segment queue before checking so that we avoid a race where
	// segments can be queued between the time we check if queue is empty
	// and actually unlock the endpoint mutex.
	for {
		e.segmentQueue.mu.Lock()
		if e.segmentQueue.emptyLocked() {
			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
				panic("e.UnlockUser() called without calling e.LockUser()")
			}
			e.mu.Unlock()
			e.segmentQueue.mu.Unlock()
			return
		}
		e.segmentQueue.mu.Unlock()

		switch e.EndpointState() {
		case StateEstablished:
			if err := e.handleSegments(true /* fastPath */); err != nil {
				e.notifyProtocolGoroutine(notifyTickleWorker)
			}
		default:
			// Since we are waking the endpoint goroutine here just unlock
			// and let it process the queued segments.
			e.newSegmentWaker.Assert()
			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
				panic("e.UnlockUser() called without calling e.LockUser()")
			}
			e.mu.Unlock()
			return
		}
	}
}

// StopWork halts packet processing. Only to be used in tests.
func (e *endpoint) StopWork() {
	e.mu.Lock()
}

// ResumeWork resumes packet processing. Only to be used in tests.
func (e *endpoint) ResumeWork() {
	e.mu.Unlock()
}

// setEndpointState updates the state of the endpoint to state atomically. This
// method is unexported as the only place we should update the state is in this
// package but we allow the state to be read freely without holding e.mu.
//
// Precondition: e.mu must be held to call this method.
func (e *endpoint) setEndpointState(state EndpointState) {
	oldstate := EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
	switch state {
	case StateEstablished:
		e.stack.Stats().TCP.CurrentEstablished.Increment()
		e.stack.Stats().TCP.CurrentConnected.Increment()
	case StateError:
		fallthrough
	case StateClose:
		if oldstate == StateCloseWait || oldstate == StateEstablished {
			e.stack.Stats().TCP.EstablishedResets.Increment()
		}
		fallthrough
	default:
		if oldstate == StateEstablished {
			e.stack.Stats().TCP.CurrentEstablished.Decrement()
		}
	}
	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
}

// EndpointState returns the current state of the endpoint.
func (e *endpoint) EndpointState() EndpointState {
	return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
}

// setRecentTimestamp sets the recentTS field to the provided value.
func (e *endpoint) setRecentTimestamp(recentTS uint32) {
	e.recentTS = recentTS
	e.recentTSTime = time.Now()
}

// recentTimestamp returns the value of the recentTS field.
func (e *endpoint) recentTimestamp() uint32 {
	return e.recentTS
}

// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	idle       time.Duration
	interval   time.Duration
	count      int
	unacked    int
	timer      timer       `state:"nosave"`
	waker      sleep.Waker `state:"nosave"`
}

func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
	e := &endpoint{
		stack: s,
		EndpointInfo: EndpointInfo{
			TransportEndpointInfo: stack.TransportEndpointInfo{
				NetProto:   netProto,
				TransProto: header.TCPProtocolNumber,
			},
		},
		waiterQueue: waiterQueue,
		state:       StateInitial,
		rcvBufSize:  DefaultReceiveBufferSize,
		sndMTU:      int(math.MaxInt32),
		keepalive: keepalive{
			// Linux defaults.
			idle:     2 * time.Hour,
			interval: 75 * time.Second,
			count:    9,
		},
		uniqueID:      s.UniqueID(),
		txHash:        s.Rand().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
	}
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)

	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.rcvBufSize = rs.Default
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		e.rcvAutoParams.disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	s.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery)

	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e
	e.tsOffset = timeStampOffset()
	e.acceptCond = sync.NewCond(&e.acceptMu)
	e.keepalive.timer.init(&e.keepalive.waker)

	return e
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	switch e.EndpointState() {
	case StateInitial, StateBound:
		// This prevents blocking of new sockets which are not
		// connected when SO_LINGER is set.
		result |= waiter.EventHUp

	case StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError, StateTimeWait:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted channel.
		if (mask & waiter.EventIn) != 0 {
			e.acceptMu.Lock()
			if len(e.acceptedChan) > 0 {
				result |= waiter.EventIn
			}
			e.acceptMu.Unlock()
		}
	}
	if e.EndpointState().connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.EventOut) != 0 {
			e.sndBufMu.Lock()
			sndBufSize := e.getSendBufferSize()
			if e.sndClosed || e.sndBufUsed < sndBufSize {
				result |= waiter.EventOut
			}
			e.sndBufMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.EventIn) != 0 {
			e.rcvListMu.Lock()
			if e.rcvBufUsed > 0 || e.rcvClosed {
				result |= waiter.EventIn
			}
			e.rcvListMu.Unlock()
		}
	}

	return result
}

func (e *endpoint) fetchNotifications() uint32 {
	return atomic.SwapUint32(&e.notifyFlags, 0)
}

func (e *endpoint) notifyProtocolGoroutine(n uint32) {
	for {
		v := atomic.LoadUint32(&e.notifyFlags)
		if v&n == n {
			// The flags are already set.
			return
		}

		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
			if v == 0 {
				// We are causing a transition from no flags to
				// at least one flag set, so we must cause the
				// protocol goroutine to wake up.
				e.notificationWaker.Assert()
			}
			return
		}
	}
}

// Abort implements stack.TransportEndpoint.Abort.
func (e *endpoint) Abort() {
	// The abort notification is not processed synchronously, so no
	// synchronization is needed.
	//
	// If the endpoint becomes connected after this check, we still close
	// the endpoint. This worst case results in a slower abort.
	//
	// If the endpoint disconnected after the check, nothing needs to be
	// done, so sending a notification which will potentially be ignored is
	// fine.
	//
	// If the endpoint connecting finishes after the check, the endpoint
	// is either in a connected state (where we would notifyAbort anyway),
	// SYN-RECV (where we would also notifyAbort anyway), or in an error
	// state where nothing is required and the notification can be safely
	// ignored.
	//
	// Endpoints where a Close during connecting or SYN-RECV state would be
	// problematic are set to state connecting before being registered (and
	// thus possible to be Aborted). They are never available in initial
	// state.
	//
	// Endpoints transitioning from initial to connecting state may be
	// safely either closed or sent notifyAbort.
	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
		e.notifyProtocolGoroutine(notifyAbort)
		return
	}
	e.Close()
}

// Close puts the endpoint in a closed state and frees all resources associated
// with it. It must be called only once and with no other concurrent calls to
// the endpoint.
func (e *endpoint) Close() {
	e.LockUser()
	defer e.UnlockUser()
	if e.closed {
		return
	}

	linger := e.SocketOptions().GetLinger()
	if linger.Enabled && linger.Timeout == 0 {
		s := e.EndpointState()
		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
		if isResetState {
			// Close the endpoint without doing full shutdown and
			// send a RST.
			e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
			e.closeNoShutdownLocked()

			// Wake up worker to close the endpoint.
			switch s {
			case StateSynRecv:
				e.notifyProtocolGoroutine(notifyClose)
			default:
				e.notifyProtocolGoroutine(notifyTickleWorker)
			}
			return
		}
	}

	// Issue a shutdown so that the peer knows we won't send any more data
	// if we're connected, or stop accepting if we're listening.
	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
	e.closeNoShutdownLocked()
}

// closeNoShutdown closes the endpoint without doing a full shutdown.
func (e *endpoint) closeNoShutdownLocked() {
	// For listening sockets, we always release ports inline so that they
	// are immediately available for reuse after Close() is called. If also
	// registered, we unregister as well otherwise the next user would fail
	// in Listen() when trying to register.
	if e.EndpointState() == StateListen && e.isPortReserved {
		if e.isRegistered {
			e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
			e.isRegistered = false
		}

		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, e.boundDest)
		e.isPortReserved = false
		e.boundBindToDevice = 0
		e.boundPortFlags = ports.Flags{}
		e.boundDest = tcpip.FullAddress{}
	}

	// Mark endpoint as closed.
	e.closed = true

	switch e.EndpointState() {
	case StateClose, StateError:
		return
	}

	eventMask := waiter.EventIn | waiter.EventOut
	// Either perform the local cleanup or kick the worker to make sure it
	// knows it needs to cleanup.
	if e.workerRunning {
		e.workerCleanup = true
		tcpip.AddDanglingEndpoint(e)
		// Worker will remove the dangling endpoint when the endpoint
		// goroutine terminates.
		e.notifyProtocolGoroutine(notifyClose)
	} else {
		e.transitionToStateCloseLocked()
		// Notify that the endpoint is closed.
		eventMask |= waiter.EventHUp
	}

	// The TCP closing state-machine would eventually notify EventHUp, but we
	// notify EventIn|EventOut immediately to unblock any blocked waiters.
	e.waiterQueue.Notify(eventMask)
}

// closePendingAcceptableConnections closes all connections that have completed
// handshake but not yet been delivered to the application.
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	e.acceptMu.Lock()
	if e.acceptedChan == nil {
		e.acceptMu.Unlock()
		return
	}
	close(e.acceptedChan)
	ch := e.acceptedChan
	e.acceptedChan = nil
	e.acceptCond.Broadcast()
	e.acceptMu.Unlock()

	// Reset all connections that are waiting to be accepted.
	for n := range ch {
		n.notifyProtocolGoroutine(notifyReset)
	}
	// Wait for reset of all endpoints that are still waiting to be delivered to
	// the now closed acceptedChan.
	e.pendingAccepted.Wait()
}

// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
func (e *endpoint) cleanupLocked() {
	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	e.workerCleanup = false

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, e.boundDest)
		e.isPortReserved = false
	}
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *endpoint) initialReceiveWindow() int {
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvListMu.Lock()
	if e.rcvAutoParams.disabled {
		e.rcvListMu.Unlock()
		return
	}
	now := time.Now()
	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
		e.rcvAutoParams.copied += copied
		e.rcvListMu.Unlock()
		return
	}
	prevRTTCopied := e.rcvAutoParams.copied + copied
	prevCopied := e.rcvAutoParams.prevCopied
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows it's congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive upto 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		if rcvWnd > e.rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked())
			e.rcvBufSize = rcvWnd
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked())
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
				e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
			}
		}

		// We only update prevCopied when we grow the buffer because in cases
		// where prevCopied > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.rcvAutoParams.prevCopied = prevRTTCopied
	}
	e.rcvAutoParams.measureTime = now
	e.rcvAutoParams.copied = 0
	e.rcvListMu.Unlock()
}

// SetOwner implements tcpip.Endpoint.SetOwner.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}

// Preconditions: e.mu must be held to call this function.
func (e *endpoint) hardErrorLocked() tcpip.Error {
	err := e.hardError
	e.hardError = nil
	return err
}

// Preconditions: e.mu must be held to call this function.
func (e *endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError.
func (e *endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
func (e *endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.rcvReadMu.Lock()
	defer e.rcvReadMu.Unlock()

	// N.B. Here we get a range of segments to be processed. It is safe to not
	// hold rcvListMu when processing, since we hold rcvReadMu to ensure only we
	// can remove segments from the list through commitRead().
	first, last, serr := e.startRead()
	if serr != nil {
		if _, ok := serr.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, serr
	}

	var err error
	done := 0
	s := first
	for s != nil {
		var n int
		n, err = s.data.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.

		done += n

		if opts.Peek {
			// For peek, we use the (first, last) range of segment returned from
			// startRead. We don't consume the receive buffer, so commitRead should
			// not be called.
			//
			// N.B. It is important to use `last` to determine the last segment, since
			// appending can happen while we process, and will lead to data race.
			if s == last {
				break
			}
			s = s.Next()
		} else {
			// N.B. commitRead() conveniently returns the next segment to read, after
			// removing the data/segment that is read.
			s = e.commitRead(n)
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// startRead checks that endpoint is in a readable state, and return the
// inclusive range of segments that can be read.
//
// Precondition: e.rcvReadMu must be held.
func (e *endpoint) startRead() (first, last *segment, err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	e.rcvListMu.Lock()
	defer e.rcvListMu.Unlock()

	bufUsed := e.rcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			if err := e.hardErrorLocked(); err != nil {
				return nil, nil, err
			}
			return nil, nil, &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return nil, nil, &tcpip.ErrNotConnected{}
	}

	if e.rcvBufUsed == 0 {
		if e.rcvClosed || !e.EndpointState().connected() {
			return nil, nil, &tcpip.ErrClosedForReceive{}
		}
		return nil, nil, &tcpip.ErrWouldBlock{}
	}

	return e.rcvList.Front(), e.rcvList.Back(), nil
}

// commitRead commits a read of done bytes and returns the next non-empty
// segment to read. Data read from the segment must have also been removed from
// the segment in order for this method to work correctly.
//
// It is performance critical to call commitRead frequently when servicing a big
// Read request, so TCP can make progress timely. Right now, it is designed to
// do this per segment read, hence this method conveniently returns the next
// segment to read while holding the lock.
//
// Precondition: e.rcvReadMu must be held.
func (e *endpoint) commitRead(done int) *segment {
	e.LockUser()
	defer e.UnlockUser()
	e.rcvListMu.Lock()
	defer e.rcvListMu.Unlock()

	memDelta := 0
	s := e.rcvList.Front()
	for s != nil && s.data.Size() == 0 {
		e.rcvList.Remove(s)
		// Memory is only considered released when the whole segment has been
		// read.
		memDelta += s.segMemSize()
		s.decRef()
		s = e.rcvList.Front()
	}
	e.rcvBufUsed -= done

	if memDelta > 0 {
		// If the window was small before this read and if the read freed up
		// enough buffer space, to either fit an aMSS or half a receive buffer
		// (whichever smaller), then notify the protocol goroutine to send a
		// window update.
		if crossed, above := e.windowCrossedACKThresholdLocked(memDelta); crossed && above {
			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
		}
	}

	return e.rcvList.Front()
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// Caller must hold e.mu and e.sndBufMu
func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// Write writes data to the endpoint's peer.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	nextSeg, n, err := func() (*segment, int, tcpip.Error) {
		e.sndBufMu.Lock()
		defer e.sndBufMu.Unlock()

		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			return nil, 0, err
		}

		v, err := func() ([]byte, tcpip.Error) {
			// We can release locks while copying data.
			//
			// This is not possible if atomic is set, because we can't allow the
			// available buffer space to be consumed by some other caller while we
			// are copying data in.
			if !opts.Atomic {
				e.sndBufMu.Unlock()
				defer e.sndBufMu.Lock()

				e.UnlockUser()
				defer e.LockUser()
			}

			// Fetch data.
			if l := p.Len(); l < avail {
				avail = l
			}
			if avail == 0 {
				return nil, nil
			}
			v := make([]byte, avail)
			n, err := p.Read(v)
			if err != nil && err != io.EOF {
				return nil, &tcpip.ErrBadBuffer{}
			}
			return v[:n], nil
		}()
		if len(v) == 0 || err != nil {
			return nil, 0, err
		}

		if !opts.Atomic {
			// Since we released locks in between it's possible that the
			// endpoint transitioned to a CLOSED/ERROR states so make
			// sure endpoint is still writable before trying to write.
			avail, err := e.isEndpointWritableLocked()
			if err != nil {
				e.stats.WriteErrors.WriteClosed.Increment()
				return nil, 0, err
			}

			// Discard any excess data copied in due to avail being reduced due
			// to a simultaneous write call to the socket.
			if avail < len(v) {
				v = v[:avail]
			}
		}

		// Add data to the send queue.
		s := newOutgoingSegment(e.ID, v)
		e.sndBufUsed += len(v)
		e.sndBufInQueue += seqnum.Size(len(v))
		e.sndQueue.PushBack(s)

		return e.drainSendQueueLocked(), len(v), nil
	}()
	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	if n == 0 || err != nil {
		return 0, err
	}
	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// Precondition: e.mu and e.rcvListMu must be held.
func (e *endpoint) selectWindowLocked() (wnd seqnum.Size) {
	wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked())
	maxWindow := wndFromSpace(e.rcvBufSize)
	wndFromUsedBytes := maxWindow - e.rcvBufUsed

	// We take the lesser of the wndFromAvailable and wndFromUsedBytes because in
	// cases where we receive a lot of small segments the segment overhead is a
	// lot higher and we can run out socket buffer space before we can fill the
	// previous window we advertised. In cases where we receive MSS sized or close
	// MSS sized segments we will probably run out of window space before we
	// exhaust receive buffer.
	newWnd := wndFromAvailable
	if newWnd > wndFromUsedBytes {
		newWnd = wndFromUsedBytes
	}
	if newWnd < 0 {
		newWnd = 0
	}
	return seqnum.Size(newWnd)
}

// selectWindow invokes selectWindowLocked after acquiring e.rcvListMu.
func (e *endpoint) selectWindow() (wnd seqnum.Size) {
	e.rcvListMu.Lock()
	wnd = e.selectWindowLocked()
	e.rcvListMu.Unlock()
	return wnd
}

// windowCrossedACKThresholdLocked checks if the receive window to be announced
// would be under aMSS or under the window derived from half receive buffer,
// whichever smaller. This is useful as a receive side silly window syndrome
// prevention mechanism. If window grows to reasonable value, we should send ACK
// to the sender to inform the rx space is now large. We also want ensure a
// series of small read()'s won't trigger a flood of spurious tiny ACK's.
//
// For large receive buffers, the threshold is aMSS - once reader reads more
// than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of
// receive buffer size. This is chosen arbitrairly.
// crossed will be true if the window size crossed the ACK threshold.
// above will be true if the new window is >= ACK threshold and false
// otherwise.
//
// Precondition: e.mu and e.rcvListMu must be held.
func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
	newAvail := int(e.selectWindowLocked())
	oldAvail := newAvail - deltaBefore
	if oldAvail < 0 {
		oldAvail = 0
	}
	threshold := int(e.amss)
	// rcvBufFraction is the inverse of the fraction of receive buffer size that
	// is used to decide if the available buffer space is now above it.
	const rcvBufFraction = 2
	if wndThreshold := wndFromSpace(e.rcvBufSize / rcvBufFraction); threshold > wndThreshold {
		threshold = wndThreshold
	}
	switch {
	case oldAvail < threshold && newAvail >= threshold:
		return true, true
	case oldAvail >= threshold && newAvail < threshold:
		return true, false
	}
	return false, false
}

// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
func (e *endpoint) OnReuseAddressSet(v bool) {
	e.LockUser()
	e.portFlags.TupleOnly = v
	e.UnlockUser()
}

// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
func (e *endpoint) OnReusePortSet(v bool) {
	e.LockUser()
	e.portFlags.LoadBalanced = v
	e.UnlockUser()
}

// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
func (e *endpoint) OnKeepAliveSet(v bool) {
	e.notifyProtocolGoroutine(notifyKeepaliveChanged)
}

// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
func (e *endpoint) OnDelayOptionSet(v bool) {
	if !v {
		// Handle delayed data.
		e.sndWaker.Assert()
	}
}

// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
func (e *endpoint) OnCorkOptionSet(v bool) {
	if !v {
		// Handle the corked data.
		e.sndWaker.Assert()
	}
}

func (e *endpoint) getSendBufferSize() int {
	return int(e.ops.GetSendBufferSize())
}

// SetSockOptInt sets a socket option.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()
		e.notifyProtocolGoroutine(notifyMSSChanged)

	case tcpip.MTUDiscoverOption:
		// Return not supported if attempting to set this option to
		// anything other than path MTU discovery disabled.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.ReceiveBufferSizeOption:
		// Make sure the receive buffer size is within the min and max
		// allowed.
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &rs, err))
		}

		if v > rs.Max {
			v = rs.Max
		}

		if v < math.MaxInt32/SegOverheadFactor {
			v *= SegOverheadFactor
			if v < rs.Min {
				v = rs.Min
			}
		} else {
			v = math.MaxInt32
		}

		e.LockUser()
		e.rcvListMu.Lock()

		// Make sure the receive buffer size allows us to send a
		// non-zero window size.
		scale := uint8(0)
		if e.rcv != nil {
			scale = e.rcv.rcvWndScale
		}
		if v>>scale == 0 {
			v = 1 << scale
		}

		availBefore := wndFromSpace(e.receiveBufferAvailableLocked())
		e.rcvBufSize = v
		availAfter := wndFromSpace(e.receiveBufferAvailableLocked())

		e.rcvAutoParams.disabled = true

		// Immediately send an ACK to uncork the sender silly window
		// syndrome prevetion, when our available space grows above aMSS
		// or half receive buffer, whichever smaller.
		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
		}

		e.rcvListMu.Unlock()
		e.UnlockUser()

	case tcpip.TTLOption:
		e.LockUser()
		e.ttl = uint8(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		if v == 0 {
			e.LockUser()
			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

func (e *endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				switch state {
				case StateEstablished:
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
func (e *endpoint) readyReceiveSize() (int, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// The endpoint cannot be in listen state.
	if e.EndpointState() == StateListen {
		return 0, &tcpip.ErrInvalidEndpointState{}
	}

	e.rcvListMu.Lock()
	defer e.rcvListMu.Unlock()

	return e.rcvBufUsed, nil
}

// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		v := e.keepalive.count
		e.keepalive.Unlock()
		return v, nil

	case tcpip.IPv4TOSOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		v := int(e.sendTOS)
		e.UnlockUser()
		return v, nil

	case tcpip.MaxSegOption:
		// This is just stubbed out. Linux never returns the user_mss
		// value as it either returns the defaultMSS or returns the
		// actual current MSS. Netstack just returns the defaultMSS
		// always for now.
		v := header.TCPDefaultMSS
		return v, nil

	case tcpip.MTUDiscoverOption:
		// Always return the path MTU discovery disabled setting since
		// it's the only one supported.
		return tcpip.PMTUDiscoveryDont, nil

	case tcpip.ReceiveQueueSizeOption:
		return e.readyReceiveSize()

	case tcpip.ReceiveBufferSizeOption:
		e.rcvListMu.Lock()
		v := e.rcvBufSize
		e.rcvListMu.Unlock()
		return v, nil

	case tcpip.TTLOption:
		e.LockUser()
		v := int(e.ttl)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPSynCountOption:
		e.LockUser()
		v := int(e.maxSynRetries)
		e.UnlockUser()
		return v, nil

	case tcpip.TCPWindowClampOption:
		e.LockUser()
		v := int(e.windowClamp)
		e.UnlockUser()
		return v, nil

	case tcpip.MulticastTTLOption:
		return 1, nil

	default:
		return -1, &tcpip.ErrUnknownProtocolOption{}
	}
}

func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption {
	info := tcpip.TCPInfoOption{}
	e.LockUser()
	snd := e.snd
	if snd != nil {
		// We do not calculate RTT before sending the data packets. If
		// the connection did not send and receive data, then RTT will
		// be zero.
		snd.rtt.Lock()
		info.RTT = snd.rtt.srtt
		info.RTTVar = snd.rtt.rttvar
		snd.rtt.Unlock()

		info.RTO = snd.rto
		info.CcState = snd.state
		info.SndSsthresh = uint32(snd.sndSsthresh)
		info.SndCwnd = uint32(snd.sndCwnd)
		info.ReorderSeen = snd.rc.reorderSeen
	}
	e.UnlockUser()
	return info
}

// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
	switch o := opt.(type) {
	case *tcpip.TCPInfoOption:
		*o = e.getTCPInfo()

	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
		e.keepalive.Unlock()

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
		e.keepalive.Unlock()

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		e.LockUser()
		*o = e.cc
		e.UnlockUser()

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()
		*o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		*o = tcpip.TCPDeferAcceptOption(e.deferAccept)
		e.UnlockUser()

	case *tcpip.OriginalDestinationOption:
		e.LockUser()
		ipt := e.stack.IPTables()
		addr, port, err := ipt.OriginalDst(e.ID, e.NetProto)
		e.UnlockUser()
		if err != nil {
			return err
		}
		*o = tcpip.OriginalDestinationOption{
			Addr: addr,
			Port: port,
		}

	default:
		return &tcpip.ErrUnknownProtocolOption{}
	}
	return nil
}

// checkV4MappedLocked determines the effective network protocol and converts
// addr to its canonical form.
func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) {
	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
	if err != nil {
		return tcpip.FullAddress{}, 0, err
	}
	return unwrapped, netProto, nil
}

// Disconnect implements tcpip.Endpoint.Disconnect.
func (*endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	err := e.connect(addr, true, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// connect connects the endpoint to its peer. In the normal non-S/R case, the
// new connection is expected to run the main goroutine and perform handshake.
// In restore of previously connected endpoints, both ends will be passively
// created (so no new handshaking is done); for stack-accepted connections not
// yet accepted by the app, they are restored without running the main goroutine
// here.
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrNoRoute{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	defer r.Release()

	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	e.ID.LocalAddress = r.LocalAddress
	e.ID.RemoteAddress = r.RemoteAddress
	e.ID.RemotePort = addr.Port

	if e.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.ID.LocalAddress == e.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		h := jenkins.Sum32(e.stack.Seed())
		h.Write([]byte(e.ID.LocalAddress))
		h.Write([]byte(e.ID.RemoteAddress))
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)
		h.Write(portBuf)
		portOffset := h.Sum32()

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.ID.LocalAddress) && header.IsV4LoopbackAddress(e.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.ID.LocalAddress == header.IPv6Loopback && e.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.ID.RemotePort {
				return false, nil
			}
			if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, bindToDevice, addr, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, r.NICID())
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || time.Since(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.workerCleanup = false
				tcpEP.cleanupLocked()
				tcpEP.notifyProtocolGoroutine(notifyAbort)
				tcpEP.UnlockUser()
				// Now try and Reserve again if it fails then we skip.
				if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, bindToDevice, addr, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				e.stack.ReleasePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, bindToDevice, addr)
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			return err
		}
	}

	e.isRegistered = true
	e.setEndpointState(StateConnecting)
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = netProtos
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.ID
				e.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
	}

	if run {
		if handshake {
			h := e.newHandshake()
			e.setEndpointState(StateSynSent)
			h.start()
		}
		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
		e.workerRunning = true
		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
	}

	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	return e.shutdownLocked(flags)
}

func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvListMu.Lock()
			e.rcvClosed = true
			rcvBufUsed := e.rcvBufUsed
			e.rcvListMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				// Wake up worker to terminate loop.
				e.notifyProtocolGoroutine(notifyTickleWorker)
				return nil
			}
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndBufMu.Lock()
			if e.sndClosed {
				// Already closed.
				e.sndBufMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.ID, nil)
			e.sndQueue.PushBack(s)
			e.sndBufInQueue++
			// Mark endpoint as closed.
			e.sndClosed = true
			e.sndBufMu.Unlock()
			e.handleClose()
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvListMu.Lock()
			e.rcvClosed = true
			e.rcvListMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) tcpip.Error {
	err := e.listen(backlog)
	if err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()
		if e.acceptedChan == nil {
			// listen is called after shutdown.
			e.acceptedChan = make(chan *endpoint, backlog)
			e.shutdownFlags = 0
			e.rcvListMu.Lock()
			e.rcvClosed = false
			e.rcvListMu.Unlock()
		} else {
			// Adjust the size of the channel iff we can fix
			// existing pending connections into the new one.
			if len(e.acceptedChan) > backlog {
				return &tcpip.ErrInvalidEndpointState{}
			}
			if cap(e.acceptedChan) == backlog {
				return nil
			}
			origChan := e.acceptedChan
			e.acceptedChan = make(chan *endpoint, backlog)
			close(origChan)
			for ep := range origChan {
				e.acceptedChan <- ep
			}
		}

		// Notify any blocked goroutines that they can attempt to
		// deliver endpoints again.
		e.acceptCond.Broadcast()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.setEndpointState(StateListen)

	// The channel may be non-nil when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptedChan == nil {
		e.acceptedChan = make(chan *endpoint, backlog)
	}
	e.acceptMu.Unlock()

	e.workerRunning = true
	go e.protocolListenLoop( // S/R-SAFE: drained on save.
		seqnum.Size(e.receiveBufferAvailable()))
	return nil
}

// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
func (e *endpoint) startAcceptedLoop() {
	e.workerRunning = true
	e.mu.Unlock()
	wakerInitDone := make(chan struct{})
	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
	<-wakerInitDone
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// addr if not-nil will contain the peer address of the returned endpoint.
func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvListMu.Lock()
	rcvClosed := e.rcvClosed
	e.rcvListMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	e.acceptMu.Lock()
	defer e.acceptMu.Unlock()
	var n *endpoint
	select {
	case n = <-e.acceptedChan:
		e.acceptCond.Signal()
	default:
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return e.bindLocked(addr)
}

func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) {
	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.EndpointState() != StateInitial {
		return &tcpip.ErrAlreadyBound{}
	}

	e.BindAddr = addr.Addr
	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	netProtos := []tcpip.NetworkProtocolNumber{netProto}

	// Expand netProtos to include v4 and v6 under dual-stack if the caller is
	// binding to a wildcard (empty) address, and this is an IPv6 endpoint with
	// v6only set to false.
	if netProto == header.IPv6ProtocolNumber {
		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4
		if alsoBindToV4 {
			netProtos = append(netProtos, header.IPv4ProtocolNumber)
		}
	}

	var nic tcpip.NICID
	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return &tcpip.ErrBadLocalAddress{}
		}
		e.ID.LocalAddress = addr.Addr
	}

	bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, bindToDevice, tcpip.FullAddress{}, func(p uint16) bool {
		id := e.ID
		id.LocalPort = p
		// CheckRegisterTransportEndpoint should only return an error if there is a
		// listening endpoint bound with the same id and portFlags and bindToDevice
		// options.
		//
		// NOTE: Only listening and connected endpoint register with
		// demuxer. Further connected endpoints always have a remote
		// address/port. Hence this will only return an error if there is a matching
		// listening endpoint.
		if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil {
			return false
		}
		return true
	})
	if err != nil {
		return err
	}

	e.boundBindToDevice = bindToDevice
	e.boundPortFlags = e.portFlags
	// TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct.
	e.boundNICID = nic
	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.ID.LocalPort = port

	// Mark endpoint as bound.
	e.setEndpointState(StateBound)

	return nil
}

// GetLocalAddress returns the address to which the endpoint is bound.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	return tcpip.FullAddress{
		Addr: e.ID.LocalAddress,
		Port: e.ID.LocalPort,
		NIC:  e.boundNICID,
	}, nil
}

// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	if !e.EndpointState().connected() {
		return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
	}

	return e.getRemoteAddress(), nil
}

func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
	return tcpip.FullAddress{
		Addr: e.ID.RemoteAddress,
		Port: e.ID.RemotePort,
		NIC:  e.boundNICID,
	}
}

func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
	// TCP HandlePacket is not required anymore as inbound packets first
	// land at the Dispatcher which then can either deliver using the
	// worker go routine or directly do the invoke the tcp processing inline
	// based on the state of the endpoint.
}

func (e *endpoint) enqueueSegment(s *segment) bool {
	// Send packet to worker goroutine.
	if !e.segmentQueue.enqueue(s) {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		return false
	}
	return true
}

func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) {
	// Update last error first.
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()

	// Update the error queue if IP_RECVERR is enabled.
	if e.SocketOptions().GetRecvError() {
		e.SocketOptions().QueueErr(&tcpip.SockError{
			Err:   err,
			Cause: transErr,
			// Linux passes the payload with the TCP header. We don't know if the TCP
			// header even exists, it may not for fragmented packets.
			Payload: pkt.Data.ToView(),
			Dst: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.ID.RemoteAddress,
				Port: e.ID.RemotePort,
			},
			Offender: tcpip.FullAddress{
				NIC:  pkt.NICID,
				Addr: e.ID.LocalAddress,
				Port: e.ID.LocalPort,
			},
			NetProto: pkt.NetworkProtocolNumber,
		})
	}

	// Notify of the error.
	e.notifyProtocolGoroutine(notifyError)
}

// HandleError implements stack.TransportEndpoint.
func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) {
	handlePacketTooBig := func(mtu uint32) {
		e.sndBufMu.Lock()
		e.packetTooBigCount++
		if v := int(mtu); v < e.sndMTU {
			e.sndMTU = v
		}
		e.sndBufMu.Unlock()
		e.notifyProtocolGoroutine(notifyMTUChanged)
	}

	// TODO(gvisor.dev/issues/5270): Handle all transport errors.
	switch transErr.Kind() {
	case stack.PacketTooBigTransportError:
		handlePacketTooBig(transErr.Info())
	case stack.DestinationHostUnreachableTransportError:
		e.onICMPError(&tcpip.ErrNoRoute{}, transErr, pkt)
	case stack.DestinationNetworkUnreachableTransportError:
		e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt)
	}
}

// updateSndBufferUsage is called by the protocol goroutine when room opens up
// in the send buffer. The number of newly available bytes is v.
func (e *endpoint) updateSndBufferUsage(v int) {
	sendBufferSize := e.getSendBufferSize()
	e.sndBufMu.Lock()
	notify := e.sndBufUsed >= sendBufferSize>>1
	e.sndBufUsed -= v
	// We only notify when there is half the sendBufferSize available after
	// a full buffer event occurs. This ensures that we don't wake up
	// writers to queue just 1-2 segments and go back to sleep.
	notify = notify && e.sndBufUsed < int(sendBufferSize)>>1
	e.sndBufMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.EventOut)
	}
}

// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
func (e *endpoint) readyToRead(s *segment) {
	e.rcvListMu.Lock()
	if s != nil {
		e.rcvBufUsed += s.payloadSize()
		s.incRef()
		e.rcvList.PushBack(s)
	} else {
		e.rcvClosed = true
	}
	e.rcvListMu.Unlock()
	e.waiterQueue.Notify(waiter.EventIn)
}

// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// rcvListMu must be held when this function is called.
func (e *endpoint) receiveBufferAvailableLocked() int {
	// We may use more bytes than the buffer size when the receive buffer
	// shrinks.
	memUsed := e.receiveMemUsed()
	if memUsed >= e.rcvBufSize {
		return 0
	}

	return e.rcvBufSize - memUsed
}

// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer based on the actual memory used by all segments held in
// receive buffer/pending and segment queue.
func (e *endpoint) receiveBufferAvailable() int {
	e.rcvListMu.Lock()
	available := e.receiveBufferAvailableLocked()
	e.rcvListMu.Unlock()
	return available
}

// receiveBufferUsed returns the amount of in-use receive buffer.
func (e *endpoint) receiveBufferUsed() int {
	e.rcvListMu.Lock()
	used := e.rcvBufUsed
	e.rcvListMu.Unlock()
	return used
}

// receiveBufferSize returns the current size of the receive buffer.
func (e *endpoint) receiveBufferSize() int {
	e.rcvListMu.Lock()
	size := e.rcvBufSize
	e.rcvListMu.Unlock()
	return size
}

// receiveMemUsed returns the total memory in use by segments held by this
// endpoint.
func (e *endpoint) receiveMemUsed() int {
	return int(atomic.LoadInt32(&e.rcvMemUsed))
}

// updateReceiveMemUsed adds the provided delta to e.rcvMemUsed.
func (e *endpoint) updateReceiveMemUsed(delta int) {
	atomic.AddInt32(&e.rcvMemUsed, int32(delta))
}

// maxReceiveBufferSize returns the stack wide maximum receive buffer size for
// an endpoint.
func (e *endpoint) maxReceiveBufferSize() int {
	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
		// As a fallback return the hardcoded max buffer size.
		return MaxBufferSize
	}
	return rs.Max
}

// rcvWndScaleForHandshake computes the receive window scale to offer to the
// peer when window scaling is enabled (true by default). If auto-tuning is
// disabled then the window scaling factor is based on the size of the
// receiveBuffer otherwise we use the max permissible receive buffer size to
// compute the scale.
func (e *endpoint) rcvWndScaleForHandshake() int {
	bufSizeForScale := e.receiveBufferSize()

	e.rcvListMu.Lock()
	autoTuningDisabled := e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()
	if autoTuningDisabled {
		return FindWndScale(seqnum.Size(bufSizeForScale))
	}

	return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}

// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
	if e.sendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
		e.setRecentTimestamp(tsVal)
	}
}

// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that timestamp option was negotiated. It also
// initializes the recentTS with the value provided in synOpts.TSval.
func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
	if synOpts.TS {
		e.sendTSOk = true
		e.setRecentTimestamp(synOpts.TSVal)
	}
}

// timestamp returns the timestamp value to be used in the TSVal field of the
// timestamp option for outgoing TCP segments for a given endpoint.
func (e *endpoint) timestamp() uint32 {
	return tcpTimeStamp(time.Now(), e.tsOffset)
}

// tcpTimeStamp returns a timestamp offset by the provided offset. This is
// not inlined above as it's used when SYN cookies are in use and endpoint
// is not created at the time when the SYN cookie is sent.
func tcpTimeStamp(curTime time.Time, offset uint32) uint32 {
	return uint32(curTime.Unix()*1000+int64(curTime.Nanosecond()/1e6)) + offset
}

// timeStampOffset returns a randomized timestamp offset to be used when sending
// timestamp values in a timestamp option for a TCP segment.
func timeStampOffset() uint32 {
	b := make([]byte, 4)
	if _, err := rand.Read(b); err != nil {
		panic(err)
	}
	// Initialize a random tsOffset that will be added to the recentTS
	// everytime the timestamp is sent when the Timestamp option is enabled.
	//
	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
	// why this is required.
	//
	// NOTE: This is not completely to spec as normally this should be
	// initialized in a manner analogous to how sequence numbers are
	// randomized per connection basis. But for now this is sufficient.
	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}

// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
// if the SYN options indicate that the SACK option was negotiated and the TCP
// stack is configured to enable TCP SACK option.
func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
	var v tcpip.TCPSACKEnabled
	if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
		// Stack doesn't support SACK. So just return.
		return
	}
	if bool(v) && synOpts.SACKPermitted {
		e.sackPermitted = true
	}
}

// maxOptionSize return the maximum size of TCP options.
func (e *endpoint) maxOptionSize() (size int) {
	var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
	options := e.makeOptions(maxSackBlocks[:])
	size = len(options)
	putOptions(options)

	return size
}

// completeState makes a full copy of the endpoint and returns it. This is used
// before invoking the probe. The state returned may not be fully consistent if
// there are intervening syscalls when the state is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
	var s stack.TCPEndpointState
	s.SegTime = time.Now()

	// Copy EndpointID.
	s.ID = stack.TCPEndpointID(e.ID)

	// Copy endpoint rcv state.
	e.rcvListMu.Lock()
	s.RcvBufSize = e.rcvBufSize
	s.RcvBufUsed = e.rcvBufUsed
	s.RcvClosed = e.rcvClosed
	s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
	s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
	s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
	s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
	s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
	s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
	s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()

	// Endpoint TCP Option state.
	s.SendTSOk = e.sendTSOk
	s.RecentTS = e.recentTimestamp()
	s.TSOffset = e.tsOffset
	s.SACKPermitted = e.sackPermitted
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// Copy endpoint send state.
	sndBufSize := e.getSendBufferSize()
	e.sndBufMu.Lock()
	s.SndBufSize = sndBufSize
	s.SndBufUsed = e.sndBufUsed
	s.SndClosed = e.sndClosed
	s.SndBufInQueue = e.sndBufInQueue
	s.PacketTooBigCount = e.packetTooBigCount
	s.SndMTU = e.sndMTU
	e.sndBufMu.Unlock()

	// Copy receiver state.
	s.Receiver = stack.TCPReceiverState{
		RcvNxt:         e.rcv.rcvNxt,
		RcvAcc:         e.rcv.rcvAcc,
		RcvWndScale:    e.rcv.rcvWndScale,
		PendingBufUsed: e.rcv.pendingBufUsed,
	}

	// Copy sender state.
	s.Sender = stack.TCPSenderState{
		LastSendTime: e.snd.lastSendTime,
		DupAckCount:  e.snd.dupAckCount,
		FastRecovery: stack.TCPFastRecoveryState{
			Active:    e.snd.fr.active,
			First:     e.snd.fr.first,
			Last:      e.snd.fr.last,
			MaxCwnd:   e.snd.fr.maxCwnd,
			HighRxt:   e.snd.fr.highRxt,
			RescueRxt: e.snd.fr.rescueRxt,
		},
		SndCwnd:          e.snd.sndCwnd,
		Ssthresh:         e.snd.sndSsthresh,
		SndCAAckCount:    e.snd.sndCAAckCount,
		Outstanding:      e.snd.outstanding,
		SackedOut:        e.snd.sackedOut,
		SndWnd:           e.snd.sndWnd,
		SndUna:           e.snd.sndUna,
		SndNxt:           e.snd.sndNxt,
		RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
		RTTMeasureTime:   e.snd.rttMeasureTime,
		Closed:           e.snd.closed,
		RTO:              e.snd.rto,
		MaxPayloadSize:   e.snd.maxPayloadSize,
		SndWndScale:      e.snd.sndWndScale,
		MaxSentAck:       e.snd.maxSentAck,
	}
	e.snd.rtt.Lock()
	s.Sender.SRTT = e.snd.rtt.srtt
	s.Sender.SRTTInited = e.snd.rtt.srttInited
	e.snd.rtt.Unlock()

	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = stack.TCPCubicState{
			WMax:                    cubic.wMax,
			WLastMax:                cubic.wLastMax,
			T:                       cubic.t,
			TimeSinceLastCongestion: time.Since(cubic.t),
			C:                       cubic.c,
			K:                       cubic.k,
			Beta:                    cubic.beta,
			WC:                      cubic.wC,
			WEst:                    cubic.wEst,
		}
	}

	rc := &e.snd.rc
	s.Sender.RACKState = stack.TCPRACKState{
		XmitTime:      rc.xmitTime,
		EndSequence:   rc.endSequence,
		FACK:          rc.fack,
		RTT:           rc.rtt,
		Reord:         rc.reorderSeen,
		DSACKSeen:     rc.dsackSeen,
		ReoWnd:        rc.reoWnd,
		ReoWndIncr:    rc.reoWndIncr,
		ReoWndPersist: rc.reoWndPersist,
		RTTSeq:        rc.rttSeq,
	}
	return s
}

func (e *endpoint) initHardwareGSO() {
	gso := &stack.GSO{}
	switch e.route.NetProto {
	case header.IPv4ProtocolNumber:
		gso.Type = stack.GSOTCPv4
		gso.L3HdrLen = header.IPv4MinimumSize
	case header.IPv6ProtocolNumber:
		gso.Type = stack.GSOTCPv6
		gso.L3HdrLen = header.IPv6MinimumSize
	default:
		panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
	}
	gso.NeedsCsum = true
	gso.CsumOffset = header.TCPChecksumOffset
	gso.MaxSize = e.route.GSOMaxSize()
	e.gso = gso
}

func (e *endpoint) initGSO() {
	if e.route.HasHardwareGSOCapability() {
		e.initHardwareGSO()
	} else if e.route.HasSoftwareGSOCapability() {
		e.gso = &stack.GSO{
			MaxSize:   e.route.GSOMaxSize(),
			Type:      stack.GSOSW,
			NeedsCsum: false,
		}
	}
}

// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
	return uint32(e.EndpointState())
}

// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
	e.LockUser()
	// Make a copy of the endpoint info.
	ret := e.TransportEndpointInfo
	e.UnlockUser()
	return &ret
}

// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
	return &e.stats
}

// Wait implements stack.TransportEndpoint.Wait.
func (e *endpoint) Wait() {
	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
	e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp)
	defer e.waiterQueue.EventUnregister(&waitEntry)
	for {
		e.LockUser()
		running := e.workerRunning
		e.UnlockUser()
		if !running {
			break
		}
		<-notifyCh
	}
}

// SocketOptions implements tcpip.Endpoint.SocketOptions.
func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}

// GetTCPSendBufferLimits is used to get send buffer size limits for TCP.
func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption {
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil {
		panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err))
	}

	return tcpip.SendBufferSizeOption{
		Min:     ss.Min,
		Default: ss.Default,
		Max:     ss.Max,
	}
}

// allowOutOfWindowAck returns true if an out-of-window ACK can be sent now.
func (e *endpoint) allowOutOfWindowAck() bool {
	var limit stack.TCPInvalidRateLimitOption
	if err := e.stack.Option(&limit); err != nil {
		panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err))
	}

	now := time.Now()
	if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) {
		return false
	}

	e.lastOutOfWindowAckTime = now
	return true
}