24 files changed, 1110 insertions, 984 deletions
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
index 440cb0352..74fe19e98 100644
--- a/pkg/tcpip/transport/icmp/endpoint.go
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -49,6 +49,7 @@ const (
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
 
 	// The following fields are initialized at creation time and are
 	// immutable.
@@ -71,11 +72,9 @@ type endpoint struct {
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 	state         endpointState
-	route         stack.Route `state:"manual"`
+	route         *stack.Route `state:"manual"`
 	ttl           uint8
 	stats         tcpip.TransportEndpointStats `state:"nosave"`
-	// linger is used for SO_LINGER socket option.
-	linger tcpip.LingerOption
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
@@ -85,7 +84,7 @@ type endpoint struct {
 }
 
 func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
-	return &endpoint{
+	ep := &endpoint{
 		stack: s,
 		TransportEndpointInfo: stack.TransportEndpointInfo{
 			NetProto:   netProto,
@@ -96,7 +95,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 		sndBufSize:    32 * 1024,
 		state:         stateInitial,
 		uniqueID:      s.UniqueID(),
-	}, nil
+	}
+	ep.ops.InitHandler(ep)
+	return ep, nil
 }
 
 // UniqueID implements stack.TransportEndpoint.UniqueID.
@@ -129,7 +130,10 @@ func (e *endpoint) Close() {
 	}
 	e.rcvMu.Unlock()
 
-	e.route.Release()
+	if e.route != nil {
+		e.route.Release()
+		e.route = nil
+	}
 
 	// Update the state.
 	e.state = stateClosed
@@ -142,6 +146,7 @@ func (e *endpoint) Close() {
 // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
 func (e *endpoint) ModerateRecvBuf(copied int) {}
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
@@ -267,26 +272,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 	}
 
-	var route *stack.Route
-	if to == nil {
-		route = &e.route
-
-		if route.IsResolutionRequired() {
-			// Promote lock to exclusive if using a shared route,
-			// given that it may need to change in Route.Resolve()
-			// call below.
-			e.mu.RUnlock()
-			defer e.mu.RLock()
-
-			e.mu.Lock()
-			defer e.mu.Unlock()
-
-			// Recheck state after lock was re-acquired.
-			if e.state != stateConnected {
-				return 0, nil, tcpip.ErrInvalidEndpointState
-			}
-		}
-	} else {
+	route := e.route
+	if to != nil {
 		// Reject destination address if it goes through a different
 		// NIC than the endpoint was bound to.
 		nicID := to.NIC
@@ -310,7 +297,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 		defer r.Release()
 
-		route = &r
+		route = r
 	}
 
 	if route.IsResolutionRequired() {
@@ -343,26 +330,12 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 }
 
 // Peek only returns data from a single datagram, so do nothing here.
-func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
-	return 0, tcpip.ControlMessages{}, nil
+func (e *endpoint) Peek([][]byte) (int64, *tcpip.Error) {
+	return 0, nil
 }
 
 // SetSockOpt sets a socket option.
 func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
-	switch v := opt.(type) {
-	case *tcpip.SocketDetachFilterOption:
-		return nil
-
-	case *tcpip.LingerOption:
-		e.mu.Lock()
-		e.linger = *v
-		e.mu.Unlock()
-	}
-	return nil
-}
-
-// SetSockOptBool sets a socket option. Currently not supported.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
 	return nil
 }
 
@@ -378,17 +351,6 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	return nil
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
-		return false, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -426,16 +388,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
-	switch o := opt.(type) {
-	case *tcpip.LingerOption:
-		e.mu.Lock()
-		*o = e.linger
-		e.mu.Unlock()
-		return nil
-
-	default:
-		return tcpip.ErrUnknownProtocolOption
-	}
+	return tcpip.ErrUnknownProtocolOption
 }
 
 func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error {
@@ -857,6 +810,7 @@ func (*endpoint) LastError() *tcpip.Error {
 	return nil
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
index 3bff3755a..9faab4b9e 100644
--- a/pkg/tcpip/transport/packet/endpoint.go
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -60,6 +60,8 @@ type packet struct {
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
+
 	// The following fields are initialized at creation time and are
 	// immutable.
 	stack       *stack.Stack `state:"manual"`
@@ -83,8 +85,6 @@ type endpoint struct {
 	stats         tcpip.TransportEndpointStats `state:"nosave"`
 	bound         bool
 	boundNIC      tcpip.NICID
-	// linger is used for SO_LINGER socket option.
-	linger tcpip.LingerOption
 
 	// lastErrorMu protects lastError.
 	lastErrorMu sync.Mutex   `state:"nosave"`
@@ -107,6 +107,7 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSize:    32 * 1024,
 	}
+	ep.ops.InitHandler(ep)
 
 	// Override with stack defaults.
 	var ss stack.SendBufferSizeOption
@@ -203,8 +204,8 @@ func (*endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-cha
 }
 
 // Peek implements tcpip.Endpoint.Peek.
-func (*endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
-	return 0, tcpip.ControlMessages{}, nil
+func (*endpoint) Peek([][]byte) (int64, *tcpip.Error) {
+	return 0, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be
@@ -303,26 +304,15 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 // used with SetSockOpt, and this function always returns
 // tcpip.ErrNotSupported.
 func (ep *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
-	switch v := opt.(type) {
+	switch opt.(type) {
 	case *tcpip.SocketDetachFilterOption:
 		return nil
 
-	case *tcpip.LingerOption:
-		ep.mu.Lock()
-		ep.linger = *v
-		ep.mu.Unlock()
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
-func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	return tcpip.ErrUnknownProtocolOption
-}
-
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
@@ -378,26 +368,7 @@ func (ep *endpoint) LastError() *tcpip.Error {
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (ep *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
-	switch o := opt.(type) {
-	case *tcpip.LingerOption:
-		ep.mu.Lock()
-		*o = ep.linger
-		ep.mu.Unlock()
-		return nil
-
-	default:
-		return tcpip.ErrNotSupported
-	}
-}
-
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (*endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.AcceptConnOption:
-		return false, nil
-	default:
-		return false, tcpip.ErrNotSupported
-	}
+	return tcpip.ErrNotSupported
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
@@ -551,8 +522,10 @@ func (ep *endpoint) Stats() tcpip.EndpointStats {
 	return &ep.stats
 }
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {}
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (ep *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &ep.ops
 }
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
index 4ae1f92ab..87c60bdab 100644
--- a/pkg/tcpip/transport/raw/endpoint.go
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -58,12 +58,13 @@ type rawPacket struct {
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
+
 	// The following fields are initialized at creation time and are
 	// immutable.
 	stack       *stack.Stack `state:"manual"`
 	waiterQueue *waiter.Queue
 	associated  bool
-	hdrIncluded bool
 
 	// The following fields are used to manage the receive queue and are
 	// protected by rcvMu.
@@ -82,10 +83,8 @@ type endpoint struct {
 	bound         bool
 	// route is the route to a remote network endpoint. It is set via
 	// Connect(), and is valid only when conneted is true.
-	route stack.Route                  `state:"manual"`
+	route *stack.Route                 `state:"manual"`
 	stats tcpip.TransportEndpointStats `state:"nosave"`
-	// linger is used for SO_LINGER socket option.
-	linger tcpip.LingerOption
 
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
@@ -114,8 +113,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt
 		rcvBufSizeMax: 32 * 1024,
 		sndBufSizeMax: 32 * 1024,
 		associated:    associated,
-		hdrIncluded:   !associated,
 	}
+	e.ops.InitHandler(e)
+	e.ops.SetHeaderIncluded(!associated)
 
 	// Override with stack defaults.
 	var ss stack.SendBufferSizeOption
@@ -170,9 +170,11 @@ func (e *endpoint) Close() {
 		e.rcvList.Remove(e.rcvList.Front())
 	}
 
-	if e.connected {
+	e.connected = false
+
+	if e.route != nil {
 		e.route.Release()
-		e.connected = false
+		e.route = nil
 	}
 
 	e.closed = true
@@ -223,6 +225,13 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		return 0, nil, tcpip.ErrInvalidOptionValue
 	}
 
+	if opts.To != nil {
+		// Raw sockets do not support sending to an IPv4 address on an IPv6 endpoint.
+		if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(opts.To.Addr) != header.IPv6AddressSize {
+			return 0, nil, tcpip.ErrInvalidOptionValue
+		}
+	}
+
 	n, ch, err := e.write(p, opts)
 	switch err {
 	case nil:
@@ -266,7 +275,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 	// If this is an unassociated socket and callee provided a nonzero
 	// destination address, route using that address.
-	if e.hdrIncluded {
+	if e.ops.GetHeaderIncluded() {
 		ip := header.IPv4(payloadBytes)
 		if !ip.IsValid(len(payloadBytes)) {
 			e.mu.RUnlock()
@@ -296,7 +305,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 
 		if e.route.IsResolutionRequired() {
-			savedRoute := &e.route
+			savedRoute := e.route
 			// Promote lock to exclusive if using a shared route,
 			// given that it may need to change in finishWrite.
 			e.mu.RUnlock()
@@ -304,7 +313,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 
 			// Make sure that the route didn't change during the
 			// time we didn't hold the lock.
-			if !e.connected || savedRoute != &e.route {
+			if !e.connected || savedRoute != e.route {
 				e.mu.Unlock()
 				return 0, nil, tcpip.ErrInvalidEndpointState
 			}
@@ -314,7 +323,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 			return n, ch, err
 		}
 
-		n, ch, err := e.finishWrite(payloadBytes, &e.route)
+		n, ch, err := e.finishWrite(payloadBytes, e.route)
 		e.mu.RUnlock()
 		return n, ch, err
 	}
@@ -335,7 +344,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		return 0, nil, err
 	}
 
-	n, ch, err := e.finishWrite(payloadBytes, &route)
+	n, ch, err := e.finishWrite(payloadBytes, route)
 	route.Release()
 	e.mu.RUnlock()
 	return n, ch, err
@@ -356,7 +365,7 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 		}
 	}
 
-	if e.hdrIncluded {
+	if e.ops.GetHeaderIncluded() {
 		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
 			Data: buffer.View(payloadBytes).ToVectorisedView(),
 		})
@@ -382,8 +391,8 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64,
 }
 
 // Peek implements tcpip.Endpoint.Peek.
-func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
-	return 0, tcpip.ControlMessages{}, nil
+func (e *endpoint) Peek([][]byte) (int64, *tcpip.Error) {
+	return 0, nil
 }
 
 // Disconnect implements tcpip.Endpoint.Disconnect.
@@ -393,6 +402,11 @@ func (*endpoint) Disconnect() *tcpip.Error {
 
 // Connect implements tcpip.Endpoint.Connect.
 func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	// Raw sockets do not support connecting to an IPv4 address on an IPv6 endpoint.
+	if e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber && len(addr.Addr) != header.IPv6AddressSize {
+		return tcpip.ErrAddressFamilyNotSupported
+	}
+
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
@@ -516,33 +530,15 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
 
 // SetSockOpt implements tcpip.Endpoint.SetSockOpt.
 func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
-	switch v := opt.(type) {
+	switch opt.(type) {
 	case *tcpip.SocketDetachFilterOption:
 		return nil
 
-	case *tcpip.LingerOption:
-		e.mu.Lock()
-		e.linger = *v
-		e.mu.Unlock()
-		return nil
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
 }
 
-// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	switch opt {
-	case tcpip.IPHdrIncludedOption:
-		e.mu.Lock()
-		e.hdrIncluded = v
-		e.mu.Unlock()
-		return nil
-	}
-	return tcpip.ErrUnknownProtocolOption
-}
-
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
 func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 	switch opt {
@@ -589,33 +585,7 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
 
 // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
 func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
-	switch o := opt.(type) {
-	case *tcpip.LingerOption:
-		e.mu.Lock()
-		*o = e.linger
-		e.mu.Unlock()
-		return nil
-
-	default:
-		return tcpip.ErrUnknownProtocolOption
-	}
-}
-
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.KeepaliveEnabledOption, tcpip.AcceptConnOption:
-		return false, nil
-
-	case tcpip.IPHdrIncludedOption:
-		e.mu.Lock()
-		v := e.hdrIncluded
-		e.mu.Unlock()
-		return v, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
+	return tcpip.ErrUnknownProtocolOption
 }
 
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
@@ -756,10 +726,12 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 // Wait implements stack.TransportEndpoint.Wait.
 func (*endpoint) Wait() {}
 
+// LastError implements tcpip.Endpoint.LastError.
 func (*endpoint) LastError() *tcpip.Error {
 	return nil
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go
index 7d97cbdc7..4a7e1c039 100644
--- a/pkg/tcpip/transport/raw/endpoint_state.go
+++ b/pkg/tcpip/transport/raw/endpoint_state.go
@@ -73,7 +73,13 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	// If the endpoint is connected, re-connect.
 	if e.connected {
 		var err *tcpip.Error
-		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, e.route.RemoteAddress, e.NetProto, false)
+		// TODO(gvisor.dev/issue/4906): Properly restore the route with the right
+		// remote address. We used to pass e.route.RemoteAddress which was
+		// effectively the empty address but since moving e.route to hold a pointer
+		// to a route instead of the route by value, we pass the empty address
+		// directly. Obviously this was always wrong since we should provide the
+		// remote address we were connected to, to properly restore the route.
+		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, "", e.NetProto, false)
 		if err != nil {
 			panic(err)
 		}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
index 518449602..cf232b508 100644
--- a/pkg/tcpip/transport/tcp/BUILD
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test", "more_shards")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -45,7 +45,9 @@ go_library(
         "rcv.go",
         "rcv_state.go",
         "reno.go",
+        "reno_recovery.go",
         "sack.go",
+        "sack_recovery.go",
         "sack_scoreboard.go",
         "segment.go",
         "segment_heap.go",
@@ -91,7 +93,7 @@ go_test(
         "tcp_test.go",
         "tcp_timestamp_test.go",
     ],
-    shard_count = 10,
+    shard_count = more_shards,
     deps = [
         ":tcp",
         "//pkg/rand",
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
index 6e5adc383..3e1041cbe 100644
--- a/pkg/tcpip/transport/tcp/accept.go
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -213,7 +213,7 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i
 	route.ResolveWith(s.remoteLinkAddr)
 
 	n := newEndpoint(l.stack, netProto, queue)
-	n.v6only = l.v6Only
+	n.ops.SetV6Only(l.v6Only)
 	n.ID = s.id
 	n.boundNICID = s.nicID
 	n.route = route
@@ -599,7 +599,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er
 				ack:    s.sequenceNumber + 1,
 				rcvWnd: ctx.rcvWnd,
 			}
-			if err := e.sendSynTCP(&route, fields, synOpts); err != nil {
+			if err := e.sendSynTCP(route, fields, synOpts); err != nil {
 				return err
 			}
 			e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
@@ -752,7 +752,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) *tcpip.Er
 // its own goroutine and is responsible for handling connection requests.
 func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) {
 	e.mu.Lock()
-	v6Only := e.v6only
+	v6Only := e.ops.GetV6Only()
 	ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto)
 
 	defer func() {
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
index ac6d879a7..c944dccc0 100644
--- a/pkg/tcpip/transport/tcp/connect.go
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"encoding/binary"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/rand"
@@ -133,7 +134,7 @@ func FindWndScale(wnd seqnum.Size) int {
 		return 0
 	}
 
-	max := seqnum.Size(0xffff)
+	max := seqnum.Size(math.MaxUint16)
 	s := 0
 	for wnd > max && s < header.MaxWndScale {
 		s++
@@ -300,7 +301,7 @@ func (h *handshake) synSentState(s *segment) *tcpip.Error {
 	if ttl == 0 {
 		ttl = h.ep.route.DefaultTTL()
 	}
-	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+	h.ep.sendSynTCP(h.ep.route, tcpFields{
 		id:     h.ep.ID,
 		ttl:    ttl,
 		tos:    h.ep.sendTOS,
@@ -361,7 +362,7 @@ func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
 			SACKPermitted: h.ep.sackPermitted,
 			MSS:           h.ep.amss,
 		}
-		h.ep.sendSynTCP(&h.ep.route, tcpFields{
+		h.ep.sendSynTCP(h.ep.route, tcpFields{
 			id:     h.ep.ID,
 			ttl:    h.ep.ttl,
 			tos:    h.ep.sendTOS,
@@ -496,7 +497,7 @@ func (h *handshake) resolveRoute() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.LastError()
+				return h.ep.lastErrorLocked()
 			}
 		}
 
@@ -547,7 +548,7 @@ func (h *handshake) start() *tcpip.Error {
 	}
 
 	h.sendSYNOpts = synOpts
-	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+	h.ep.sendSynTCP(h.ep.route, tcpFields{
 		id:     h.ep.ID,
 		ttl:    h.ep.ttl,
 		tos:    h.ep.sendTOS,
@@ -575,7 +576,6 @@ func (h *handshake) complete() *tcpip.Error {
 		return err
 	}
 	defer timer.stop()
-
 	for h.state != handshakeCompleted {
 		// Unlock before blocking, and reacquire again afterwards (h.ep.mu is held
 		// throughout handshake processing).
@@ -597,7 +597,7 @@ func (h *handshake) complete() *tcpip.Error {
 			// the connection with another ACK or data (as ACKs are never
 			// retransmitted on their own).
 			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
-				h.ep.sendSynTCP(&h.ep.route, tcpFields{
+				h.ep.sendSynTCP(h.ep.route, tcpFields{
 					id:     h.ep.ID,
 					ttl:    h.ep.ttl,
 					tos:    h.ep.sendTOS,
@@ -631,9 +631,8 @@ func (h *handshake) complete() *tcpip.Error {
 				h.ep.mu.Lock()
 			}
 			if n&notifyError != 0 {
-				return h.ep.LastError()
+				return h.ep.lastErrorLocked()
 			}
-
 		case wakerForNewSegment:
 			if err := h.processSegments(); err != nil {
 				return err
@@ -820,8 +819,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 	data = data.Clone(nil)
 
 	optLen := len(tf.opts)
-	if tf.rcvWnd > 0xffff {
-		tf.rcvWnd = 0xffff
+	if tf.rcvWnd > math.MaxUint16 {
+		tf.rcvWnd = math.MaxUint16
 	}
 
 	mss := int(gso.MSS)
@@ -865,8 +864,8 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso
 // network endpoint and under the provided identity.
 func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
 	optLen := len(tf.opts)
-	if tf.rcvWnd > 0xffff {
-		tf.rcvWnd = 0xffff
+	if tf.rcvWnd > math.MaxUint16 {
+		tf.rcvWnd = math.MaxUint16
 	}
 
 	if r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size() {
@@ -941,7 +940,7 @@ func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqn
 		sackBlocks = e.sack.Blocks[:e.sack.NumBlocks]
 	}
 	options := e.makeOptions(sackBlocks)
-	err := e.sendTCP(&e.route, tcpFields{
+	err := e.sendTCP(e.route, tcpFields{
 		id:     e.ID,
 		ttl:    e.ttl,
 		tos:    e.sendTOS,
@@ -1002,7 +1001,7 @@ func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
 	// Only send a reset if the connection is being aborted for a reason
 	// other than receiving a reset.
 	e.setEndpointState(StateError)
-	e.HardError = err
+	e.hardError = err
 	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
 		// The exact sequence number to be used for the RST is the same as the
 		// one used by Linux. We need to handle the case of window being shrunk
@@ -1080,7 +1079,7 @@ func (e *endpoint) transitionToStateCloseLocked() {
 // to any other listening endpoint. We reply with RST if we cannot find one.
 func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
 	ep := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, s.nicID)
-	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
+	if ep == nil && e.NetProto == header.IPv6ProtocolNumber && e.TransportEndpointInfo.ID.LocalAddress.To4() != "" {
 		// Dual-stack socket, try IPv4.
 		ep = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, s.nicID)
 	}
@@ -1141,7 +1140,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
 		//  delete the TCB, and return.
 		case StateCloseWait:
 			e.transitionToStateCloseLocked()
-			e.HardError = tcpip.ErrAborted
+			e.hardError = tcpip.ErrAborted
 			e.notifyProtocolGoroutine(notifyTickleWorker)
 			return false, nil
 		default:
@@ -1286,7 +1285,7 @@ func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
 	userTimeout := e.userTimeout
 
 	e.keepalive.Lock()
-	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
+	if !e.SocketOptions().GetKeepAlive() || !e.keepalive.timer.checkExpiration() {
 		e.keepalive.Unlock()
 		return nil
 	}
@@ -1323,7 +1322,7 @@ func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
 	}
 	// Start the keepalive timer IFF it's enabled and there is no pending
 	// data to send.
-	if !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
+	if !e.SocketOptions().GetKeepAlive() || e.snd == nil || e.snd.sndUna != e.snd.sndNxt {
 		e.keepalive.timer.disable()
 		e.keepalive.Unlock()
 		return
@@ -1353,7 +1352,6 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 
 	epilogue := func() {
 		// e.mu is expected to be hold upon entering this section.
-
 		if e.snd != nil {
 			e.snd.resendTimer.cleanup()
 		}
@@ -1383,7 +1381,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{
 			e.lastErrorMu.Unlock()
 
 			e.setEndpointState(StateError)
-			e.HardError = err
+			e.hardError = err
 
 			e.workerCleanup = true
 			// Lock released below.
@@ -1638,7 +1636,7 @@ func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()
 		}
 		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
 		if newSyn {
-			info := e.EndpointInfo.TransportEndpointInfo
+			info := e.TransportEndpointInfo
 			newID := info.ID
 			newID.RemoteAddress = ""
 			newID.RemotePort = 0
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
index a6f25896b..1d1b01a6c 100644
--- a/pkg/tcpip/transport/tcp/dual_stack_test.go
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -405,14 +405,6 @@ func testV4Accept(t *testing.T, c *context.Context) {
 		}
 	}
 
-	// Make sure we get the same error when calling the original ep and the
-	// new one. This validates that v4-mapped endpoints are still able to
-	// query the V6Only flag, whereas pure v4 endpoints are not.
-	_, expected := c.EP.GetSockOptBool(tcpip.V6OnlyOption)
-	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != expected {
-		t.Fatalf("GetSockOpt returned unexpected value: got %v, want %v", err, expected)
-	}
-
 	// Check the peer address.
 	addr, err := nep.GetRemoteAddress()
 	if err != nil {
@@ -530,12 +522,12 @@ func TestV6AcceptOnV6(t *testing.T) {
 	c.WQ.EventRegister(&we, waiter.EventIn)
 	defer c.WQ.EventUnregister(&we)
 	var addr tcpip.FullAddress
-	nep, _, err := c.EP.Accept(&addr)
+	_, _, err := c.EP.Accept(&addr)
 	if err == tcpip.ErrWouldBlock {
 		// Wait for connection to be established.
 		select {
 		case <-ch:
-			nep, _, err = c.EP.Accept(&addr)
+			_, _, err = c.EP.Accept(&addr)
 			if err != nil {
 				t.Fatalf("Accept failed: %v", err)
 			}
@@ -548,12 +540,6 @@ func TestV6AcceptOnV6(t *testing.T) {
 	if addr.Addr != context.TestV6Addr {
 		t.Errorf("Unexpected remote address: got %s, want %s", addr.Addr, context.TestV6Addr)
 	}
-
-	// Make sure we can still query the v6 only status of the new endpoint,
-	// that is, that it is in fact a v6 socket.
-	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil {
-		t.Errorf("GetSockOptBool(tcpip.V6OnlyOption) failed: %s", err)
-	}
 }
 
 func TestV4AcceptOnV4(t *testing.T) {
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
index 4f4f4c65e..bb0795f78 100644
--- a/pkg/tcpip/transport/tcp/endpoint.go
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -310,16 +310,12 @@ type Stats struct {
 func (*Stats) IsEndpointStats() {}
 
 // EndpointInfo holds useful information about a transport endpoint which
-// can be queried by monitoring tools.
+// can be queried by monitoring tools. This exists to allow tcp-only state to
+// be exposed.
 //
 // +stateify savable
 type EndpointInfo struct {
 	stack.TransportEndpointInfo
-
-	// HardError is meaningful only when state is stateError. It stores the
-	// error to be returned when read/write syscalls are called and the
-	// endpoint is in this state. HardError is protected by endpoint mu.
-	HardError *tcpip.Error `state:".(string)"`
 }
 
 // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
@@ -367,6 +363,7 @@ func (*EndpointInfo) IsEndpointInfo() {}
 // +stateify savable
 type endpoint struct {
 	EndpointInfo
+	tcpip.DefaultSocketOptionsHandler
 
 	// endpointEntry is used to queue endpoints for processing to the
 	// a given tcp processor goroutine.
@@ -386,6 +383,11 @@ type endpoint struct {
 	waiterQueue *waiter.Queue `state:"wait"`
 	uniqueID    uint64
 
+	// hardError is meaningful only when state is stateError. It stores the
+	// error to be returned when read/write syscalls are called and the
+	// endpoint is in this state. hardError is protected by endpoint mu.
+	hardError *tcpip.Error `state:".(string)"`
+
 	// lastError represents the last error that the endpoint reported;
 	// access to it is protected by the following mutex.
 	lastErrorMu sync.Mutex   `state:"nosave"`
@@ -421,7 +423,10 @@ type endpoint struct {
 
 	// mu protects all endpoint fields unless documented otherwise. mu must
 	// be acquired before interacting with the endpoint fields.
-	mu          sync.Mutex `state:"nosave"`
+	//
+	// During handshake, mu is locked by the protocol listen goroutine and
+	// released by the handshake completion goroutine.
+	mu          sync.CrossGoroutineMutex `state:"nosave"`
 	ownedByUser uint32
 
 	// state must be read/set using the EndpointState()/setEndpointState()
@@ -436,9 +441,8 @@ type endpoint struct {
 	isPortReserved    bool `state:"manual"`
 	isRegistered      bool `state:"manual"`
 	boundNICID        tcpip.NICID
-	route             stack.Route `state:"manual"`
+	route             *stack.Route `state:"manual"`
 	ttl               uint8
-	v6only            bool
 	isConnectNotified bool
 
 	// h stores a reference to the current handshake state if the endpoint is in
@@ -506,24 +510,9 @@ type endpoint struct {
 	// delay is a boolean (0 is false) and must be accessed atomically.
 	delay uint32
 
-	// cork holds back segments until full.
-	//
-	// cork is a boolean (0 is false) and must be accessed atomically.
-	cork uint32
-
 	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
 	scoreboard *SACKScoreboard
 
-	// The options below aren't implemented, but we remember the user
-	// settings because applications expect to be able to set/query these
-	// options.
-
-	// slowAck holds the negated state of quick ack. It is stubbed out and
-	// does nothing.
-	//
-	// slowAck is a boolean (0 is false) and must be accessed atomically.
-	slowAck uint32
-
 	// segmentQueue is used to hand received segments to the protocol
 	// goroutine. Segments are queued as long as the queue is not full,
 	// and dropped when it is.
@@ -685,9 +674,6 @@ type endpoint struct {
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
 
-	// linger is used for SO_LINGER socket option.
-	linger tcpip.LingerOption
-
 	// ops is used to get socket level options.
 	ops tcpip.SocketOptions
 }
@@ -701,7 +687,7 @@ func (e *endpoint) UniqueID() uint64 {
 //
 // If userMSS is non-zero and is not greater than the maximum possible MSS for
 // r, it will be used; otherwise, the maximum possible MSS will be used.
-func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
+func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 {
 	// The maximum possible MSS is dependent on the route.
 	// TODO(b/143359391): Respect TCP Min and Max size.
 	maxMSS := uint16(r.MTU() - header.TCPMinimumSize)
@@ -850,7 +836,6 @@ func (e *endpoint) recentTimestamp() uint32 {
 // +stateify savable
 type keepalive struct {
 	sync.Mutex `state:"nosave"`
-	enabled    bool
 	idle       time.Duration
 	interval   time.Duration
 	count      int
@@ -884,6 +869,9 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		windowClamp:   DefaultReceiveBufferSize,
 		maxSynRetries: DefaultSynRetries,
 	}
+	e.ops.InitHandler(e)
+	e.ops.SetMulticastLoop(true)
+	e.ops.SetQuickAck(true)
 
 	var ss tcpip.TCPSendBufferSizeRangeOption
 	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
@@ -907,7 +895,7 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 
 	var de tcpip.TCPDelayEnabled
 	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
-		e.SetSockOptBool(tcpip.DelayOption, true)
+		e.ops.SetDelayOption(true)
 	}
 
 	var tcpLT tcpip.TCPLingerTimeoutOption
@@ -1049,7 +1037,8 @@ func (e *endpoint) Close() {
 		return
 	}
 
-	if e.linger.Enabled && e.linger.Timeout == 0 {
+	linger := e.SocketOptions().GetLinger()
+	if linger.Enabled && linger.Timeout == 0 {
 		s := e.EndpointState()
 		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
 		if isResetState {
@@ -1169,7 +1158,11 @@ func (e *endpoint) cleanupLocked() {
 	e.boundPortFlags = ports.Flags{}
 	e.boundDest = tcpip.FullAddress{}
 
-	e.route.Release()
+	if e.route != nil {
+		e.route.Release()
+		e.route = nil
+	}
+
 	e.stack.CompleteTransportEndpointCleanup(e)
 	tcpip.DeleteDanglingEndpoint(e)
 }
@@ -1279,11 +1272,20 @@ func (e *endpoint) ModerateRecvBuf(copied int) {
 	e.rcvListMu.Unlock()
 }
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
-func (e *endpoint) LastError() *tcpip.Error {
+// Preconditions: e.mu must be held to call this function.
+func (e *endpoint) hardErrorLocked() *tcpip.Error {
+	err := e.hardError
+	e.hardError = nil
+	return err
+}
+
+// Preconditions: e.mu must be held to call this function.
+func (e *endpoint) lastErrorLocked() *tcpip.Error {
 	e.lastErrorMu.Lock()
 	defer e.lastErrorMu.Unlock()
 	err := e.lastError
@@ -1291,6 +1293,16 @@ func (e *endpoint) LastError() *tcpip.Error {
 	return err
 }
 
+// LastError implements tcpip.Endpoint.LastError.
+func (e *endpoint) LastError() *tcpip.Error {
+	e.LockUser()
+	defer e.UnlockUser()
+	if err := e.hardErrorLocked(); err != nil {
+		return err
+	}
+	return e.lastErrorLocked()
+}
+
 // Read reads data from the endpoint.
 func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
 	e.LockUser()
@@ -1312,9 +1324,11 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages,
 	bufUsed := e.rcvBufUsed
 	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
 		e.rcvListMu.Unlock()
-		he := e.HardError
 		if s == StateError {
-			return buffer.View{}, tcpip.ControlMessages{}, he
+			if err := e.hardErrorLocked(); err != nil {
+				return buffer.View{}, tcpip.ControlMessages{}, err
+			}
+			return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
 		}
 		e.stats.ReadErrors.NotConnected.Increment()
 		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
@@ -1370,9 +1384,13 @@ func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
 // indicating the reason why it's not writable.
 // Caller must hold e.mu and e.sndBufMu
 func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
+	// The endpoint cannot be written to if it's not connected.
 	switch s := e.EndpointState(); {
 	case s == StateError:
-		return 0, e.HardError
+		if err := e.hardErrorLocked(); err != nil {
+			return 0, err
+		}
+		return 0, tcpip.ErrClosedForSend
 	case !s.connecting() && !s.connected():
 		return 0, tcpip.ErrClosedForSend
 	case s.connecting():
@@ -1478,7 +1496,7 @@ func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 // Peek reads data without consuming it from the endpoint.
 //
 // This method does not block if there is no data pending.
-func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+func (e *endpoint) Peek(vec [][]byte) (int64, *tcpip.Error) {
 	e.LockUser()
 	defer e.UnlockUser()
 
@@ -1486,10 +1504,10 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	// but has some pending unread data.
 	if s := e.EndpointState(); !s.connected() && s != StateClose {
 		if s == StateError {
-			return 0, tcpip.ControlMessages{}, e.HardError
+			return 0, e.hardErrorLocked()
 		}
 		e.stats.ReadErrors.InvalidEndpointState.Increment()
-		return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
+		return 0, tcpip.ErrInvalidEndpointState
 	}
 
 	e.rcvListMu.Lock()
@@ -1498,9 +1516,9 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 	if e.rcvBufUsed == 0 {
 		if e.rcvClosed || !e.EndpointState().connected() {
 			e.stats.ReadErrors.ReadClosed.Increment()
-			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
+			return 0, tcpip.ErrClosedForReceive
 		}
-		return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
+		return 0, tcpip.ErrWouldBlock
 	}
 
 	// Make a copy of vec so we can modify the slide headers.
@@ -1515,7 +1533,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 
 			for len(v) > 0 {
 				if len(vec) == 0 {
-					return num, tcpip.ControlMessages{}, nil
+					return num, nil
 				}
 				if len(vec[0]) == 0 {
 					vec = vec[1:]
@@ -1530,7 +1548,7 @@ func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Erro
 		}
 	}
 
-	return num, tcpip.ControlMessages{}, nil
+	return num, nil
 }
 
 // selectWindowLocked returns the new window without checking for shrinking or scaling
@@ -1602,72 +1620,39 @@ func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed boo
 	return false, false
 }
 
-// SetSockOptBool sets a socket option.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	switch opt {
-
-	case tcpip.CorkOption:
-		e.LockUser()
-		if !v {
-			atomic.StoreUint32(&e.cork, 0)
-
-			// Handle the corked data.
-			e.sndWaker.Assert()
-		} else {
-			atomic.StoreUint32(&e.cork, 1)
-		}
-		e.UnlockUser()
-
-	case tcpip.DelayOption:
-		if v {
-			atomic.StoreUint32(&e.delay, 1)
-		} else {
-			atomic.StoreUint32(&e.delay, 0)
-
-			// Handle delayed data.
-			e.sndWaker.Assert()
-		}
-
-	case tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		e.keepalive.enabled = v
-		e.keepalive.Unlock()
-		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
-
-	case tcpip.QuickAckOption:
-		o := uint32(1)
-		if v {
-			o = 0
-		}
-		atomic.StoreUint32(&e.slowAck, o)
-
-	case tcpip.ReuseAddressOption:
-		e.LockUser()
-		e.portFlags.TupleOnly = v
-		e.UnlockUser()
-
-	case tcpip.ReusePortOption:
-		e.LockUser()
-		e.portFlags.LoadBalanced = v
-		e.UnlockUser()
+// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
+func (e *endpoint) OnReuseAddressSet(v bool) {
+	e.LockUser()
+	e.portFlags.TupleOnly = v
+	e.UnlockUser()
+}
 
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrInvalidEndpointState
-		}
+// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
+func (e *endpoint) OnReusePortSet(v bool) {
+	e.LockUser()
+	e.portFlags.LoadBalanced = v
+	e.UnlockUser()
+}
 
-		// We only allow this to be set when we're in the initial state.
-		if e.EndpointState() != StateInitial {
-			return tcpip.ErrInvalidEndpointState
-		}
+// OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet.
+func (e *endpoint) OnKeepAliveSet(v bool) {
+	e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+}
 
-		e.LockUser()
-		e.v6only = v
-		e.UnlockUser()
+// OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet.
+func (e *endpoint) OnDelayOptionSet(v bool) {
+	if !v {
+		// Handle delayed data.
+		e.sndWaker.Assert()
 	}
+}
 
-	return nil
+// OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet.
+func (e *endpoint) OnCorkOptionSet(v bool) {
+	if !v {
+		// Handle the corked data.
+		e.sndWaker.Assert()
+	}
 }
 
 // SetSockOptInt sets a socket option.
@@ -1851,9 +1836,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 		e.keepalive.Unlock()
 		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
 
-	case *tcpip.OutOfBandInlineOption:
-		// We don't currently support disabling this option.
-
 	case *tcpip.TCPUserTimeoutOption:
 		e.LockUser()
 		e.userTimeout = time.Duration(*v)
@@ -1922,11 +1904,6 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 	case *tcpip.SocketDetachFilterOption:
 		return nil
 
-	case *tcpip.LingerOption:
-		e.LockUser()
-		e.linger = *v
-		e.UnlockUser()
-
 	default:
 		return nil
 	}
@@ -1949,67 +1926,6 @@ func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
 	return e.rcvBufUsed, nil
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-
-	case tcpip.CorkOption:
-		return atomic.LoadUint32(&e.cork) != 0, nil
-
-	case tcpip.DelayOption:
-		return atomic.LoadUint32(&e.delay) != 0, nil
-
-	case tcpip.KeepaliveEnabledOption:
-		e.keepalive.Lock()
-		v := e.keepalive.enabled
-		e.keepalive.Unlock()
-
-		return v, nil
-
-	case tcpip.QuickAckOption:
-		v := atomic.LoadUint32(&e.slowAck) == 0
-		return v, nil
-
-	case tcpip.ReuseAddressOption:
-		e.LockUser()
-		v := e.portFlags.TupleOnly
-		e.UnlockUser()
-
-		return v, nil
-
-	case tcpip.ReusePortOption:
-		e.LockUser()
-		v := e.portFlags.LoadBalanced
-		e.UnlockUser()
-
-		return v, nil
-
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return false, tcpip.ErrUnknownProtocolOption
-		}
-
-		e.LockUser()
-		v := e.v6only
-		e.UnlockUser()
-
-		return v, nil
-
-	case tcpip.MulticastLoopOption:
-		return true, nil
-
-	case tcpip.AcceptConnOption:
-		e.LockUser()
-		defer e.UnlockUser()
-
-		return e.EndpointState() == StateListen, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -2120,10 +2036,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 		*o = tcpip.TCPUserTimeoutOption(e.userTimeout)
 		e.UnlockUser()
 
-	case *tcpip.OutOfBandInlineOption:
-		// We don't currently support disabling this option.
-		*o = 1
-
 	case *tcpip.CongestionControlOption:
 		e.LockUser()
 		*o = e.cc
@@ -2152,11 +2064,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 			Port: port,
 		}
 
-	case *tcpip.LingerOption:
-		e.LockUser()
-		*o = e.linger
-		e.UnlockUser()
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -2166,7 +2073,7 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 // checkV4MappedLocked determines the effective network protocol and converts
 // addr to its canonical form.
 func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
 	if err != nil {
 		return tcpip.FullAddress{}, 0, err
 	}
@@ -2243,7 +2150,10 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc
 		return tcpip.ErrAlreadyConnecting
 
 	case StateError:
-		return e.HardError
+		if err := e.hardErrorLocked(); err != nil {
+			return err
+		}
+		return tcpip.ErrConnectionAborted
 
 	default:
 		return tcpip.ErrInvalidEndpointState
@@ -2417,7 +2327,7 @@ func (e *endpoint) startMainLoop(handshake bool) *tcpip.Error {
 				e.lastErrorMu.Unlock()
 
 				e.setEndpointState(StateError)
-				e.HardError = err
+				e.hardError = err
 
 				// Call cleanupLocked to free up any reservations.
 				e.cleanupLocked()
@@ -2697,7 +2607,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
 	// v6only set to false.
 	if netProto == header.IPv6ProtocolNumber {
 		stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber)
-		alsoBindToV4 := !e.v6only && addr.Addr == "" && stackHasV4
+		alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4
 		if alsoBindToV4 {
 			netProtos = append(netProtos, header.IPv4ProtocolNumber)
 		}
@@ -2782,7 +2692,7 @@ func (e *endpoint) getRemoteAddress() tcpip.FullAddress {
 
 func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) {
 	// TCP HandlePacket is not required anymore as inbound packets first
-	// land at the Dispatcher which then can either delivery using the
+	// land at the Dispatcher which then can either deliver using the
 	// worker go routine or directly do the invoke the tcp processing inline
 	// based on the state of the endpoint.
 }
@@ -3079,6 +2989,7 @@ func (e *endpoint) completeState() stack.TCPEndpointState {
 		Ssthresh:         e.snd.sndSsthresh,
 		SndCAAckCount:    e.snd.sndCAAckCount,
 		Outstanding:      e.snd.outstanding,
+		SackedOut:        e.snd.sackedOut,
 		SndWnd:           e.snd.sndWnd,
 		SndUna:           e.snd.sndUna,
 		SndNxt:           e.snd.sndNxt,
@@ -3161,7 +3072,7 @@ func (e *endpoint) State() uint32 {
 func (e *endpoint) Info() tcpip.EndpointInfo {
 	e.LockUser()
 	// Make a copy of the endpoint info.
-	ret := e.EndpointInfo
+	ret := e.TransportEndpointInfo
 	e.UnlockUser()
 	return &ret
 }
@@ -3187,6 +3098,7 @@ func (e *endpoint) Wait() {
 	}
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
index bb901c0f8..ba67176b5 100644
--- a/pkg/tcpip/transport/tcp/endpoint_state.go
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -321,21 +321,21 @@ func (e *endpoint) loadRecentTSTime(unix unixTime) {
 }
 
 // saveHardError is invoked by stateify.
-func (e *EndpointInfo) saveHardError() string {
-	if e.HardError == nil {
+func (e *endpoint) saveHardError() string {
+	if e.hardError == nil {
 		return ""
 	}
 
-	return e.HardError.String()
+	return e.hardError.String()
 }
 
 // loadHardError is invoked by stateify.
-func (e *EndpointInfo) loadHardError(s string) {
+func (e *endpoint) loadHardError(s string) {
 	if s == "" {
 		return
 	}
 
-	e.HardError = tcpip.StringToError(s)
+	e.hardError = tcpip.StringToError(s)
 }
 
 // saveMeasureTime is invoked by stateify.
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
index 2329aca4b..672159eed 100644
--- a/pkg/tcpip/transport/tcp/protocol.go
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -250,7 +250,7 @@ func replyWithReset(stack *stack.Stack, s *segment, tos, ttl uint8) *tcpip.Error
 		ttl = route.DefaultTTL()
 	}
 
-	return sendTCP(&route, tcpFields{
+	return sendTCP(route, tcpFields{
 		id:     s.id,
 		ttl:    ttl,
 		tos:    tos,
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
index 8e0b7c843..405a6dce7 100644
--- a/pkg/tcpip/transport/tcp/rcv.go
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -16,6 +16,7 @@ package tcp
 
 import (
 	"container/heap"
+	"math"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/tcpip"
@@ -48,6 +49,10 @@ type receiver struct {
 
 	rcvWndScale uint8
 
+	// prevBufUsed is the snapshot of endpoint rcvBufUsed taken when we
+	// advertise a receive window.
+	prevBufUsed int
+
 	closed bool
 
 	// pendingRcvdSegments is bounded by the receive buffer size of the
@@ -80,9 +85,9 @@ func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
 	// outgoing packets, we should use what we have advertised for acceptability
 	// test.
 	scaledWindowSize := r.rcvWnd >> r.rcvWndScale
-	if scaledWindowSize > 0xffff {
+	if scaledWindowSize > math.MaxUint16 {
 		// This is what we actually put in the Window field.
-		scaledWindowSize = 0xffff
+		scaledWindowSize = math.MaxUint16
 	}
 	advertisedWindowSize := scaledWindowSize << r.rcvWndScale
 	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
@@ -106,6 +111,34 @@ func (r *receiver) currentWindow() (curWnd seqnum.Size) {
 func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	newWnd := r.ep.selectWindow()
 	curWnd := r.currentWindow()
+	unackLen := int(r.ep.snd.maxSentAck.Size(r.rcvNxt))
+	bufUsed := r.ep.receiveBufferUsed()
+
+	// Grow the right edge of the window only for payloads larger than the
+	// segment overhead OR if the application is actively consuming data.
+	//
+	// Not growing the right edge otherwise addresses the situation below:
+	// An application has been slow in reading data and we have a burst of
+	// incoming segments of length < segment overhead. Here, our available
+	// free memory would reduce drastically when compared to the advertised
+	// receive window.
+	//
+	// For example: With incoming 512 bytes segments, segment overhead of
+	// 552 bytes (at the time of writing this comment), with receive window
+	// starting from 1MB and with rcvAdvWndScale being 1, buffer would reach 0
+	// when the curWnd is still 19436 bytes, because for every incoming segment
+	// newWnd would reduce by (552+512) >> rcvAdvWndScale (current value 1),
+	// while curWnd would reduce by 512 bytes.
+	// Such a situation causes us to keep tail dropping the incoming segments
+	// and never advertise zero receive window to the peer.
+	//
+	// Linux does a similar check for minimal sk_buff size (128):
+	// https://github.com/torvalds/linux/blob/d5beb3140f91b1c8a3d41b14d729aefa4dcc58bc/net/ipv4/tcp_input.c#L783
+	//
+	// Also, if the application is reading the data, we keep growing the right
+	// edge, as we are still advertising a window that we think can be serviced.
+	toGrow := unackLen >= SegSize || bufUsed <= r.prevBufUsed
+
 	// Update rcvAcc only if new window is > previously advertised window. We
 	// should never shrink the acceptable sequence space once it has been
 	// advertised the peer. If we shrink the acceptable sequence space then we
@@ -115,7 +148,7 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	// rcvWUP       rcvNxt         rcvAcc          new rcvAcc
 	//               <=====curWnd ===>
 	//               <========= newWnd > curWnd ========= >
-	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) {
+	if r.rcvNxt.Add(seqnum.Size(curWnd)).LessThan(r.rcvNxt.Add(seqnum.Size(newWnd))) && toGrow {
 		// If the new window moves the right edge, then update rcvAcc.
 		r.rcvAcc = r.rcvNxt.Add(seqnum.Size(newWnd))
 	} else {
@@ -130,11 +163,22 @@ func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
 	// receiver's estimated RTT.
 	r.rcvWnd = newWnd
 	r.rcvWUP = r.rcvNxt
+	r.prevBufUsed = bufUsed
 	scaledWnd := r.rcvWnd >> r.rcvWndScale
 	if scaledWnd == 0 {
 		// Increment a metric if we are advertising an actual zero window.
 		r.ep.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
 	}
+
+	// If we started off with a window larger than what can be held in
+	// the 16-bit window field, we cap the value at the max value.
+	if scaledWnd > math.MaxUint16 {
+		scaledWnd = seqnum.Size(math.MaxUint16)
+
+		// Ensure that the stashed receive window always reflects what
+		// is being advertised.
+		r.rcvWnd = scaledWnd << r.rcvWndScale
+	}
 	return r.rcvNxt, scaledWnd
 }
 
diff --git a/pkg/tcpip/transport/tcp/reno_recovery.go b/pkg/tcpip/transport/tcp/reno_recovery.go
new file mode 100644
index 000000000..2aa708e97
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/reno_recovery.go
@@ -0,0 +1,67 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+// renoRecovery stores the variables related to TCP Reno loss recovery
+// algorithm.
+//
+// +stateify savable
+type renoRecovery struct {
+	s *sender
+}
+
+func newRenoRecovery(s *sender) *renoRecovery {
+	return &renoRecovery{s: s}
+}
+
+func (rr *renoRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) {
+	ack := rcvdSeg.ackNumber
+	snd := rr.s
+
+	// We are in fast recovery mode. Ignore the ack if it's out of range.
+	if !ack.InRange(snd.sndUna, snd.sndNxt+1) {
+		return
+	}
+
+	// Don't count this as a duplicate if it is carrying data or
+	// updating the window.
+	if rcvdSeg.logicalLen() != 0 || snd.sndWnd != rcvdSeg.window {
+		return
+	}
+
+	// Inflate the congestion window if we're getting duplicate acks
+	// for the packet we retransmitted.
+	if !fastRetransmit && ack == snd.fr.first {
+		// We received a dup, inflate the congestion window by 1 packet
+		// if we're not at the max yet. Only inflate the window if
+		// regular FastRecovery is in use, RFC6675 does not require
+		// inflating cwnd on duplicate ACKs.
+		if snd.sndCwnd < snd.fr.maxCwnd {
+			snd.sndCwnd++
+		}
+		return
+	}
+
+	// A partial ack was received. Retransmit this packet and remember it
+	// so that we don't retransmit it again.
+	//
+	// We don't inflate the window because we're putting the same packet
+	// back onto the wire.
+	//
+	// N.B. The retransmit timer will be reset by the caller.
+	snd.fr.first = ack
+	snd.dupAckCount = 0
+	snd.resendSegment()
+}
diff --git a/pkg/tcpip/transport/tcp/sack_recovery.go b/pkg/tcpip/transport/tcp/sack_recovery.go
new file mode 100644
index 000000000..7e813fa96
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/sack_recovery.go
@@ -0,0 +1,120 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import "gvisor.dev/gvisor/pkg/tcpip/seqnum"
+
+// sackRecovery stores the variables related to TCP SACK loss recovery
+// algorithm.
+//
+// +stateify savable
+type sackRecovery struct {
+	s *sender
+}
+
+func newSACKRecovery(s *sender) *sackRecovery {
+	return &sackRecovery{s: s}
+}
+
+// handleSACKRecovery implements the loss recovery phase as described in RFC6675
+// section 5, step C.
+func (sr *sackRecovery) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
+	snd := sr.s
+	snd.SetPipe()
+
+	if smss := int(snd.ep.scoreboard.SMSS()); limit > smss {
+		// Cap segment size limit to s.smss as SACK recovery requires
+		// that all retransmissions or new segments sent during recovery
+		// be of <= SMSS.
+		limit = smss
+	}
+
+	nextSegHint := snd.writeList.Front()
+	for snd.outstanding < snd.sndCwnd {
+		var nextSeg *segment
+		var rescueRtx bool
+		nextSeg, nextSegHint, rescueRtx = snd.NextSeg(nextSegHint)
+		if nextSeg == nil {
+			return dataSent
+		}
+		if !snd.isAssignedSequenceNumber(nextSeg) || snd.sndNxt.LessThanEq(nextSeg.sequenceNumber) {
+			// New data being sent.
+
+			// Step C.3 described below is handled by
+			// maybeSendSegment which increments sndNxt when
+			// a segment is transmitted.
+			//
+			// Step C.3 "If any of the data octets sent in
+			// (C.1) are above HighData, HighData must be
+			// updated to reflect the transmission of
+			// previously unsent data."
+			//
+			// We pass limit (capped to SMSS above) as Step 2) requires
+			// that new data sent should be of size SMSS or less.
+			if sent := snd.maybeSendSegment(nextSeg, limit, end); !sent {
+				return dataSent
+			}
+			dataSent = true
+			snd.outstanding++
+			snd.writeNext = nextSeg.Next()
+			continue
+		}
+
+		// Now handle the retransmission case where we matched one of steps
+		// 1, 3 or 4 of the NextSeg algorithm.
+		// RFC 6675, Step C.4.
+		//
+		// "The estimate of the amount of data outstanding in the network
+		// must be updated by incrementing pipe by the number of octets
+		// transmitted in (C.1)."
+		snd.outstanding++
+		dataSent = true
+		snd.sendSegment(nextSeg)
+
+		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
+		if rescueRtx {
+			// We do the last part of rule (4) of NextSeg here to update
+			// RescueRxt as until this point we don't know if we are going
+			// to use the rescue transmission.
+			snd.fr.rescueRxt = snd.fr.last
+		} else {
+			// RFC 6675, Step C.2
+			//
+			// "If any of the data octets sent in (C.1) are below
+			// HighData, HighRxt MUST be set to the highest sequence
+			// number of the retransmitted segment unless NextSeg ()
+			// rule (4) was invoked for this retransmission."
+			snd.fr.highRxt = segEnd - 1
+		}
+	}
+	return dataSent
+}
+
+func (sr *sackRecovery) DoRecovery(rcvdSeg *segment, fastRetransmit bool) {
+	snd := sr.s
+	if fastRetransmit {
+		snd.resendSegment()
+	}
+
+	// We are in fast recovery mode. Ignore the ack if it's out of range.
+	if ack := rcvdSeg.ackNumber; !ack.InRange(snd.sndUna, snd.sndNxt+1) {
+		return
+	}
+
+	// RFC 6675 recovery algorithm step C 1-5.
+	end := snd.sndUna.Add(snd.sndWnd)
+	dataSent := sr.handleSACKRecovery(snd.maxPayloadSize, end)
+	snd.postXmit(dataSent)
+}
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
index 2091989cc..5ef73ec74 100644
--- a/pkg/tcpip/transport/tcp/segment.go
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -204,7 +204,7 @@ func (s *segment) payloadSize() int {
 // segMemSize is the amount of memory used to hold the segment data and
 // the associated metadata.
 func (s *segment) segMemSize() int {
-	return segSize + s.data.Size()
+	return SegSize + s.data.Size()
 }
 
 // parse populates the sequence & ack numbers, flags, and window fields of the
diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go
index 0ab7b8f56..392ff0859 100644
--- a/pkg/tcpip/transport/tcp/segment_unsafe.go
+++ b/pkg/tcpip/transport/tcp/segment_unsafe.go
@@ -19,5 +19,6 @@ import (
 )
 
 const (
-	segSize = int(unsafe.Sizeof(segment{}))
+	// SegSize is the minimal size of the segment overhead.
+	SegSize = int(unsafe.Sizeof(segment{}))
 )
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
index 0e0fdf14c..cc991aba6 100644
--- a/pkg/tcpip/transport/tcp/snd.go
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"math"
 	"sort"
-	"sync/atomic"
 	"time"
 
 	"gvisor.dev/gvisor/pkg/sleep"
@@ -92,6 +91,17 @@ type congestionControl interface {
 	PostRecovery()
 }
 
+// lossRecovery is an interface that must be implemented by any supported
+// loss recovery algorithm.
+type lossRecovery interface {
+	// DoRecovery is invoked when loss is detected and segments need
+	// to be retransmitted. The cumulative or selective ACK is passed along
+	// with the flag which identifies whether the connection entered fast
+	// retransmit with this ACK and to retransmit the first unacknowledged
+	// segment.
+	DoRecovery(rcvdSeg *segment, fastRetransmit bool)
+}
+
 // sender holds the state necessary to send TCP segments.
 //
 // +stateify savable
@@ -108,6 +118,9 @@ type sender struct {
 	// fr holds state related to fast recovery.
 	fr fastRecovery
 
+	// lr is the loss recovery algorithm used by the sender.
+	lr lossRecovery
+
 	// sndCwnd is the congestion window, in packets.
 	sndCwnd int
 
@@ -124,6 +137,9 @@ type sender struct {
 	// that have been sent but not yet acknowledged.
 	outstanding int
 
+	// sackedOut is the number of packets which are selectively acked.
+	sackedOut int
+
 	// sndWnd is the send window size.
 	sndWnd seqnum.Size
 
@@ -276,6 +292,8 @@ func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint
 
 	s.cc = s.initCongestionControl(ep.cc)
 
+	s.lr = s.initLossRecovery()
+
 	// A negative sndWndScale means that no scaling is in use, otherwise we
 	// store the scaling value.
 	if sndWndScale > 0 {
@@ -330,6 +348,14 @@ func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionCon
 	}
 }
 
+// initLossRecovery initializes the loss recovery algorithm for the sender.
+func (s *sender) initLossRecovery() lossRecovery {
+	if s.ep.sackPermitted {
+		return newSACKRecovery(s)
+	}
+	return newRenoRecovery(s)
+}
+
 // updateMaxPayloadSize updates the maximum payload size based on the given
 // MTU. If this is in response to "packet too big" control packets (indicated
 // by the count argument), it also reduces the number of outstanding packets and
@@ -349,6 +375,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
 		m = 1
 	}
 
+	oldMSS := s.maxPayloadSize
 	s.maxPayloadSize = m
 	if s.gso {
 		s.ep.gso.MSS = uint16(m)
@@ -371,6 +398,7 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
 
 	// Rewind writeNext to the first segment exceeding the MTU. Do nothing
 	// if it is already before such a packet.
+	nextSeg := s.writeNext
 	for seg := s.writeList.Front(); seg != nil; seg = seg.Next() {
 		if seg == s.writeNext {
 			// We got to writeNext before we could find a segment
@@ -378,16 +406,22 @@ func (s *sender) updateMaxPayloadSize(mtu, count int) {
 			break
 		}
 
-		if seg.data.Size() > m {
+		if nextSeg == s.writeNext && seg.data.Size() > m {
 			// We found a segment exceeding the MTU. Rewind
 			// writeNext and try to retransmit it.
-			s.writeNext = seg
-			break
+			nextSeg = seg
+		}
+
+		if s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
+			// Update sackedOut for new maximum payload size.
+			s.sackedOut -= s.pCount(seg, oldMSS)
+			s.sackedOut += s.pCount(seg, s.maxPayloadSize)
 		}
 	}
 
 	// Since we likely reduced the number of outstanding packets, we may be
 	// ready to send some more.
+	s.writeNext = nextSeg
 	s.sendData()
 }
 
@@ -550,7 +584,7 @@ func (s *sender) retransmitTimerExpired() bool {
 		// We were attempting fast recovery but were not successful.
 		// Leave the state. We don't need to update ssthresh because it
 		// has already been updated when entered fast-recovery.
-		s.leaveFastRecovery()
+		s.leaveRecovery()
 	}
 
 	s.state = RTORecovery
@@ -606,13 +640,13 @@ func (s *sender) retransmitTimerExpired() bool {
 
 // pCount returns the number of packets in the segment. Due to GSO, a segment
 // can be composed of multiple packets.
-func (s *sender) pCount(seg *segment) int {
+func (s *sender) pCount(seg *segment, maxPayloadSize int) int {
 	size := seg.data.Size()
 	if size == 0 {
 		return 1
 	}
 
-	return (size-1)/s.maxPayloadSize + 1
+	return (size-1)/maxPayloadSize + 1
 }
 
 // splitSeg splits a given segment at the size specified and inserts the
@@ -789,7 +823,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 			}
 			if !nextTooBig && seg.data.Size() < available {
 				// Segment is not full.
-				if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 {
+				if s.outstanding > 0 && s.ep.ops.GetDelayOption() {
 					// Nagle's algorithm. From Wikipedia:
 					//   Nagle's algorithm works by
 					//   combining a number of small
@@ -808,7 +842,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 				// send space and MSS.
 				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
 				// timeout.
-				if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 {
+				if seg.data.Size() < s.maxPayloadSize && s.ep.ops.GetCorkOption() {
 					return false
 				}
 			}
@@ -913,79 +947,6 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se
 	return true
 }
 
-// handleSACKRecovery implements the loss recovery phase as described in RFC6675
-// section 5, step C.
-func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
-	s.SetPipe()
-
-	if smss := int(s.ep.scoreboard.SMSS()); limit > smss {
-		// Cap segment size limit to s.smss as SACK recovery requires
-		// that all retransmissions or new segments send during recovery
-		// be of <= SMSS.
-		limit = smss
-	}
-
-	nextSegHint := s.writeList.Front()
-	for s.outstanding < s.sndCwnd {
-		var nextSeg *segment
-		var rescueRtx bool
-		nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint)
-		if nextSeg == nil {
-			return dataSent
-		}
-		if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) {
-			// New data being sent.
-
-			// Step C.3 described below is handled by
-			// maybeSendSegment which increments sndNxt when
-			// a segment is transmitted.
-			//
-			// Step C.3 "If any of the data octets sent in
-			// (C.1) are above HighData, HighData must be
-			// updated to reflect the transmission of
-			// previously unsent data."
-			//
-			// We pass s.smss as the limit as the Step 2) requires that
-			// new data sent should be of size s.smss or less.
-			if sent := s.maybeSendSegment(nextSeg, limit, end); !sent {
-				return dataSent
-			}
-			dataSent = true
-			s.outstanding++
-			s.writeNext = nextSeg.Next()
-			continue
-		}
-
-		// Now handle the retransmission case where we matched either step 1,3 or 4
-		// of the NextSeg algorithm.
-		// RFC 6675, Step C.4.
-		//
-		// "The estimate of the amount of data outstanding in the network
-		// must be updated by incrementing pipe by the number of octets
-		// transmitted in (C.1)."
-		s.outstanding++
-		dataSent = true
-		s.sendSegment(nextSeg)
-
-		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
-		if rescueRtx {
-			// We do the last part of rule (4) of NextSeg here to update
-			// RescueRxt as until this point we don't know if we are going
-			// to use the rescue transmission.
-			s.fr.rescueRxt = s.fr.last
-		} else {
-			// RFC 6675, Step C.2
-			//
-			// "If any of the data octets sent in (C.1) are below
-			// HighData, HighRxt MUST be set to the highest sequence
-			// number of the retransmitted segment unless NextSeg ()
-			// rule (4) was invoked for this retransmission."
-			s.fr.highRxt = segEnd - 1
-		}
-	}
-	return dataSent
-}
-
 func (s *sender) sendZeroWindowProbe() {
 	ack, win := s.ep.rcv.getSendParams()
 	s.unackZeroWindowProbes++
@@ -1014,6 +975,30 @@ func (s *sender) disableZeroWindowProbing() {
 	s.resendTimer.disable()
 }
 
+func (s *sender) postXmit(dataSent bool) {
+	if dataSent {
+		// We sent data, so we should stop the keepalive timer to ensure
+		// that no keepalives are sent while there is pending data.
+		s.ep.disableKeepaliveTimer()
+	}
+
+	// If the sender has advertised a zero receive window and we have
+	// data to be sent out, start zero window probing to query the
+	// remote for its receive window size.
+	if s.writeNext != nil && s.sndWnd == 0 {
+		s.enableZeroWindowProbing()
+	}
+
+	// Enable the timer if we have pending data and it's not enabled yet.
+	if !s.resendTimer.enabled() && s.sndUna != s.sndNxt {
+		s.resendTimer.enable(s.rto)
+	}
+	// If we have no more pending data, start the keepalive timer.
+	if s.sndUna == s.sndNxt {
+		s.ep.resetKeepaliveTimer(false)
+	}
+}
+
 // sendData sends new data segments. It is called when data becomes available or
 // when the send window opens up.
 func (s *sender) sendData() {
@@ -1034,55 +1019,29 @@ func (s *sender) sendData() {
 	}
 
 	var dataSent bool
-
-	// RFC 6675 recovery algorithm step C 1-5.
-	if s.fr.active && s.ep.sackPermitted {
-		dataSent = s.handleSACKRecovery(s.maxPayloadSize, end)
-	} else {
-		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
-			cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
-			if cwndLimit < limit {
-				limit = cwndLimit
-			}
-			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
-				// Move writeNext along so that we don't try and scan data that
-				// has already been SACKED.
-				s.writeNext = seg.Next()
-				continue
-			}
-			if sent := s.maybeSendSegment(seg, limit, end); !sent {
-				break
-			}
-			dataSent = true
-			s.outstanding += s.pCount(seg)
+	for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
+		cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
+		if cwndLimit < limit {
+			limit = cwndLimit
+		}
+		if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
+			// Move writeNext along so that we don't try and scan data that
+			// has already been SACKED.
 			s.writeNext = seg.Next()
+			continue
 		}
+		if sent := s.maybeSendSegment(seg, limit, end); !sent {
+			break
+		}
+		dataSent = true
+		s.outstanding += s.pCount(seg, s.maxPayloadSize)
+		s.writeNext = seg.Next()
 	}
 
-	if dataSent {
-		// We sent data, so we should stop the keepalive timer to ensure
-		// that no keepalives are sent while there is pending data.
-		s.ep.disableKeepaliveTimer()
-	}
-
-	// If the sender has advertized zero receive window and we have
-	// data to be sent out, start zero window probing to query the
-	// the remote for it's receive window size.
-	if s.writeNext != nil && s.sndWnd == 0 {
-		s.enableZeroWindowProbing()
-	}
-
-	// Enable the timer if we have pending data and it's not enabled yet.
-	if !s.resendTimer.enabled() && s.sndUna != s.sndNxt {
-		s.resendTimer.enable(s.rto)
-	}
-	// If we have no more pending data, start the keepalive timer.
-	if s.sndUna == s.sndNxt {
-		s.ep.resetKeepaliveTimer(false)
-	}
+	s.postXmit(dataSent)
 }
 
-func (s *sender) enterFastRecovery() {
+func (s *sender) enterRecovery() {
 	s.fr.active = true
 	// Save state to reflect we're now in fast recovery.
 	//
@@ -1090,6 +1049,7 @@ func (s *sender) enterFastRecovery() {
 	// We inflate the cwnd by 3 to account for the 3 packets which triggered
 	// the 3 duplicate ACKs and are now not in flight.
 	s.sndCwnd = s.sndSsthresh + 3
+	s.sackedOut = 0
 	s.fr.first = s.sndUna
 	s.fr.last = s.sndNxt - 1
 	s.fr.maxCwnd = s.sndCwnd + s.outstanding
@@ -1104,7 +1064,7 @@ func (s *sender) enterFastRecovery() {
 	s.ep.stack.Stats().TCP.FastRecovery.Increment()
 }
 
-func (s *sender) leaveFastRecovery() {
+func (s *sender) leaveRecovery() {
 	s.fr.active = false
 	s.fr.maxCwnd = 0
 	s.dupAckCount = 0
@@ -1115,57 +1075,6 @@ func (s *sender) leaveFastRecovery() {
 	s.cc.PostRecovery()
 }
 
-func (s *sender) handleFastRecovery(seg *segment) (rtx bool) {
-	ack := seg.ackNumber
-	// We are in fast recovery mode. Ignore the ack if it's out of
-	// range.
-	if !ack.InRange(s.sndUna, s.sndNxt+1) {
-		return false
-	}
-
-	// Leave fast recovery if it acknowledges all the data covered by
-	// this fast recovery session.
-	if s.fr.last.LessThan(ack) {
-		s.leaveFastRecovery()
-		return false
-	}
-
-	if s.ep.sackPermitted {
-		// When SACK is enabled we let retransmission be governed by
-		// the SACK logic.
-		return false
-	}
-
-	// Don't count this as a duplicate if it is carrying data or
-	// updating the window.
-	if seg.logicalLen() != 0 || s.sndWnd != seg.window {
-		return false
-	}
-
-	// Inflate the congestion window if we're getting duplicate acks
-	// for the packet we retransmitted.
-	if ack == s.fr.first {
-		// We received a dup, inflate the congestion window by 1 packet
-		// if we're not at the max yet. Only inflate the window if
-		// regular FastRecovery is in use, RFC6675 does not require
-		// inflating cwnd on duplicate ACKs.
-		if s.sndCwnd < s.fr.maxCwnd {
-			s.sndCwnd++
-		}
-		return false
-	}
-
-	// A partial ack was received. Retransmit this packet and
-	// remember it so that we don't retransmit it again. We don't
-	// inflate the window because we're putting the same packet back
-	// onto the wire.
-	//
-	// N.B. The retransmit timer will be reset by the caller.
-	s.fr.first = ack
-	s.dupAckCount = 0
-	return true
-}
-
 // isAssignedSequenceNumber relies on the fact that we only set flags once a
 // sequencenumber is assigned and that is only done right before we send the
 // segment. As a result any segment that has a non-zero flag has a valid
@@ -1228,14 +1137,11 @@ func (s *sender) SetPipe() {
 	s.outstanding = pipe
 }
 
-// checkDuplicateAck is called when an ack is received. It manages the state
-// related to duplicate acks and determines if a retransmit is needed according
-// to the rules in RFC 6582 (NewReno).
-func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) {
+// detectLoss is called when an ack is received and returns whether a loss is
+// detected. It manages the state related to duplicate acks and determines if
+// a retransmit is needed according to the rules in RFC 6582 (NewReno).
+func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) {
 	ack := seg.ackNumber
-	if s.fr.active {
-		return s.handleFastRecovery(seg)
-	}
 
 	// We're not in fast recovery yet. A segment is considered a duplicate
 	// only if it doesn't carry any data and doesn't update the send window,
@@ -1266,14 +1172,14 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) {
 	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
 	//
 	// We only do the check here, the incrementing of last to the highest
-	// sequence number transmitted till now is done when enterFastRecovery
+	// sequence number transmitted till now is done when enterRecovery
 	// is invoked.
 	if !s.fr.last.LessThan(seg.ackNumber) {
 		s.dupAckCount = 0
 		return false
 	}
 	s.cc.HandleNDupAcks()
-	s.enterFastRecovery()
+	s.enterRecovery()
 	s.dupAckCount = 0
 	return true
 }
@@ -1313,6 +1219,7 @@ func (s *sender) walkSACK(rcvdSeg *segment) {
 				s.rc.update(seg, rcvdSeg, s.ep.tsOffset)
 				s.rc.detectReorder(seg)
 				seg.acked = true
+				s.sackedOut += s.pCount(seg, s.maxPayloadSize)
 			}
 			seg = seg.Next()
 		}
@@ -1415,14 +1322,23 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 		s.SetPipe()
 	}
 
-	// Count the duplicates and do the fast retransmit if needed.
-	rtx := s.checkDuplicateAck(rcvdSeg)
+	ack := rcvdSeg.ackNumber
+	fastRetransmit := false
+	// Do not leave fast recovery if the ACK is out of range.
+	if s.fr.active {
+		// Leave fast recovery if it acknowledges all the data covered by
+		// this fast recovery session.
+		if ack.InRange(s.sndUna, s.sndNxt+1) && s.fr.last.LessThan(ack) {
+			s.leaveRecovery()
+		}
+	} else {
+		// Detect loss by counting the duplicates and enter recovery.
+		fastRetransmit = s.detectLoss(rcvdSeg)
+	}
 
 	// Stash away the current window size.
 	s.sndWnd = rcvdSeg.window
 
-	ack := rcvdSeg.ackNumber
-
 	// Disable zero window probing if remote advertizes a non-zero receive
 	// window. This can be with an ACK to the zero window probe (where the
 	// acknumber refers to the already acknowledged byte) OR to any previously
@@ -1477,10 +1393,10 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			datalen := seg.logicalLen()
 
 			if datalen > ackLeft {
-				prevCount := s.pCount(seg)
+				prevCount := s.pCount(seg, s.maxPayloadSize)
 				seg.data.TrimFront(int(ackLeft))
 				seg.sequenceNumber.UpdateForward(ackLeft)
-				s.outstanding -= prevCount - s.pCount(seg)
+				s.outstanding -= prevCount - s.pCount(seg, s.maxPayloadSize)
 				break
 			}
 
@@ -1496,11 +1412,13 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 
 			s.writeList.Remove(seg)
 
-			// If SACK is enabled then Only reduce outstanding if
+			// If SACK is enabled then only reduce outstanding if
 			// the segment was not previously SACKED as these have
 			// already been accounted for in SetPipe().
 			if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
-				s.outstanding -= s.pCount(seg)
+				s.outstanding -= s.pCount(seg, s.maxPayloadSize)
+			} else {
+				s.sackedOut -= s.pCount(seg, s.maxPayloadSize)
 			}
 			seg.decRef()
 			ackLeft -= datalen
@@ -1539,19 +1457,24 @@ func (s *sender) handleRcvdSegment(rcvdSeg *segment) {
 			s.resendTimer.disable()
 		}
 	}
+
 	// Now that we've popped all acknowledged data from the retransmit
 	// queue, retransmit if needed.
-	if rtx {
-		s.resendSegment()
+	if s.fr.active {
+		s.lr.DoRecovery(rcvdSeg, fastRetransmit)
+		// When SACK is enabled, data sending is governed by the
+		// recovery steps A-C in RFC 6675 Section 5.
+		// See: https://tools.ietf.org/html/rfc6675#section-5.
+		if s.ep.sackPermitted {
+			return
+		}
 	}
 
 	// Send more data now that some of the pending data has been ack'd, or
 	// that the window opened up, or the congestion window was inflated due
 	// to a duplicate ack during fast recovery. This will also re-enable
 	// the retransmit timer if needed.
-	if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo {
-		s.sendData()
-	}
+	s.sendData()
 }
 
 // sendSegment sends the specified segment.
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
index ef7f5719f..faf0c0ad7 100644
--- a/pkg/tcpip/transport/tcp/tcp_sack_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -590,3 +590,45 @@ func TestSACKRecovery(t *testing.T) {
 		expected++
 	}
 }
+
+// TestSACKUpdateSackedOut tests that the SackedOut field is updated when a
+// SACK is received.
+func TestSACKUpdateSackedOut(t *testing.T) {
+	c := context.New(t, uint32(mtu))
+	defer c.Cleanup()
+
+	probeDone := make(chan struct{})
+	ackNum := 0
+	c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) {
+		// Validate that the endpoint Sender.SackedOut is what we expect.
+		if state.Sender.SackedOut != 2 && ackNum == 0 {
+			t.Fatalf("SackedOut got updated to wrong value got: %v want: 2", state.Sender.SackedOut)
+		}
+
+		if state.Sender.SackedOut != 0 && ackNum == 1 {
+			t.Fatalf("SackedOut got updated to wrong value got: %v want: 0", state.Sender.SackedOut)
+		}
+		if ackNum > 0 {
+			close(probeDone)
+		}
+		ackNum++
+	})
+	setStackSACKPermitted(t, c, true)
+	createConnectedWithSACKAndTS(c)
+
+	sendAndReceive(t, c, 8)
+
+	// ACK for [3-5] packets.
+	seq := seqnum.Value(context.TestInitialSequenceNumber).Add(1)
+	start := c.IRS.Add(seqnum.Size(1 + 3*maxPayload))
+	bytesRead := 2 * maxPayload
+	end := start.Add(seqnum.Size(bytesRead))
+	c.SendAckWithSACK(seq, bytesRead, []header.SACKBlock{{start, end}})
+
+	bytesRead += 3 * maxPayload
+	c.SendAck(seq, bytesRead)
+
+	// Wait for the probe function to finish processing the ACK before the
+	// test completes.
+	<-probeDone
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
index 9f0fb41e3..351a5e4f5 100644
--- a/pkg/tcpip/transport/tcp/tcp_test.go
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -75,9 +75,6 @@ func TestGiveUpConnect(t *testing.T) {
 
 	// Wait for ep to become writable.
 	<-notifyCh
-	if err := ep.LastError(); err != tcpip.ErrAborted {
-		t.Fatalf("got ep.LastError() = %s, want = %s", err, tcpip.ErrAborted)
-	}
 
 	// Call Connect again to retreive the handshake failure status
 	// and stats updates.
@@ -267,7 +264,7 @@ func TestTCPResetsSentNoICMP(t *testing.T) {
 	}
 
 	// Read outgoing ICMP stats and check no ICMP DstUnreachable was recorded.
-	sent := stats.ICMP.V4PacketsSent
+	sent := stats.ICMP.V4.PacketsSent
 	if got, want := sent.DstUnreachable.Value(), uint64(0); got != want {
 		t.Errorf("got ICMP DstUnreachable.Value() = %d, want = %d", got, want)
 	}
@@ -1935,6 +1932,84 @@ func TestFullWindowReceive(t *testing.T) {
 	)
 }
 
+// TestSmallSegReceiveWindowAdvertisement tests the receive window advertised
+// on receiving segments smaller than the segment overhead. It checks that the
+// right edge of the window does not grow when the endpoint is not being read.
+func TestSmallSegReceiveWindowAdvertisement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	opt := tcpip.TCPReceiveBufferSizeRangeOption{
+		Min:     1,
+		Default: tcp.DefaultReceiveBufferSize,
+		Max:     tcp.DefaultReceiveBufferSize << tcp.FindWndScale(seqnum.Size(tcp.DefaultReceiveBufferSize)),
+	}
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, &opt); err != nil {
+		t.Fatalf("SetTransportProtocolOption(%d, &%#v): %s", tcp.ProtocolNumber, opt, err)
+	}
+
+	c.AcceptWithOptions(tcp.FindWndScale(seqnum.Size(opt.Default)), header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+	// Bump up the receive buffer size such that, when the receive window grows,
+	// the scaled window exceeds maxUint16.
+	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, opt.Max); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed: %s", opt.Max, err)
+	}
+
+	// Keep the payload size < segment overhead and such that it is a multiple
+	// of the window scaled value. This enables the test to perform equality
+	// checks on the incoming receive window.
+	payload := generateRandomPayload(t, (tcp.SegSize-1)&(1<<c.RcvdWindowScale))
+	payloadLen := seqnum.Size(len(payload))
+	iss := seqnum.Value(789)
+	seqNum := iss.Add(1)
+
+	// Send payload to the endpoint and return the advertised receive window
+	// from the endpoint.
+	getIncomingRcvWnd := func() uint32 {
+		c.SendPacket(payload, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			SeqNum:  seqNum,
+			AckNum:  c.IRS.Add(1),
+			Flags:   header.TCPFlagAck,
+			RcvWnd:  30000,
+		})
+		seqNum = seqNum.Add(payloadLen)
+
+		pkt := c.GetPacket()
+		return uint32(header.TCP(header.IPv4(pkt).Payload()).WindowSize()) << c.RcvdWindowScale
+	}
+
+	// Read the advertised receive window with the ACK for payload.
+	rcvWnd := getIncomingRcvWnd()
+
+	// Check if the subsequent ACK to our send has not grown the right edge of
+	// the window.
+	if got, want := getIncomingRcvWnd(), rcvWnd-uint32(len(payload)); got != want {
+		t.Fatalf("got incomingRcvwnd %d want %d", got, want)
+	}
+
+	// Read the data so that the subsequent ACK from the endpoint
+	// grows the right edge of the window.
+	if _, _, err := c.EP.Read(nil); err != nil {
+		t.Fatalf("got Read(nil) = %s", err)
+	}
+
+	// Check if we have received max uint16 as our advertised
+	// scaled window now after a read above.
+	maxRcv := uint32(math.MaxUint16 << c.RcvdWindowScale)
+	if got, want := getIncomingRcvWnd(), maxRcv; got != want {
+		t.Fatalf("got incomingRcvwnd %d want %d", got, want)
+	}
+
+	// Check if the subsequent ACK to our send has not grown the right edge of
+	// the window.
+	if got, want := getIncomingRcvWnd(), maxRcv-uint32(len(payload)); got != want {
+		t.Fatalf("got incomingRcvwnd %d want %d", got, want)
+	}
+}
+
 func TestNoWindowShrinking(t *testing.T) {
 	c := context.New(t, defaultMTU)
 	defer c.Cleanup()
@@ -2532,10 +2607,10 @@ func TestSegmentMerging(t *testing.T) {
 		{
 			"cork",
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOptBool(tcpip.CorkOption, true)
+				ep.SocketOptions().SetCorkOption(true)
 			},
 			func(ep tcpip.Endpoint) {
-				ep.SetSockOptBool(tcpip.CorkOption, false)
+				ep.SocketOptions().SetCorkOption(false)
 			},
 		},
 	}
@@ -2627,7 +2702,7 @@ func TestDelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptBool(tcpip.DelayOption, true)
+	c.EP.SocketOptions().SetDelayOption(true)
 
 	var allData []byte
 	for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
@@ -2675,7 +2750,7 @@ func TestUndelay(t *testing.T) {
 
 	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
 
-	c.EP.SetSockOptBool(tcpip.DelayOption, true)
+	c.EP.SocketOptions().SetDelayOption(true)
 
 	allData := [][]byte{{0}, {1, 2, 3}}
 	for i, data := range allData {
@@ -2708,7 +2783,7 @@ func TestUndelay(t *testing.T) {
 	// Check that we don't get the second packet yet.
 	c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond)
 
-	c.EP.SetSockOptBool(tcpip.DelayOption, false)
+	c.EP.SocketOptions().SetDelayOption(false)
 
 	// Check that data is received.
 	second := c.GetPacket()
@@ -2745,8 +2820,8 @@ func TestMSSNotDelayed(t *testing.T) {
 		fn   func(tcpip.Endpoint)
 	}{
 		{"no-op", func(tcpip.Endpoint) {}},
-		{"delay", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.DelayOption, true) }},
-		{"cork", func(ep tcpip.Endpoint) { ep.SetSockOptBool(tcpip.CorkOption, true) }},
+		{"delay", func(ep tcpip.Endpoint) { ep.SocketOptions().SetDelayOption(true) }},
+		{"cork", func(ep tcpip.Endpoint) { ep.SocketOptions().SetCorkOption(true) }},
 	}
 
 	for _, test := range tests {
@@ -3198,6 +3273,11 @@ loop:
 		case tcpip.ErrWouldBlock:
 			select {
 			case <-ch:
+				// Expect the state to be StateError and subsequent Reads to fail with HardError.
+				if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset {
+					t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
+				}
+				break loop
 			case <-time.After(1 * time.Second):
 				t.Fatalf("Timed out waiting for reset to arrive")
 			}
@@ -3207,14 +3287,10 @@ loop:
 			t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
 		}
 	}
-	// Expect the state to be StateError and subsequent Reads to fail with HardError.
-	if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset {
-		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
-	}
+
 	if tcp.EndpointState(c.EP.State()) != tcp.StateError {
 		t.Fatalf("got EP state is not StateError")
 	}
-
 	if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 {
 		t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got)
 	}
@@ -4150,7 +4226,7 @@ func TestReadAfterClosedState(t *testing.T) {
 
 	// Check that peek works.
 	peekBuf := make([]byte, 10)
-	n, _, err := c.EP.Peek([][]byte{peekBuf})
+	n, err := c.EP.Peek([][]byte{peekBuf})
 	if err != nil {
 		t.Fatalf("Peek failed: %s", err)
 	}
@@ -4176,7 +4252,7 @@ func TestReadAfterClosedState(t *testing.T) {
 		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrClosedForReceive)
 	}
 
-	if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive {
+	if _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive {
 		t.Fatalf("got c.EP.Peek(...) = %s, want = %s", err, tcpip.ErrClosedForReceive)
 	}
 }
@@ -4193,9 +4269,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4205,9 +4279,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4218,9 +4290,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4233,9 +4303,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4246,9 +4314,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4261,9 +4327,7 @@ func TestReusePort(t *testing.T) {
 	if err != nil {
 		t.Fatalf("NewEndpoint failed; %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
-		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
-	}
+	c.EP.SocketOptions().SetReuseAddress(true)
 	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
 		t.Fatalf("Bind failed: %s", err)
 	}
@@ -4656,13 +4720,9 @@ func TestConnectAvoidsBoundPorts(t *testing.T) {
 												switch network {
 												case "ipv4":
 												case "ipv6":
-													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-														t.Fatalf("SetSockOptBool(V6OnlyOption(true)) failed: %s", err)
-													}
+													ep.SocketOptions().SetV6Only(true)
 												case "dual":
-													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil {
-														t.Fatalf("SetSockOptBool(V6OnlyOption(false)) failed: %s", err)
-													}
+													ep.SocketOptions().SetV6Only(false)
 												default:
 													t.Fatalf("unknown network: '%s'", network)
 												}
@@ -4998,9 +5058,7 @@ func TestKeepalive(t *testing.T) {
 	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil {
 		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5): %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
-		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
-	}
+	c.EP.SocketOptions().SetKeepAlive(true)
 
 	// 5 unacked keepalives are sent. ACK each one, and check that the
 	// connection stays alive after 5.
@@ -6118,10 +6176,13 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
 	// Introduce a 25ms latency by delaying the first byte.
 	latency := 25 * time.Millisecond
 	time.Sleep(latency)
-	rawEP.SendPacketWithTS([]byte{1}, tsVal)
+	// Send an initial payload of at least segment overhead size. The receive
+	// window would not grow for smaller segments.
+	rawEP.SendPacketWithTS(make([]byte, tcp.SegSize), tsVal)
 
 	pkt := rawEP.VerifyAndReturnACKWithTS(tsVal)
 	rcvWnd := header.TCP(header.IPv4(pkt).Payload()).WindowSize()
+
 	time.Sleep(25 * time.Millisecond)
 
 	// Allocate a large enough payload for the test.
@@ -6394,10 +6455,7 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcpip.T
 	if err != nil {
 		t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %s", err)
 	}
-	gotDelayOption, err := ep.GetSockOptBool(tcpip.DelayOption)
-	if err != nil {
-		t.Fatalf("ep.GetSockOptBool(tcpip.DelayOption) failed: %s", err)
-	}
+	gotDelayOption := ep.SocketOptions().GetDelayOption()
 	if gotDelayOption != wantDelayOption {
 		t.Errorf("ep.GetSockOptBool(tcpip.DelayOption) got: %t, want: %t", gotDelayOption, wantDelayOption)
 	}
@@ -7250,9 +7308,7 @@ func TestKeepaliveWithUserTimeout(t *testing.T) {
 	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil {
 		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10): %s", err)
 	}
-	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
-		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true): %s", err)
-	}
+	c.EP.SocketOptions().SetKeepAlive(true)
 
 	// Set userTimeout to be the duration to be 1 keepalive
 	// probes. Which means that after the first probe is sent
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
index e6aa4fc4b..ee55f030c 100644
--- a/pkg/tcpip/transport/tcp/testing/context/context.go
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -592,9 +592,7 @@ func (c *Context) CreateV6Endpoint(v6only bool) {
 		c.t.Fatalf("NewEndpoint failed: %v", err)
 	}
 
-	if err := c.EP.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
-		c.t.Fatalf("SetSockOpt failed failed: %v", err)
-	}
+	c.EP.SocketOptions().SetV6Only(v6only)
 }
 
 // GetV6Packet reads a single packet from the link layer endpoint of the context
@@ -637,11 +635,11 @@ func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcp
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
-		PayloadLength: uint16(header.TCPMinimumSize + len(payload)),
-		NextHeader:    uint8(tcp.ProtocolNumber),
-		HopLimit:      65,
-		SrcAddr:       src,
-		DstAddr:       dst,
+		PayloadLength:     uint16(header.TCPMinimumSize + len(payload)),
+		TransportProtocol: tcp.ProtocolNumber,
+		HopLimit:          65,
+		SrcAddr:           src,
+		DstAddr:           dst,
 	})
 
 	// Initialize the TCP header.
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
index c78549424..153e8c950 100644
--- a/pkg/tcpip/transport/udp/BUILD
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -56,6 +56,8 @@ go_test(
         "//pkg/tcpip/network/ipv4",
         "//pkg/tcpip/network/ipv6",
         "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
         "//pkg/waiter",
+        "@com_github_google_go_cmp//cmp:go_default_library",
     ],
 )
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
index 57976d4e3..763d1d654 100644
--- a/pkg/tcpip/transport/udp/endpoint.go
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -16,8 +16,8 @@ package udp
 
 import (
 	"fmt"
+	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/sleep"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
@@ -30,10 +30,11 @@ import (
 // +stateify savable
 type udpPacket struct {
 	udpPacketEntry
-	senderAddress tcpip.FullAddress
-	packetInfo    tcpip.IPPacketInfo
-	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
-	timestamp     int64
+	senderAddress      tcpip.FullAddress
+	destinationAddress tcpip.FullAddress
+	packetInfo         tcpip.IPPacketInfo
+	data               buffer.VectorisedView `state:".(buffer.VectorisedView)"`
+	timestamp          int64
 	// tos stores either the receiveTOS or receiveTClass value.
 	tos uint8
 }
@@ -77,6 +78,7 @@ func (s EndpointState) String() string {
 // +stateify savable
 type endpoint struct {
 	stack.TransportEndpointInfo
+	tcpip.DefaultSocketOptionsHandler
 
 	// The following fields are initialized at creation time and do not
 	// change throughout the lifetime of the endpoint.
@@ -94,21 +96,20 @@ type endpoint struct {
 	rcvClosed     bool
 
 	// The following fields are protected by the mu mutex.
-	mu             sync.RWMutex `state:"nosave"`
-	sndBufSize     int
-	sndBufSizeMax  int
+	mu            sync.RWMutex `state:"nosave"`
+	sndBufSize    int
+	sndBufSizeMax int
+	// state must be read/set using the EndpointState()/setEndpointState()
+	// methods.
 	state          EndpointState
-	route          stack.Route `state:"manual"`
+	route          *stack.Route `state:"manual"`
 	dstPort        uint16
-	v6only         bool
 	ttl            uint8
 	multicastTTL   uint8
 	multicastAddr  tcpip.Address
 	multicastNICID tcpip.NICID
-	multicastLoop  bool
 	portFlags      ports.Flags
 	bindToDevice   tcpip.NICID
-	noChecksum     bool
 
 	lastErrorMu sync.Mutex   `state:"nosave"`
 	lastError   *tcpip.Error `state:".(string)"`
@@ -122,17 +123,6 @@ type endpoint struct {
 	// applied while sending packets. Defaults to 0 as on Linux.
 	sendTOS uint8
 
-	// receiveTOS determines if the incoming IPv4 TOS header field is passed
-	// as ancillary data to ControlMessages on Read.
-	receiveTOS bool
-
-	// receiveTClass determines if the incoming IPv6 TClass header field is
-	// passed as ancillary data to ControlMessages on Read.
-	receiveTClass bool
-
-	// receiveIPPacketInfo determines if the packet info is returned by Read.
-	receiveIPPacketInfo bool
-
 	// shutdownFlags represent the current shutdown state of the endpoint.
 	shutdownFlags tcpip.ShutdownFlags
 
@@ -154,9 +144,6 @@ type endpoint struct {
 	// owner is used to get uid and gid of the packet.
 	owner tcpip.PacketOwner
 
-	// linger is used for SO_LINGER socket option.
-	linger tcpip.LingerOption
-
 	// ops is used to get socket level options.
 	ops tcpip.SocketOptions
 }
@@ -188,13 +175,14 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 		//
 		// Linux defaults to TTL=1.
 		multicastTTL:         1,
-		multicastLoop:        true,
 		rcvBufSizeMax:        32 * 1024,
 		sndBufSizeMax:        32 * 1024,
 		multicastMemberships: make(map[multicastMembership]struct{}),
 		state:                StateInitial,
 		uniqueID:             s.UniqueID(),
 	}
+	e.ops.InitHandler(e)
+	e.ops.SetMulticastLoop(true)
 
 	// Override with stack defaults.
 	var ss stack.SendBufferSizeOption
@@ -210,6 +198,20 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue
 	return e
 }
 
+// setEndpointState updates the state of the endpoint to state atomically. This
+// method is unexported as the only place we should update the state is in this
+// package but we allow the state to be read freely without holding e.mu.
+//
+// Precondition: e.mu must be held to call this method.
+func (e *endpoint) setEndpointState(state EndpointState) {
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+}
+
+// EndpointState returns the current state of the endpoint.
+func (e *endpoint) EndpointState() EndpointState {
+	return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+}
+
 // UniqueID implements stack.TransportEndpoint.UniqueID.
 func (e *endpoint) UniqueID() uint64 {
 	return e.uniqueID
@@ -235,7 +237,7 @@ func (e *endpoint) Close() {
 	e.mu.Lock()
 	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite
 
-	switch e.state {
+	switch e.EndpointState() {
 	case StateBound, StateConnected:
 		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
 		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{})
@@ -258,10 +260,13 @@ func (e *endpoint) Close() {
 	}
 	e.rcvMu.Unlock()
 
-	e.route.Release()
+	if e.route != nil {
+		e.route.Release()
+		e.route = nil
+	}
 
 	// Update the state.
-	e.state = StateClosed
+	e.setEndpointState(StateClosed)
 
 	e.mu.Unlock()
 
@@ -303,24 +308,23 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 		HasTimestamp: true,
 		Timestamp:    p.timestamp,
 	}
-	e.mu.RLock()
-	receiveTOS := e.receiveTOS
-	receiveTClass := e.receiveTClass
-	receiveIPPacketInfo := e.receiveIPPacketInfo
-	e.mu.RUnlock()
-	if receiveTOS {
+	if e.ops.GetReceiveTOS() {
 		cm.HasTOS = true
 		cm.TOS = p.tos
 	}
-	if receiveTClass {
+	if e.ops.GetReceiveTClass() {
 		cm.HasTClass = true
 		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
 		cm.TClass = uint32(p.tos)
 	}
-	if receiveIPPacketInfo {
+	if e.ops.GetReceivePacketInfo() {
 		cm.HasIPPacketInfo = true
 		cm.PacketInfo = p.packetInfo
 	}
+	if e.ops.GetReceiveOriginalDstAddress() {
+		cm.HasOriginalDstAddress = true
+		cm.OriginalDstAddress = p.destinationAddress
+	}
 	return p.data.ToView(), cm, nil
 }
 
@@ -330,7 +334,7 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess
 //
 // Returns true for retry if preparation should be retried.
 func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) {
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial:
 	case StateConnected:
 		return false, nil
@@ -352,7 +356,7 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi
 
 	// The state changed when we released the shared locked and re-acquired
 	// it in exclusive mode. Try again.
-	if e.state != StateInitial {
+	if e.EndpointState() != StateInitial {
 		return true, nil
 	}
 
@@ -367,9 +371,9 @@ func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpi
 // connectRoute establishes a route to the specified interface or the
 // configured multicast interface if no interface is specified and the
 // specified address is a multicast address.
-func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (stack.Route, tcpip.NICID, *tcpip.Error) {
+func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (*stack.Route, tcpip.NICID, *tcpip.Error) {
 	localAddr := e.ID.LocalAddress
-	if isBroadcastOrMulticast(localAddr) {
+	if e.isBroadcastOrMulticast(nicID, netProto, localAddr) {
 		// A packet can only originate from a unicast address (i.e., an interface).
 		localAddr = ""
 	}
@@ -384,9 +388,9 @@ func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netPr
 	}
 
 	// Find a route to the desired destination.
-	r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.multicastLoop)
+	r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.ops.GetMulticastLoop())
 	if err != nil {
-		return stack.Route{}, 0, err
+		return nil, 0, err
 	}
 	return r, nicID, nil
 }
@@ -429,7 +433,13 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	to := opts.To
 
 	e.mu.RLock()
-	defer e.mu.RUnlock()
+	lockReleased := false
+	defer func() {
+		if lockReleased {
+			return
+		}
+		e.mu.RUnlock()
+	}()
 
 	// If we've shutdown with SHUT_WR we are in an invalid state for sending.
 	if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
@@ -448,36 +458,9 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 	}
 
-	var route *stack.Route
-	var resolve func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error)
-	var dstPort uint16
-	if to == nil {
-		route = &e.route
-		dstPort = e.dstPort
-		resolve = func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) {
-			// Promote lock to exclusive if using a shared route, given that it may
-			// need to change in Route.Resolve() call below.
-			e.mu.RUnlock()
-			e.mu.Lock()
-
-			// Recheck state after lock was re-acquired.
-			if e.state != StateConnected {
-				err = tcpip.ErrInvalidEndpointState
-			}
-			if err == nil && route.IsResolutionRequired() {
-				ch, err = route.Resolve(waker)
-			}
-
-			e.mu.Unlock()
-			e.mu.RLock()
-
-			// Recheck state after lock was re-acquired.
-			if e.state != StateConnected {
-				err = tcpip.ErrInvalidEndpointState
-			}
-			return
-		}
-	} else {
+	route := e.route
+	dstPort := e.dstPort
+	if to != nil {
 		// Reject destination address if it goes through a different
 		// NIC than the endpoint was bound to.
 		nicID := to.NIC
@@ -505,9 +488,8 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		}
 		defer r.Release()
 
-		route = &r
+		route = r
 		dstPort = dst.Port
-		resolve = route.Resolve
 	}
 
 	if !e.ops.GetBroadcast() && route.IsOutboundBroadcast() {
@@ -515,7 +497,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 	}
 
 	if route.IsResolutionRequired() {
-		if ch, err := resolve(nil); err != nil {
+		if ch, err := route.Resolve(nil); err != nil {
 			if err == tcpip.ErrWouldBlock {
 				return 0, ch, tcpip.ErrNoLinkAddress
 			}
@@ -541,77 +523,46 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c
 		useDefaultTTL = false
 	}
 
-	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil {
+	localPort := e.ID.LocalPort
+	sendTOS := e.sendTOS
+	owner := e.owner
+	noChecksum := e.SocketOptions().GetNoChecksum()
+	lockReleased = true
+	e.mu.RUnlock()
+
+	// Do not hold the lock when sending: loopback is synchronous, so if the UDP
+	// datagram ends up generating an ICMP response, handling that response can
+	// end up acquiring this endpoint's mutex again using e.mu.RLock() in
+	// endpoint.HandleControlPacket. That results in a deadlock if another caller
+	// is concurrently trying to acquire e.mu in exclusive mode w/ e.mu.Lock(),
+	// because a pending e.mu.Lock() prevents any new read locks to ensure the
+	// exclusive lock can eventually be acquired.
+	//
+	// See: https://golang.org/pkg/sync/#RWMutex for details on why recursive read
+	// locking is prohibited.
+	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), localPort, dstPort, ttl, useDefaultTTL, sendTOS, owner, noChecksum); err != nil {
 		return 0, nil, err
 	}
 	return int64(len(v)), nil, nil
 }
 
 // Peek only returns data from a single datagram, so do nothing here.
-func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
-	return 0, tcpip.ControlMessages{}, nil
+func (e *endpoint) Peek([][]byte) (int64, *tcpip.Error) {
+	return 0, nil
 }
 
-// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
-func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
-	switch opt {
-	case tcpip.MulticastLoopOption:
-		e.mu.Lock()
-		e.multicastLoop = v
-		e.mu.Unlock()
-
-	case tcpip.NoChecksumOption:
-		e.mu.Lock()
-		e.noChecksum = v
-		e.mu.Unlock()
-
-	case tcpip.ReceiveTOSOption:
-		e.mu.Lock()
-		e.receiveTOS = v
-		e.mu.Unlock()
-
-	case tcpip.ReceiveTClassOption:
-		// We only support this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrNotSupported
-		}
-
-		e.mu.Lock()
-		e.receiveTClass = v
-		e.mu.Unlock()
-
-	case tcpip.ReceiveIPPacketInfoOption:
-		e.mu.Lock()
-		e.receiveIPPacketInfo = v
-		e.mu.Unlock()
-
-	case tcpip.ReuseAddressOption:
-		e.mu.Lock()
-		e.portFlags.MostRecent = v
-		e.mu.Unlock()
-
-	case tcpip.ReusePortOption:
-		e.mu.Lock()
-		e.portFlags.LoadBalanced = v
-		e.mu.Unlock()
-
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return tcpip.ErrInvalidEndpointState
-		}
-
-		e.mu.Lock()
-		defer e.mu.Unlock()
-
-		// We only allow this to be set when we're in the initial state.
-		if e.state != StateInitial {
-			return tcpip.ErrInvalidEndpointState
-		}
+// OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet.
+func (e *endpoint) OnReuseAddressSet(v bool) {
+	e.mu.Lock()
+	e.portFlags.MostRecent = v
+	e.mu.Unlock()
+}
 
-		e.v6only = v
-	}
-	return nil
+// OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet.
+func (e *endpoint) OnReusePortSet(v bool) {
+	e.mu.Lock()
+	e.portFlags.LoadBalanced = v
+	e.mu.Unlock()
 }
 
 // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
@@ -814,90 +765,10 @@ func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) *tcpip.Error {
 
 	case *tcpip.SocketDetachFilterOption:
 		return nil
-
-	case *tcpip.LingerOption:
-		e.mu.Lock()
-		e.linger = *v
-		e.mu.Unlock()
 	}
 	return nil
 }
 
-// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
-func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
-	switch opt {
-	case tcpip.KeepaliveEnabledOption:
-		return false, nil
-
-	case tcpip.MulticastLoopOption:
-		e.mu.RLock()
-		v := e.multicastLoop
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.NoChecksumOption:
-		e.mu.RLock()
-		v := e.noChecksum
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReceiveTOSOption:
-		e.mu.RLock()
-		v := e.receiveTOS
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReceiveTClassOption:
-		// We only support this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return false, tcpip.ErrNotSupported
-		}
-
-		e.mu.RLock()
-		v := e.receiveTClass
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReceiveIPPacketInfoOption:
-		e.mu.RLock()
-		v := e.receiveIPPacketInfo
-		e.mu.RUnlock()
-		return v, nil
-
-	case tcpip.ReuseAddressOption:
-		e.mu.RLock()
-		v := e.portFlags.MostRecent
-		e.mu.RUnlock()
-
-		return v, nil
-
-	case tcpip.ReusePortOption:
-		e.mu.RLock()
-		v := e.portFlags.LoadBalanced
-		e.mu.RUnlock()
-
-		return v, nil
-
-	case tcpip.V6OnlyOption:
-		// We only recognize this option on v6 endpoints.
-		if e.NetProto != header.IPv6ProtocolNumber {
-			return false, tcpip.ErrUnknownProtocolOption
-		}
-
-		e.mu.RLock()
-		v := e.v6only
-		e.mu.RUnlock()
-
-		return v, nil
-
-	case tcpip.AcceptConnOption:
-		return false, nil
-
-	default:
-		return false, tcpip.ErrUnknownProtocolOption
-	}
-}
-
 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
 	switch opt {
@@ -972,11 +843,6 @@ func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) *tcpip.Error {
 		*o = tcpip.BindToDeviceOption(e.bindToDevice)
 		e.mu.RUnlock()
 
-	case *tcpip.LingerOption:
-		e.mu.RLock()
-		*o = e.linger
-		e.mu.RUnlock()
-
 	default:
 		return tcpip.ErrUnknownProtocolOption
 	}
@@ -1036,7 +902,7 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u
 // checkV4MappedLocked determines the effective network protocol and converts
 // addr to its canonical form.
 func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
-	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
+	unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only())
 	if err != nil {
 		return tcpip.FullAddress{}, 0, err
 	}
@@ -1048,7 +914,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 	e.mu.Lock()
 	defer e.mu.Unlock()
 
-	if e.state != StateConnected {
+	if e.EndpointState() != StateConnected {
 		return nil
 	}
 	var (
@@ -1071,7 +937,7 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 		if err != nil {
 			return err
 		}
-		e.state = StateBound
+		e.setEndpointState(StateBound)
 		boundPortFlags = e.boundPortFlags
 	} else {
 		if e.ID.LocalPort != 0 {
@@ -1079,14 +945,14 @@ func (e *endpoint) Disconnect() *tcpip.Error {
 			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{})
 			e.boundPortFlags = ports.Flags{}
 		}
-		e.state = StateInitial
+		e.setEndpointState(StateInitial)
 	}
 
 	e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice)
 	e.ID = id
 	e.boundBindToDevice = btd
 	e.route.Release()
-	e.route = stack.Route{}
+	e.route = nil
 	e.dstPort = 0
 
 	return nil
@@ -1104,7 +970,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 
 	nicID := addr.NIC
 	var localPort uint16
-	switch e.state {
+	switch e.EndpointState() {
 	case StateInitial:
 	case StateBound, StateConnected:
 		localPort = e.ID.LocalPort
@@ -1139,7 +1005,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 		RemoteAddress: r.RemoteAddress,
 	}
 
-	if e.state == StateInitial {
+	if e.EndpointState() == StateInitial {
 		id.LocalAddress = r.LocalAddress
 	}
 
@@ -1147,7 +1013,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	// packets on a different network protocol, so we register both even if
 	// v6only is set to false and this is an ipv6 endpoint.
 	netProtos := []tcpip.NetworkProtocolNumber{netProto}
-	if netProto == header.IPv6ProtocolNumber && !e.v6only {
+	if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() {
 		netProtos = []tcpip.NetworkProtocolNumber{
 			header.IPv4ProtocolNumber,
 			header.IPv6ProtocolNumber,
@@ -1173,7 +1039,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
 	e.RegisterNICID = nicID
 	e.effectiveNetProtos = netProtos
 
-	e.state = StateConnected
+	e.setEndpointState(StateConnected)
 
 	e.rcvMu.Lock()
 	e.rcvReady = true
@@ -1195,7 +1061,7 @@ func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
 
 	// A socket in the bound state can still receive multicast messages,
 	// so we need to notify waiters on shutdown.
-	if e.state != StateBound && e.state != StateConnected {
+	if state := e.EndpointState(); state != StateBound && state != StateConnected {
 		return tcpip.ErrNotConnected
 	}
 
@@ -1246,7 +1112,7 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ
 func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	// Don't allow binding once endpoint is not in the initial state
 	// anymore.
-	if e.state != StateInitial {
+	if e.EndpointState() != StateInitial {
 		return tcpip.ErrInvalidEndpointState
 	}
 
@@ -1259,7 +1125,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
 	// set to false.
 	netProtos := []tcpip.NetworkProtocolNumber{netProto}
-	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
+	if netProto == header.IPv6ProtocolNumber && !e.ops.GetV6Only() && addr.Addr == "" {
 		netProtos = []tcpip.NetworkProtocolNumber{
 			header.IPv6ProtocolNumber,
 			header.IPv4ProtocolNumber,
@@ -1267,7 +1133,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	}
 
 	nicID := addr.NIC
-	if len(addr.Addr) != 0 && !isBroadcastOrMulticast(addr.Addr) {
+	if len(addr.Addr) != 0 && !e.isBroadcastOrMulticast(addr.NIC, netProto, addr.Addr) {
 		// A local unicast address was specified, verify that it's valid.
 		nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
 		if nicID == 0 {
@@ -1290,7 +1156,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
 	e.effectiveNetProtos = netProtos
 
 	// Mark endpoint as bound.
-	e.state = StateBound
+	e.setEndpointState(StateBound)
 
 	e.rcvMu.Lock()
 	e.rcvReady = true
@@ -1322,7 +1188,7 @@ func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
 	defer e.mu.RUnlock()
 
 	addr := e.ID.LocalAddress
-	if e.state == StateConnected {
+	if e.EndpointState() == StateConnected {
 		addr = e.route.LocalAddress
 	}
 
@@ -1338,7 +1204,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
 	e.mu.RLock()
 	defer e.mu.RUnlock()
 
-	if e.state != StateConnected {
+	if e.EndpointState() != StateConnected {
 		return tcpip.FullAddress{}, tcpip.ErrNotConnected
 	}
 
@@ -1393,7 +1259,6 @@ func verifyChecksum(hdr header.UDP, pkt *stack.PacketBuffer) bool {
 // HandlePacket is called by the stack when new packets arrive to this transport
 // endpoint.
 func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
-	// Get the header then trim it from the view.
 	hdr := header.UDP(pkt.TransportHeader().View())
 	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
 		// Malformed packet.
@@ -1402,6 +1267,10 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB
 		return
 	}
 
+	// TODO(gvisor.dev/issues/5033): We should mirror the Network layer and cap
+	// packets at "Parse" instead of when handling a packet.
+	pkt.Data.CapLength(int(hdr.PayloadLength()))
+
 	if !verifyChecksum(hdr, pkt) {
 		// Checksum Error.
 		e.stack.Stats().UDP.ChecksumErrors.Increment()
@@ -1435,7 +1304,12 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB
 		senderAddress: tcpip.FullAddress{
 			NIC:  pkt.NICID,
 			Addr: id.RemoteAddress,
-			Port: header.UDP(hdr).SourcePort(),
+			Port: hdr.SourcePort(),
+		},
+		destinationAddress: tcpip.FullAddress{
+			NIC:  pkt.NICID,
+			Addr: id.LocalAddress,
+			Port: header.UDP(hdr).DestinationPort(),
 		},
 	}
 	packet.data = pkt.Data
@@ -1470,25 +1344,20 @@ func (e *endpoint) HandlePacket(id stack.TransportEndpointID, pkt *stack.PacketB
 // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
 func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
 	if typ == stack.ControlPortUnreachable {
-		e.mu.RLock()
-		if e.state == StateConnected {
+		if e.EndpointState() == StateConnected {
 			e.lastErrorMu.Lock()
 			e.lastError = tcpip.ErrConnectionRefused
 			e.lastErrorMu.Unlock()
-			e.mu.RUnlock()
 
 			e.waiterQueue.Notify(waiter.EventErr)
 			return
 		}
-		e.mu.RUnlock()
 	}
 }
 
 // State implements tcpip.Endpoint.State.
 func (e *endpoint) State() uint32 {
-	e.mu.Lock()
-	defer e.mu.Unlock()
-	return uint32(e.state)
+	return uint32(e.EndpointState())
 }
 
 // Info returns a copy of the endpoint info.
@@ -1508,14 +1377,16 @@ func (e *endpoint) Stats() tcpip.EndpointStats {
 // Wait implements tcpip.Endpoint.Wait.
 func (*endpoint) Wait() {}
 
-func isBroadcastOrMulticast(a tcpip.Address) bool {
-	return a == header.IPv4Broadcast || header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a)
+func (e *endpoint) isBroadcastOrMulticast(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, addr tcpip.Address) bool {
+	return addr == header.IPv4Broadcast || header.IsV4MulticastAddress(addr) || header.IsV6MulticastAddress(addr) || e.stack.IsSubnetBroadcast(nicID, netProto, addr)
 }
 
+// SetOwner implements tcpip.Endpoint.SetOwner.
 func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
 	e.owner = owner
 }
 
+// SocketOptions implements tcpip.Endpoint.SocketOptions.
 func (e *endpoint) SocketOptions() *tcpip.SocketOptions {
 	return &e.ops
 }
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
index 858c99a45..13b72dc88 100644
--- a/pkg/tcpip/transport/udp/endpoint_state.go
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -98,7 +98,8 @@ func (e *endpoint) Resume(s *stack.Stack) {
 		}
 	}
 
-	if e.state != StateBound && e.state != StateConnected {
+	state := e.EndpointState()
+	if state != StateBound && state != StateConnected {
 		return
 	}
 
@@ -113,12 +114,12 @@ func (e *endpoint) Resume(s *stack.Stack) {
 	}
 
 	var err *tcpip.Error
-	if e.state == StateConnected {
-		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.multicastLoop)
+	if state == StateConnected {
+		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.ops.GetMulticastLoop())
 		if err != nil {
 			panic(err)
 		}
-	} else if len(e.ID.LocalAddress) != 0 && !isBroadcastOrMulticast(e.ID.LocalAddress) { // stateBound
+	} else if len(e.ID.LocalAddress) != 0 && !e.isBroadcastOrMulticast(e.RegisterNICID, netProto, e.ID.LocalAddress) { // stateBound
 		// A local unicast address is specified, verify that it's valid.
 		if e.stack.CheckLocalAddress(e.RegisterNICID, netProto, e.ID.LocalAddress) == 0 {
 			panic(tcpip.ErrBadLocalAddress)
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
index 764ad0857..08980c298 100644
--- a/pkg/tcpip/transport/udp/udp_test.go
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -22,6 +22,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/tcpip"
 	"gvisor.dev/gvisor/pkg/tcpip/buffer"
 	"gvisor.dev/gvisor/pkg/tcpip/checker"
@@ -32,6 +33,7 @@ import (
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
 	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
 	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
@@ -54,6 +56,7 @@ const (
 	stackPort       = 1234
 	testAddr        = "\x0a\x00\x00\x02"
 	testPort        = 4096
+	invalidPort     = 8192
 	multicastAddr   = "\xe8\x2b\xd3\xea"
 	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
 	broadcastAddr   = header.IPv4Broadcast
@@ -295,7 +298,8 @@ func newDualTestContext(t *testing.T, mtu uint32) *testContext {
 	t.Helper()
 	return newDualTestContextWithOptions(t, mtu, stack.Options{
 		NetworkProtocols:   []stack.NetworkProtocolFactory{ipv4.NewProtocol, ipv6.NewProtocol},
-		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol},
+		TransportProtocols: []stack.TransportProtocolFactory{udp.NewProtocol, icmp.NewProtocol6, icmp.NewProtocol4},
+		HandleLocal:        true,
 	})
 }
 
@@ -360,9 +364,7 @@ func (c *testContext) createEndpointForFlow(flow testFlow) {
 
 	c.createEndpoint(flow.sockProto())
 	if flow.isV6Only() {
-		if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
-			c.t.Fatalf("SetSockOptBool failed: %s", err)
-		}
+		c.ep.SocketOptions().SetV6Only(true)
 	} else if flow.isBroadcast() {
 		c.ep.SocketOptions().SetBroadcast(true)
 	}
@@ -451,12 +453,12 @@ func (c *testContext) buildV6Packet(payload []byte, h *header4Tuple) buffer.View
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
-		TrafficClass:  testTOS,
-		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
-		NextHeader:    uint8(udp.ProtocolNumber),
-		HopLimit:      65,
-		SrcAddr:       h.srcAddr.Addr,
-		DstAddr:       h.dstAddr.Addr,
+		TrafficClass:      testTOS,
+		PayloadLength:     uint16(header.UDPMinimumSize + len(payload)),
+		TransportProtocol: udp.ProtocolNumber,
+		HopLimit:          65,
+		SrcAddr:           h.srcAddr.Addr,
+		DstAddr:           h.dstAddr.Addr,
 	})
 
 	// Initialize the UDP header.
@@ -972,7 +974,7 @@ func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) {
 // provided.
 func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
 	c.t.Helper()
-	return testWriteInternal(c, flow, true, checkers...)
+	return testWriteAndVerifyInternal(c, flow, true, checkers...)
 }
 
 // testWriteWithoutDestination sends a packet of the given test flow from the
@@ -981,10 +983,10 @@ func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker
 // checker functions provided.
 func testWriteWithoutDestination(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
 	c.t.Helper()
-	return testWriteInternal(c, flow, false, checkers...)
+	return testWriteAndVerifyInternal(c, flow, false, checkers...)
 }
 
-func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 {
+func testWriteNoVerify(c *testContext, flow testFlow, setDest bool) buffer.View {
 	c.t.Helper()
 	// Take a snapshot of the stats to validate them at the end of the test.
 	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
@@ -1006,6 +1008,12 @@ func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...
 		c.t.Fatalf("Bad number of bytes written: got %v, want %v", n, len(payload))
 	}
 	c.checkEndpointWriteStats(1, epstats, err)
+	return payload
+}
+
+func testWriteAndVerifyInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 {
+	c.t.Helper()
+	payload := testWriteNoVerify(c, flow, setDest)
 	// Received the packet and check the payload.
 	b := c.getPacketAndVerify(flow, checkers...)
 	var udp header.UDP
@@ -1150,6 +1158,39 @@ func TestV4WriteOnConnected(t *testing.T) {
 	testWriteWithoutDestination(c, unicastV4)
 }
 
+func TestWriteOnConnectedInvalidPort(t *testing.T) {
+	protocols := map[string]tcpip.NetworkProtocolNumber{
+		"ipv4": ipv4.ProtocolNumber,
+		"ipv6": ipv6.ProtocolNumber,
+	}
+	for name, pn := range protocols {
+		t.Run(name, func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpoint(pn)
+			if err := c.ep.Connect(tcpip.FullAddress{Addr: stackAddr, Port: invalidPort}); err != nil {
+				c.t.Fatalf("Connect failed: %s", err)
+			}
+			writeOpts := tcpip.WriteOptions{
+				To: &tcpip.FullAddress{Addr: stackAddr, Port: invalidPort},
+			}
+			payload := buffer.View(newPayload())
+			n, _, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts)
+			if err != nil {
+				c.t.Fatalf("c.ep.Write(...) = %+s, want nil", err)
+			}
+			if got, want := n, int64(len(payload)); got != want {
+				c.t.Fatalf("c.ep.Write(...) wrote %d bytes, want %d bytes", got, want)
+			}
+
+			if err := c.ep.LastError(); err != tcpip.ErrConnectionRefused {
+				c.t.Fatalf("expected c.ep.LastError() == ErrConnectionRefused, got: %+v", err)
+			}
+		})
+	}
+}
+
 // TestWriteOnBoundToV4Multicast checks that we can send packets out of a socket
 // that is bound to a V4 multicast address.
 func TestWriteOnBoundToV4Multicast(t *testing.T) {
@@ -1372,9 +1413,7 @@ func TestReadIPPacketInfo(t *testing.T) {
 				}
 			}
 
-			if err := c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true); err != nil {
-				t.Fatalf("c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true): %s", err)
-			}
+			c.ep.SocketOptions().SetReceivePacketInfo(true)
 
 			testRead(c, test.flow, checker.ReceiveIPPacketInfo(tcpip.IPPacketInfo{
 				NIC:             1,
@@ -1389,6 +1428,93 @@ func TestReadIPPacketInfo(t *testing.T) {
 	}
 }
 
+func TestReadRecvOriginalDstAddr(t *testing.T) {
+	tests := []struct {
+		name                    string
+		proto                   tcpip.NetworkProtocolNumber
+		flow                    testFlow
+		expectedOriginalDstAddr tcpip.FullAddress
+	}{
+		{
+			name:                    "IPv4 unicast",
+			proto:                   header.IPv4ProtocolNumber,
+			flow:                    unicastV4,
+			expectedOriginalDstAddr: tcpip.FullAddress{1, stackAddr, stackPort},
+		},
+		{
+			name:  "IPv4 multicast",
+			proto: header.IPv4ProtocolNumber,
+			flow:  multicastV4,
+			// This should actually be a unicast address assigned to the interface.
+			//
+			// TODO(gvisor.dev/issue/3556): This check is validating incorrect
+			// behaviour. We still include the test so that once the bug is
+			// resolved, this test will start to fail and the individual tasked
+			// with fixing this bug knows to also fix this test :).
+			expectedOriginalDstAddr: tcpip.FullAddress{1, multicastAddr, stackPort},
+		},
+		{
+			name:  "IPv4 broadcast",
+			proto: header.IPv4ProtocolNumber,
+			flow:  broadcast,
+			// This should actually be a unicast address assigned to the interface.
+			//
+			// TODO(gvisor.dev/issue/3556): This check is validating incorrect
+			// behaviour. We still include the test so that once the bug is
+			// resolved, this test will start to fail and the individual tasked
+			// with fixing this bug knows to also fix this test :).
+			expectedOriginalDstAddr: tcpip.FullAddress{1, broadcastAddr, stackPort},
+		},
+		{
+			name:                    "IPv6 unicast",
+			proto:                   header.IPv6ProtocolNumber,
+			flow:                    unicastV6,
+			expectedOriginalDstAddr: tcpip.FullAddress{1, stackV6Addr, stackPort},
+		},
+		{
+			name:  "IPv6 multicast",
+			proto: header.IPv6ProtocolNumber,
+			flow:  multicastV6,
+			// This should actually be a unicast address assigned to the interface.
+			//
+			// TODO(gvisor.dev/issue/3556): This check is validating incorrect
+			// behaviour. We still include the test so that once the bug is
+			// resolved, this test will start to fail and the individual tasked
+			// with fixing this bug knows to also fix this test :).
+			expectedOriginalDstAddr: tcpip.FullAddress{1, multicastV6Addr, stackPort},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpoint(test.proto)
+
+			bindAddr := tcpip.FullAddress{Port: stackPort}
+			if err := c.ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%#v): %s", bindAddr, err)
+			}
+
+			if test.flow.isMulticast() {
+				ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: test.flow.getMcastAddr()}
+				if err := c.ep.SetSockOpt(&ifoptSet); err != nil {
+					c.t.Fatalf("SetSockOpt(&%#v): %s:", ifoptSet, err)
+				}
+			}
+
+			c.ep.SocketOptions().SetReceiveOriginalDstAddress(true)
+
+			testRead(c, test.flow, checker.ReceiveOriginalDstAddr(test.expectedOriginalDstAddr))
+
+			if got := c.s.Stats().UDP.PacketsReceived.Value(); got != 1 {
+				t.Fatalf("Read did not increment PacketsReceived: got = %d, want = 1", got)
+			}
+		})
+	}
+}
+
 func TestWriteIncrementsPacketsSent(t *testing.T) {
 	c := newDualTestContext(t, defaultMTU)
 	defer c.cleanup()
@@ -1412,16 +1538,12 @@ func TestNoChecksum(t *testing.T) {
 			c.createEndpointForFlow(flow)
 
 			// Disable the checksum generation.
-			if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, true); err != nil {
-				t.Fatalf("SetSockOptBool failed: %s", err)
-			}
+			c.ep.SocketOptions().SetNoChecksum(true)
 			// This option is effective on IPv4 only.
 			testWrite(c, flow, checker.UDP(checker.NoChecksum(flow.isV4())))
 
 			// Enable the checksum generation.
-			if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, false); err != nil {
-				t.Fatalf("SetSockOptBool failed: %s", err)
-			}
+			c.ep.SocketOptions().SetNoChecksum(false)
 			testWrite(c, flow, checker.UDP(checker.NoChecksum(false)))
 		})
 	}
@@ -1591,13 +1713,15 @@ func TestSetTClass(t *testing.T) {
 }
 
 func TestReceiveTosTClass(t *testing.T) {
+	const RcvTOSOpt = "ReceiveTosOption"
+	const RcvTClassOpt = "ReceiveTClassOption"
+
 	testCases := []struct {
-		name             string
-		getReceiveOption tcpip.SockOptBool
-		tests            []testFlow
+		name  string
+		tests []testFlow
 	}{
-		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
-		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+		{RcvTOSOpt, []testFlow{unicastV4, broadcast}},
+		{RcvTClassOpt, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
 	}
 	for _, testCase := range testCases {
 		for _, flow := range testCase.tests {
@@ -1606,29 +1730,32 @@ func TestReceiveTosTClass(t *testing.T) {
 				defer c.cleanup()
 
 				c.createEndpointForFlow(flow)
-				option := testCase.getReceiveOption
 				name := testCase.name
 
-				// Verify that setting and reading the option works.
-				v, err := c.ep.GetSockOptBool(option)
-				if err != nil {
-					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
+				var optionGetter func() bool
+				var optionSetter func(bool)
+				switch name {
+				case RcvTOSOpt:
+					optionGetter = c.ep.SocketOptions().GetReceiveTOS
+					optionSetter = c.ep.SocketOptions().SetReceiveTOS
+				case RcvTClassOpt:
+					optionGetter = c.ep.SocketOptions().GetReceiveTClass
+					optionSetter = c.ep.SocketOptions().SetReceiveTClass
+				default:
+					t.Fatalf("unknown test variant: %s", name)
 				}
+
+				// Verify that setting and reading the option works.
+				v := optionGetter()
 				// Test for expected default value.
 				if v != false {
 					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
 				}
 
 				want := true
-				if err := c.ep.SetSockOptBool(option, want); err != nil {
-					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
-				}
-
-				got, err := c.ep.GetSockOptBool(option)
-				if err != nil {
-					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
-				}
+				optionSetter(want)
 
+				got := optionGetter()
 				if got != want {
 					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
 				}
@@ -1638,10 +1765,10 @@ func TestReceiveTosTClass(t *testing.T) {
 				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
 					c.t.Fatalf("Bind failed: %s", err)
 				}
-				switch option {
-				case tcpip.ReceiveTClassOption:
+				switch name {
+				case RcvTClassOpt:
 					testRead(c, flow, checker.ReceiveTClass(testTOS))
-				case tcpip.ReceiveTOSOption:
+				case RcvTOSOpt:
 					testRead(c, flow, checker.ReceiveTOS(testTOS))
 				default:
 					t.Fatalf("unknown test variant: %s", name)
@@ -1788,27 +1915,31 @@ func TestV4UnknownDestination(t *testing.T) {
 			icmpPkt := header.ICMPv4(hdr.Payload())
 			payloadIPHeader := header.IPv4(icmpPkt.Payload())
 			incomingHeaderLength := header.IPv4MinimumSize + header.UDPMinimumSize
-			wantLen := len(payload)
+			wantPayloadLen := len(payload)
 			if tc.largePayload {
 				// To work out the data size we need to simulate what the sender would
 				// have done. The wanted size is the total available minus the sum of
 				// the headers in the UDP AND ICMP packets, given that we know the test
 				// had only a minimal IP header but the ICMP sender will have allowed
 				// for a maximally sized packet header.
-				wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MaximumHeaderSize - header.ICMPv4MinimumSize - incomingHeaderLength
+				wantPayloadLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MaximumHeaderSize - header.ICMPv4MinimumSize - incomingHeaderLength
 			}
 
 			// In the case of large payloads the IP packet may be truncated. Update
 			// the length field before retrieving the udp datagram payload.
 			// Add back the two headers within the payload.
-			payloadIPHeader.SetTotalLength(uint16(wantLen + incomingHeaderLength))
-
+			payloadIPHeader.SetTotalLength(uint16(wantPayloadLen + incomingHeaderLength))
 			origDgram := header.UDP(payloadIPHeader.Payload())
-			if got, want := len(origDgram.Payload()), wantLen; got != want {
-				t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
+			wantDgramLen := wantPayloadLen + header.UDPMinimumSize
+
+			if got, want := len(origDgram), wantDgramLen; got != want {
+				t.Fatalf("got len(origDgram) = %d, want = %d", got, want)
 			}
-			if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
-				t.Fatalf("unexpected payload got: %d, want: %d", got, want)
+			// Correct UDP length to access payload.
+			origDgram.SetLength(uint16(wantDgramLen))
+
+			if got, want := origDgram.Payload(), payload[:wantPayloadLen]; !bytes.Equal(got, want) {
+				t.Fatalf("got origDgram.Payload() = %x, want = %x", got, want)
 			}
 		})
 	}
@@ -1883,20 +2014,23 @@ func TestV6UnknownDestination(t *testing.T) {
 
 			icmpPkt := header.ICMPv6(hdr.Payload())
 			payloadIPHeader := header.IPv6(icmpPkt.Payload())
-			wantLen := len(payload)
+			wantPayloadLen := len(payload)
 			if tc.largePayload {
-				wantLen = header.IPv6MinimumMTU - header.IPv6MinimumSize*2 - header.ICMPv6MinimumSize - header.UDPMinimumSize
+				wantPayloadLen = header.IPv6MinimumMTU - header.IPv6MinimumSize*2 - header.ICMPv6MinimumSize - header.UDPMinimumSize
 			}
+			wantDgramLen := wantPayloadLen + header.UDPMinimumSize
 			// In case of large payloads the IP packet may be truncated. Update
 			// the length field before retrieving the udp datagram payload.
-			payloadIPHeader.SetPayloadLength(uint16(wantLen + header.UDPMinimumSize))
+			payloadIPHeader.SetPayloadLength(uint16(wantDgramLen))
 
 			origDgram := header.UDP(payloadIPHeader.Payload())
-			if got, want := len(origDgram.Payload()), wantLen; got != want {
-				t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
+			if got, want := len(origDgram), wantPayloadLen+header.UDPMinimumSize; got != want {
+				t.Fatalf("got len(origDgram) = %d, want = %d", got, want)
 			}
-			if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
-				t.Fatalf("unexpected payload got: %v, want: %v", got, want)
+			// Correct UDP length to access payload.
+			origDgram.SetLength(uint16(wantPayloadLen + header.UDPMinimumSize))
+			if diff := cmp.Diff(payload[:wantPayloadLen], origDgram.Payload()); diff != "" {
+				t.Fatalf("origDgram.Payload() mismatch (-want +got):\n%s", diff)
 			}
 		})
 	}
@@ -1955,12 +2089,12 @@ func TestShortHeader(t *testing.T) {
 	// Initialize the IP header.
 	ip := header.IPv6(buf)
 	ip.Encode(&header.IPv6Fields{
-		TrafficClass:  testTOS,
-		PayloadLength: uint16(udpSize),
-		NextHeader:    uint8(udp.ProtocolNumber),
-		HopLimit:      65,
-		SrcAddr:       h.srcAddr.Addr,
-		DstAddr:       h.dstAddr.Addr,
+		TrafficClass:      testTOS,
+		PayloadLength:     uint16(udpSize),
+		TransportProtocol: udp.ProtocolNumber,
+		HopLimit:          65,
+		SrcAddr:           h.srcAddr.Addr,
+		DstAddr:           h.dstAddr.Addr,
 	})
 
 	// Initialize the UDP header.
@@ -2409,3 +2543,73 @@ func TestOutgoingSubnetBroadcast(t *testing.T) {
 		})
 	}
 }
+
+// TestReceiveShortLength verifies that bytes trailing the UDP datagram inside an
+// IP packet, i.e. bytes covered by the IP length but not by the UDP length field,
+// are not delivered to the reader. It covers IPv4 and IPv6 unicast flows.
+func TestReceiveShortLength(t *testing.T) {
+	flows := []testFlow{unicastV4, unicastV6}
+	for _, flow := range flows {
+		t.Run(flow.String(), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			// Bind to wildcard.
+			bindAddr := tcpip.FullAddress{Port: stackPort}
+			if err := c.ep.Bind(bindAddr); err != nil {
+				c.t.Fatalf("c.ep.Bind(%#v): %s", bindAddr, err)
+			}
+
+			payload := newPayload()
+			extraBytes := []byte{1, 2, 3, 4}
+			h := flow.header4Tuple(incoming)
+			var buf buffer.View
+			var proto tcpip.NetworkProtocolNumber
+
+			// Build packets with extra bytes not accounted for in the UDP length
+			// field.
+			var udpHdr header.UDP
+			if flow.isV4() {
+				buf = c.buildV4Packet(payload, &h)
+				buf = append(buf, extraBytes...)
+				ip := header.IPv4(buf)
+				ip.SetTotalLength(ip.TotalLength() + uint16(len(extraBytes)))
+				// The total length changed, so recompute the header checksum.
+				ip.SetChecksum(0)
+				ip.SetChecksum(^ip.CalculateChecksum())
+				proto = ipv4.ProtocolNumber
+				udpHdr = ip.Payload()
+			} else {
+				buf = c.buildV6Packet(payload, &h)
+				buf = append(buf, extraBytes...)
+				ip := header.IPv6(buf)
+				ip.SetPayloadLength(ip.PayloadLength() + uint16(len(extraBytes)))
+				proto = ipv6.ProtocolNumber
+				udpHdr = ip.Payload()
+			}
+
+			// Sanity check the fixture: the UDP header must still describe only
+			// the original payload, not the extra bytes.
+			if diff := cmp.Diff(payload, udpHdr.Payload()); diff != "" {
+				t.Errorf("udp.Payload() mismatch (-want +got):\n%s", diff)
+			}
+
+			c.linkEP.InjectInbound(proto, stack.NewPacketBuffer(stack.PacketBufferOptions{
+				Data: buf.ToVectorisedView(),
+			}))
+
+			// Try to receive the data.
+			v, _, err := c.ep.Read(nil)
+			if err != nil {
+				t.Fatalf("c.ep.Read(nil): %s", err)
+			}
+
+			// Check the payload is read back without extra bytes.
+			if diff := cmp.Diff(buffer.View(payload), v); diff != "" {
+				t.Errorf("c.ep.Read(nil) mismatch (-want +got):\n%s", diff)
+			}
+		})
+	}
+}