Diffstat (limited to 'pkg/tcpip')
133 files changed, 16986 insertions, 3610 deletions
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD index e57d45f2a..a984f1712 100644 --- a/pkg/tcpip/adapters/gonet/BUILD +++ b/pkg/tcpip/adapters/gonet/BUILD @@ -22,7 +22,6 @@ go_test( size = "small", srcs = ["gonet_test.go"], library = ":gonet", - tags = ["flaky"], deps = [ "//pkg/tcpip", "//pkg/tcpip/header", diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go index 6e0db2741..d82ed5205 100644 --- a/pkg/tcpip/adapters/gonet/gonet.go +++ b/pkg/tcpip/adapters/gonet/gonet.go @@ -335,6 +335,11 @@ func (c *TCPConn) Read(b []byte) (int, error) { deadline := c.readCancel() numRead := 0 + defer func() { + if numRead != 0 { + c.ep.ModerateRecvBuf(numRead) + } + }() for numRead != len(b) { if len(c.read) == 0 { var err error diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go index 9a3c5d6c3..ea0c5413d 100644 --- a/pkg/tcpip/buffer/view.go +++ b/pkg/tcpip/buffer/view.go @@ -65,6 +65,16 @@ func (v View) ToVectorisedView() VectorisedView { return NewVectorisedView(len(v), []View{v}) } +// IsEmpty returns whether v is of length zero. +func (v View) IsEmpty() bool { + return len(v) == 0 +} + +// Size returns the length of v. +func (v View) Size() int { + return len(v) +} + // VectorisedView is a vectorised version of View using non contiguous memory. // It supports all the convenience methods supported by View. // diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD index ed434807f..c984470e6 100644 --- a/pkg/tcpip/checker/BUILD +++ b/pkg/tcpip/checker/BUILD @@ -12,5 +12,6 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/seqnum", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go index c1745ba6a..1e5f5abf2 100644 --- a/pkg/tcpip/checker/checker.go +++ b/pkg/tcpip/checker/checker.go @@ -21,6 +21,7 @@ import ( "reflect" "testing" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -169,10 +170,9 @@ func ReceiveTClass(want uint32) ControlMessagesChecker { return func(t *testing.T, cm tcpip.ControlMessages) { t.Helper() if !cm.HasTClass { - t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want) - } - if got := cm.TClass; got != want { - t.Fatalf("got cm.TClass = %d, want %d", got, want) + t.Errorf("got cm.HasTClass = %t, want = true", cm.HasTClass) + } else if got := cm.TClass; got != want { + t.Errorf("got cm.TClass = %d, want %d", got, want) } } } @@ -182,10 +182,22 @@ func ReceiveTOS(want uint8) ControlMessagesChecker { return func(t *testing.T, cm tcpip.ControlMessages) { t.Helper() if !cm.HasTOS { - t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want) + t.Errorf("got cm.HasTOS = %t, want = true", cm.HasTOS) + } else if got := cm.TOS; got != want { + t.Errorf("got cm.TOS = %d, want %d", got, want) } - if got := cm.TOS; got != want { - t.Fatalf("got cm.TOS = %d, want %d", got, want) + } +} + +// ReceiveIPPacketInfo creates a checker that checks the PacketInfo field in +// ControlMessages. 
+func ReceiveIPPacketInfo(want tcpip.IPPacketInfo) ControlMessagesChecker { + return func(t *testing.T, cm tcpip.ControlMessages) { + t.Helper() + if !cm.HasIPPacketInfo { + t.Errorf("got cm.HasIPPacketInfo = %t, want = true", cm.HasIPPacketInfo) + } else if diff := cmp.Diff(want, cm.PacketInfo); diff != "" { + t.Errorf("IPPacketInfo mismatch (-want +got):\n%s", diff) } } } @@ -320,6 +332,22 @@ func DstPort(port uint16) TransportChecker { } } +// NoChecksum creates a checker that checks if the checksum is zero. +func NoChecksum(noChecksum bool) TransportChecker { + return func(t *testing.T, h header.Transport) { + t.Helper() + + udp, ok := h.(header.UDP) + if !ok { + return + } + + if b := udp.Checksum() == 0; b != noChecksum { + t.Errorf("bad checksum state, got %t, want %t", b, noChecksum) + } + } +} + // SeqNum creates a checker that checks the sequence number. func SeqNum(seq uint32) TransportChecker { return func(t *testing.T, h header.Transport) { diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD index 0cde694dc..d87797617 100644 --- a/pkg/tcpip/header/BUILD +++ b/pkg/tcpip/header/BUILD @@ -48,7 +48,7 @@ go_test( "//pkg/rand", "//pkg/tcpip", "//pkg/tcpip/buffer", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) @@ -64,6 +64,6 @@ go_test( deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/header/arp.go b/pkg/tcpip/header/arp.go index 718a4720a..83189676e 100644 --- a/pkg/tcpip/header/arp.go +++ b/pkg/tcpip/header/arp.go @@ -14,14 +14,33 @@ package header -import "gvisor.dev/gvisor/pkg/tcpip" +import ( + "encoding/binary" + + "gvisor.dev/gvisor/pkg/tcpip" +) const ( // ARPProtocolNumber is the ARP network protocol number. ARPProtocolNumber tcpip.NetworkProtocolNumber = 0x0806 // ARPSize is the size of an IPv4-over-Ethernet ARP packet. - ARPSize = 2 + 2 + 1 + 1 + 2 + 2*6 + 2*4 + ARPSize = 28 +) + +// ARPHardwareType is the hardware type for LinkEndpoint in an ARP header. +type ARPHardwareType uint16 + +// Typical ARP HardwareType values. Some of the constants have to be specific +// values as they are egressed on the wire in the HTYPE field of an ARP header. +const ( + ARPHardwareNone ARPHardwareType = 0 + // ARPHardwareEther specifically is the HTYPE for Ethernet as specified + // in the IANA list here: + // + // https://www.iana.org/assignments/arp-parameters/arp-parameters.xhtml#arp-parameters-2 + ARPHardwareEther ARPHardwareType = 1 + ARPHardwareLoopback ARPHardwareType = 2 ) // ARPOp is an ARP opcode. @@ -36,54 +55,64 @@ const ( // ARP is an ARP packet stored in a byte array as described in RFC 826. 
type ARP []byte -func (a ARP) hardwareAddressSpace() uint16 { return uint16(a[0])<<8 | uint16(a[1]) } -func (a ARP) protocolAddressSpace() uint16 { return uint16(a[2])<<8 | uint16(a[3]) } -func (a ARP) hardwareAddressSize() int { return int(a[4]) } -func (a ARP) protocolAddressSize() int { return int(a[5]) } +const ( + hTypeOffset = 0 + protocolOffset = 2 + haAddressSizeOffset = 4 + protoAddressSizeOffset = 5 + opCodeOffset = 6 + senderHAAddressOffset = 8 + senderProtocolAddressOffset = senderHAAddressOffset + EthernetAddressSize + targetHAAddressOffset = senderProtocolAddressOffset + IPv4AddressSize + targetProtocolAddressOffset = targetHAAddressOffset + EthernetAddressSize +) + +func (a ARP) hardwareAddressType() ARPHardwareType { + return ARPHardwareType(binary.BigEndian.Uint16(a[hTypeOffset:])) +} + +func (a ARP) protocolAddressSpace() uint16 { return binary.BigEndian.Uint16(a[protocolOffset:]) } +func (a ARP) hardwareAddressSize() int { return int(a[haAddressSizeOffset]) } +func (a ARP) protocolAddressSize() int { return int(a[protoAddressSizeOffset]) } // Op is the ARP opcode. -func (a ARP) Op() ARPOp { return ARPOp(a[6])<<8 | ARPOp(a[7]) } +func (a ARP) Op() ARPOp { return ARPOp(binary.BigEndian.Uint16(a[opCodeOffset:])) } // SetOp sets the ARP opcode. func (a ARP) SetOp(op ARPOp) { - a[6] = uint8(op >> 8) - a[7] = uint8(op) + binary.BigEndian.PutUint16(a[opCodeOffset:], uint16(op)) } // SetIPv4OverEthernet configures the ARP packet for IPv4-over-Ethernet. func (a ARP) SetIPv4OverEthernet() { - a[0], a[1] = 0, 1 // htypeEthernet - a[2], a[3] = 0x08, 0x00 // IPv4ProtocolNumber - a[4] = 6 // macSize - a[5] = uint8(IPv4AddressSize) + binary.BigEndian.PutUint16(a[hTypeOffset:], uint16(ARPHardwareEther)) + binary.BigEndian.PutUint16(a[protocolOffset:], uint16(IPv4ProtocolNumber)) + a[haAddressSizeOffset] = EthernetAddressSize + a[protoAddressSizeOffset] = uint8(IPv4AddressSize) } // HardwareAddressSender is the link address of the sender. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) HardwareAddressSender() []byte { - const s = 8 - return a[s : s+6] + return a[senderHAAddressOffset : senderHAAddressOffset+EthernetAddressSize] } // ProtocolAddressSender is the protocol address of the sender. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) ProtocolAddressSender() []byte { - const s = 8 + 6 - return a[s : s+4] + return a[senderProtocolAddressOffset : senderProtocolAddressOffset+IPv4AddressSize] } // HardwareAddressTarget is the link address of the target. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) HardwareAddressTarget() []byte { - const s = 8 + 6 + 4 - return a[s : s+6] + return a[targetHAAddressOffset : targetHAAddressOffset+EthernetAddressSize] } // ProtocolAddressTarget is the protocol address of the target. // It is a view on to the ARP packet so it can be used to set the value. func (a ARP) ProtocolAddressTarget() []byte { - const s = 8 + 6 + 4 + 6 - return a[s : s+4] + return a[targetProtocolAddressOffset : targetProtocolAddressOffset+IPv4AddressSize] } // IsValid reports whether this is an ARP packet for IPv4 over Ethernet. 
@@ -91,10 +120,8 @@ func (a ARP) IsValid() bool { if len(a) < ARPSize { return false } - const htypeEthernet = 1 - const macSize = 6 - return a.hardwareAddressSpace() == htypeEthernet && + return a.hardwareAddressType() == ARPHardwareEther && a.protocolAddressSpace() == uint16(IPv4ProtocolNumber) && - a.hardwareAddressSize() == macSize && + a.hardwareAddressSize() == EthernetAddressSize && a.protocolAddressSize() == IPv4AddressSize } diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go index b1e92d2d7..eaface8cb 100644 --- a/pkg/tcpip/header/eth.go +++ b/pkg/tcpip/header/eth.go @@ -53,6 +53,10 @@ const ( // (all bits set to 0). unspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00") + // EthernetBroadcastAddress is an ethernet address that addresses every node + // on a local link. + EthernetBroadcastAddress = tcpip.LinkAddress("\xff\xff\xff\xff\xff\xff") + // unicastMulticastFlagMask is the mask of the least significant bit in // the first octet (in network byte order) of an ethernet address that // determines whether the ethernet address is a unicast or multicast. If diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go index 7908c5744..1a631b31a 100644 --- a/pkg/tcpip/header/icmpv4.go +++ b/pkg/tcpip/header/icmpv4.go @@ -72,6 +72,7 @@ const ( // Values for ICMP code as defined in RFC 792. const ( ICMPv4TTLExceeded = 0 + ICMPv4HostUnreachable = 1 ICMPv4PortUnreachable = 3 ICMPv4FragmentationNeeded = 4 ) diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go index c7ee2de57..a13b4b809 100644 --- a/pkg/tcpip/header/icmpv6.go +++ b/pkg/tcpip/header/icmpv6.go @@ -110,9 +110,16 @@ const ( ICMPv6RedirectMsg ICMPv6Type = 137 ) -// Values for ICMP code as defined in RFC 4443. +// Values for ICMP destination unreachable code as defined in RFC 4443 section +// 3.1. const ( - ICMPv6PortUnreachable = 4 + ICMPv6NetworkUnreachable = 0 + ICMPv6Prohibited = 1 + ICMPv6BeyondScope = 2 + ICMPv6AddressUnreachable = 3 + ICMPv6PortUnreachable = 4 + ICMPv6Policy = 5 + ICMPv6RejectRoute = 6 ) // Type is the ICMP type field. diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go index 76839eb92..680eafd16 100644 --- a/pkg/tcpip/header/ipv4.go +++ b/pkg/tcpip/header/ipv4.go @@ -101,6 +101,11 @@ const ( // IPv4Version is the version of the ipv4 protocol. IPv4Version = 4 + // IPv4AllSystems is the all systems IPv4 multicast address as per + // IANA's IPv4 Multicast Address Space Registry. See + // https://www.iana.org/assignments/multicast-addresses/multicast-addresses.xhtml. + IPv4AllSystems tcpip.Address = "\xe0\x00\x00\x01" + // IPv4Broadcast is the broadcast address of the IPv4 procotol. IPv4Broadcast tcpip.Address = "\xff\xff\xff\xff" @@ -159,6 +164,11 @@ func (b IPv4) Flags() uint8 { return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13) } +// More returns whether the more fragments flag is set. +func (b IPv4) More() bool { + return b.Flags()&IPv4FlagMoreFragments != 0 +} + // TTL returns the "TTL" field of the ipv4 header. func (b IPv4) TTL() uint8 { return b[ttl] @@ -305,3 +315,12 @@ func IsV4MulticastAddress(addr tcpip.Address) bool { } return (addr[0] & 0xf0) == 0xe0 } + +// IsV4LoopbackAddress determines if the provided address is an IPv4 loopback +// address (belongs to 127.0.0.1/8 subnet). 
+func IsV4LoopbackAddress(addr tcpip.Address) bool { + if len(addr) != IPv4AddressSize { + return false + } + return addr[0] == 0x7f +} diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go index 4f367fe4c..ea3823898 100644 --- a/pkg/tcpip/header/ipv6.go +++ b/pkg/tcpip/header/ipv6.go @@ -98,6 +98,9 @@ const ( // section 5. IPv6MinimumMTU = 1280 + // IPv6Loopback is the IPv6 Loopback address. + IPv6Loopback tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01" + // IPv6Any is the non-routable IPv6 "any" meta address. It is also // known as the unspecified address. IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go index 2c4591409..3499d8399 100644 --- a/pkg/tcpip/header/ipv6_extension_headers.go +++ b/pkg/tcpip/header/ipv6_extension_headers.go @@ -354,6 +354,13 @@ func (b IPv6FragmentExtHdr) ID() uint32 { return binary.BigEndian.Uint32(b[ipv6FragmentExtHdrIdentificationOffset:]) } +// IsAtomic returns whether the fragment header indicates an atomic fragment. An +// atomic fragment is a fragment that contains all the data required to +// reassemble a full packet. +func (b IPv6FragmentExtHdr) IsAtomic() bool { + return !b.More() && b.FragmentOffset() == 0 +} + // IPv6PayloadIterator is an iterator over the contents of an IPv6 payload. // // The IPv6 payload may contain IPv6 extension headers before any upper layer diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD index b8b93e78e..39ca774ef 100644 --- a/pkg/tcpip/link/channel/BUILD +++ b/pkg/tcpip/link/channel/BUILD @@ -10,6 +10,7 @@ go_library( "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go index 9bf67686d..c95aef63c 100644 --- a/pkg/tcpip/link/channel/channel.go +++ b/pkg/tcpip/link/channel/channel.go @@ -23,6 +23,7 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -181,13 +182,13 @@ func (e *Endpoint) NumQueued() int { } // InjectInbound injects an inbound packet. -func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { +func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.InjectLinkAddr(protocol, "", pkt) } // InjectLinkAddr injects an inbound packet with a remote link address. -func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt stack.PacketBuffer) { - e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt) +func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *stack.PacketBuffer) { + e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt) } // Attach saves the stack network-layer dispatcher for use later when packets @@ -229,13 +230,13 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress { } // WritePacket stores outbound packets into the channel. 
-func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { // Clone r then release its resource so we only get the relevant fields from // stack.Route without holding a reference to a NIC's endpoint. route := r.Clone() route.Release() p := PacketInfo{ - Pkt: &pkt, + Pkt: pkt, Proto: protocol, GSO: gso, Route: route, @@ -273,7 +274,9 @@ func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { p := PacketInfo{ - Pkt: &stack.PacketBuffer{Data: vv}, + Pkt: stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: vv, + }), Proto: 0, GSO: nil, } @@ -296,3 +299,12 @@ func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle { func (e *Endpoint) RemoveNotify(handle *NotificationHandle) { e.q.RemoveNotify(handle) } + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*Endpoint) ARPHardwareType() header.ARPHardwareType { + return header.ARPHardwareNone +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +} diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD index aa6db9aea..10072eac1 100644 --- a/pkg/tcpip/link/fdbased/BUILD +++ b/pkg/tcpip/link/fdbased/BUILD @@ -15,6 +15,7 @@ go_library( visibility = ["//visibility:public"], deps = [ "//pkg/binary", + "//pkg/iovec", "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", @@ -36,5 +37,6 @@ go_test( "//pkg/tcpip/header", "//pkg/tcpip/link/rawfile", "//pkg/tcpip/stack", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go index affa1bbdf..975309fc8 100644 --- a/pkg/tcpip/link/fdbased/endpoint.go +++ b/pkg/tcpip/link/fdbased/endpoint.go @@ -45,6 +45,7 @@ import ( "golang.org/x/sys/unix" "gvisor.dev/gvisor/pkg/binary" + "gvisor.dev/gvisor/pkg/iovec" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -385,32 +386,40 @@ const ( _VIRTIO_NET_HDR_GSO_TCPV6 = 4 ) -// WritePacket writes outbound packets to the file descriptor. If it is not -// currently writable, the packet is dropped. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { if e.hdrSize > 0 { // Add ethernet header if needed. - eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize)) - pkt.LinkHeader = buffer.View(eth) + eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) ethHdr := &header.EthernetFields{ - DstAddr: r.RemoteLinkAddress, + DstAddr: remote, Type: protocol, } // Preserve the src address if it's set in the route. - if r.LocalLinkAddress != "" { - ethHdr.SrcAddr = r.LocalLinkAddress + if local != "" { + ethHdr.SrcAddr = local } else { ethHdr.SrcAddr = e.addr } eth.Encode(ethHdr) } +} + +// WritePacket writes outbound packets to the file descriptor. 
If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + if e.hdrSize > 0 { + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) + } + + var builder iovec.Builder fd := e.fds[pkt.Hash%uint32(len(e.fds))] if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { vnetHdr := virtioNetHdr{} if gso != nil { - vnetHdr.hdrLen = uint16(pkt.Header.UsedLength()) + vnetHdr.hdrLen = uint16(pkt.HeaderSize()) if gso.NeedsCsum { vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen @@ -430,49 +439,28 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne } vnetHdrBuf := binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr) - return rawfile.NonBlockingWrite3(fd, vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView()) + builder.Add(vnetHdrBuf) } - if pkt.Data.Size() == 0 { - return rawfile.NonBlockingWrite(fd, pkt.Header.View()) + for _, v := range pkt.Views() { + builder.Add(v) } - if pkt.Header.UsedLength() == 0 { - return rawfile.NonBlockingWrite(fd, pkt.Data.ToView()) - } - - return rawfile.NonBlockingWrite3(fd, pkt.Header.View(), pkt.Data.ToView(), nil) + return rawfile.NonBlockingWriteIovec(fd, builder.Build()) } func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tcpip.Error) { // Send a batch of packets through batchFD. mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch)) for _, pkt := range batch { - var ethHdrBuf []byte - iovLen := 0 if e.hdrSize > 0 { - // Add ethernet header if needed. - ethHdrBuf = make([]byte, header.EthernetMinimumSize) - eth := header.Ethernet(ethHdrBuf) - ethHdr := &header.EthernetFields{ - DstAddr: pkt.EgressRoute.RemoteLinkAddress, - Type: pkt.NetworkProtocolNumber, - } - - // Preserve the src address if it's set in the route. 
- if pkt.EgressRoute.LocalLinkAddress != "" { - ethHdr.SrcAddr = pkt.EgressRoute.LocalLinkAddress - } else { - ethHdr.SrcAddr = e.addr - } - eth.Encode(ethHdr) - iovLen++ + e.AddHeader(pkt.EgressRoute.LocalLinkAddress, pkt.EgressRoute.RemoteLinkAddress, pkt.NetworkProtocolNumber, pkt) } - vnetHdr := virtioNetHdr{} var vnetHdrBuf []byte if e.Capabilities()&stack.CapabilityHardwareGSO != 0 { + vnetHdr := virtioNetHdr{} if pkt.GSOOptions != nil { - vnetHdr.hdrLen = uint16(pkt.Header.UsedLength()) + vnetHdr.hdrLen = uint16(pkt.HeaderSize()) if pkt.GSOOptions.NeedsCsum { vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen @@ -491,45 +479,18 @@ func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tc } } vnetHdrBuf = binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr) - iovLen++ } - iovecs := make([]syscall.Iovec, iovLen+1+len(pkt.Data.Views())) + var builder iovec.Builder + builder.Add(vnetHdrBuf) + for _, v := range pkt.Views() { + builder.Add(v) + } + iovecs := builder.Build() + var mmsgHdr rawfile.MMsgHdr mmsgHdr.Msg.Iov = &iovecs[0] - iovecIdx := 0 - if vnetHdrBuf != nil { - v := &iovecs[iovecIdx] - v.Base = &vnetHdrBuf[0] - v.Len = uint64(len(vnetHdrBuf)) - iovecIdx++ - } - if ethHdrBuf != nil { - v := &iovecs[iovecIdx] - v.Base = &ethHdrBuf[0] - v.Len = uint64(len(ethHdrBuf)) - iovecIdx++ - } - pktSize := uint64(0) - // Encode L3 Header - v := &iovecs[iovecIdx] - hdr := &pkt.Header - hdrView := hdr.View() - v.Base = &hdrView[0] - v.Len = uint64(len(hdrView)) - pktSize += v.Len - iovecIdx++ - - // Now encode the Transport Payload. - pktViews := pkt.Data.Views() - for i := range pktViews { - vec := &iovecs[iovecIdx] - iovecIdx++ - vec.Base = &pktViews[i][0] - vec.Len = uint64(len(pktViews[i])) - pktSize += vec.Len - } - mmsgHdr.Msg.Iovlen = uint64(iovecIdx) + mmsgHdr.Msg.Iovlen = uint64(len(iovecs)) mmsgHdrs = append(mmsgHdrs, mmsgHdr) } @@ -626,6 +587,14 @@ func (e *endpoint) GSOMaxSize() uint32 { return e.gsoMaxSize } +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (e *endpoint) ARPHardwareType() header.ARPHardwareType { + if e.hdrSize > 0 { + return header.ARPHardwareEther + } + return header.ARPHardwareNone +} + // InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes // to the FD, but does not read from it. All reads come from injected packets. type InjectableEndpoint struct { @@ -641,8 +610,8 @@ func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) { } // InjectInbound injects an inbound packet. -func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { - e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, pkt) +func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt) } // NewInjectable creates a new fd-based InjectableEndpoint.
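[Editorial aside on the fdbased change above: the endpoint now gathers a packet's views into a writev-style iovec via the new pkg/iovec Builder instead of the removed NonBlockingWrite3 helper. A minimal sketch of that flow, using only calls visible in this diff (iovec.Builder.Add/Build, stack.PacketBuffer.Views, rawfile.NonBlockingWriteIovec); the package name and writeViews wrapper are hypothetical, not part of the change.]

package sketch

import (
	"gvisor.dev/gvisor/pkg/iovec"
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// writeViews mirrors the new WritePacket path: every view of the packet
// (prepended headers plus payload) is added to an iovec.Builder and then
// written to fd with a single writev call.
func writeViews(fd int, pkt *stack.PacketBuffer) *tcpip.Error {
	var builder iovec.Builder
	for _, v := range pkt.Views() {
		builder.Add(v)
	}
	return rawfile.NonBlockingWriteIovec(fd, builder.Build())
}
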
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go index 3bfb15a8e..709f829c8 100644 --- a/pkg/tcpip/link/fdbased/endpoint_test.go +++ b/pkg/tcpip/link/fdbased/endpoint_test.go @@ -26,6 +26,7 @@ import ( "time" "unsafe" + "github.com/google/go-cmp/cmp" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" @@ -43,9 +44,36 @@ const ( ) type packetInfo struct { - raddr tcpip.LinkAddress - proto tcpip.NetworkProtocolNumber - contents stack.PacketBuffer + Raddr tcpip.LinkAddress + Proto tcpip.NetworkProtocolNumber + Contents *stack.PacketBuffer +} + +type packetContents struct { + LinkHeader buffer.View + NetworkHeader buffer.View + TransportHeader buffer.View + Data buffer.View +} + +func checkPacketInfoEqual(t *testing.T, got, want packetInfo) { + t.Helper() + if diff := cmp.Diff( + want, got, + cmp.Transformer("ExtractPacketBuffer", func(pk *stack.PacketBuffer) *packetContents { + if pk == nil { + return nil + } + return &packetContents{ + LinkHeader: pk.LinkHeader().View(), + NetworkHeader: pk.NetworkHeader().View(), + TransportHeader: pk.TransportHeader().View(), + Data: pk.Data.ToView(), + } + }), + ); diff != "" { + t.Errorf("unexpected packetInfo (-want +got):\n%s", diff) + } } type context struct { @@ -103,10 +131,14 @@ func (c *context) cleanup() { } } -func (c *context) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { +func (c *context) DeliverNetworkPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { c.ch <- packetInfo{remote, protocol, pkt} } +func (c *context) DeliverOutboundPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func TestNoEthernetProperties(t *testing.T) { c := newContext(t, &Options{MTU: mtu}) defer c.cleanup() @@ -155,19 +187,28 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash u RemoteLinkAddress: raddr, } - // Build header. - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()) + 100) - b := hdr.Prepend(100) - for i := range b { - b[i] = uint8(rand.Intn(256)) + // Build payload. + payload := buffer.NewView(plen) + if _, err := rand.Read(payload); err != nil { + t.Fatalf("rand.Read(payload): %s", err) } - // Build payload and write. - payload := make(buffer.View, plen) - for i := range payload { - payload[i] = uint8(rand.Intn(256)) + // Build packet buffer. + const netHdrLen = 100 + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()) + netHdrLen, + Data: payload.ToVectorisedView(), + }) + pkt.Hash = hash + + // Build header. + b := pkt.NetworkHeader().Push(netHdrLen) + if _, err := rand.Read(b); err != nil { + t.Fatalf("rand.Read(b): %s", err) } - want := append(hdr.View(), payload...) + + // Write. + want := append(append(buffer.View(nil), b...), payload...) 
var gso *stack.GSO if gsoMaxSize != 0 { gso = &stack.GSO{ @@ -179,11 +220,7 @@ func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash u L3HdrLen: header.IPv4MaximumHeaderSize, } } - if err := c.ep.WritePacket(r, gso, proto, stack.PacketBuffer{ - Header: hdr, - Data: payload.ToVectorisedView(), - Hash: hash, - }); err != nil { + if err := c.ep.WritePacket(r, gso, proto, pkt); err != nil { t.Fatalf("WritePacket failed: %v", err) } @@ -292,13 +329,14 @@ func TestPreserveSrcAddress(t *testing.T) { LocalLinkAddress: baddr, } - // WritePacket panics given a prependable with anything less than - // the minimum size of the ethernet header. - hdr := buffer.NewPrependable(header.EthernetMinimumSize) - if err := c.ep.WritePacket(r, nil /* gso */, proto, stack.PacketBuffer{ - Header: hdr, - Data: buffer.VectorisedView{}, - }); err != nil { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + // WritePacket panics given a prependable with anything less than + // the minimum size of the ethernet header. + // TODO(b/153685824): Figure out if this should use c.ep.MaxHeaderLength(). + ReserveHeaderBytes: header.EthernetMinimumSize, + Data: buffer.VectorisedView{}, + }) + if err := c.ep.WritePacket(r, nil /* gso */, proto, pkt); err != nil { t.Fatalf("WritePacket failed: %v", err) } @@ -327,24 +365,25 @@ func TestDeliverPacket(t *testing.T) { defer c.cleanup() // Build packet. - b := make([]byte, plen) - all := b - for i := range b { - b[i] = uint8(rand.Intn(256)) + all := make([]byte, plen) + if _, err := rand.Read(all); err != nil { + t.Fatalf("rand.Read(all): %s", err) } - - var hdr header.Ethernet - if !eth { - // So that it looks like an IPv4 packet. - b[0] = 0x40 - } else { - hdr = make(header.Ethernet, header.EthernetMinimumSize) + // Make it look like an IPv4 packet. + all[0] = 0x40 + + wantPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.EthernetMinimumSize, + Data: buffer.NewViewFromBytes(all).ToVectorisedView(), + }) + if eth { + hdr := header.Ethernet(wantPkt.LinkHeader().Push(header.EthernetMinimumSize)) hdr.Encode(&header.EthernetFields{ SrcAddr: raddr, DstAddr: laddr, Type: proto, }) - all = append(hdr, b...) + all = append(hdr, all...) } // Write packet via the file descriptor. @@ -356,24 +395,15 @@ func TestDeliverPacket(t *testing.T) { select { case pi := <-c.ch: want := packetInfo{ - raddr: raddr, - proto: proto, - contents: stack.PacketBuffer{ - Data: buffer.View(b).ToVectorisedView(), - LinkHeader: buffer.View(hdr), - }, + Raddr: raddr, + Proto: proto, + Contents: wantPkt, } if !eth { - want.proto = header.IPv4ProtocolNumber - want.raddr = "" - } - // want.contents.Data will be a single - // view, so make pi do the same for the - // DeepEqual check. - pi.contents.Data = pi.contents.Data.ToView().ToVectorisedView() - if !reflect.DeepEqual(want, pi) { - t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want) + want.Proto = header.IPv4ProtocolNumber + want.Raddr = "" } + checkPacketInfoEqual(t, pi, want) case <-time.After(10 * time.Second): t.Fatalf("Timed out waiting for packet") } @@ -500,3 +530,80 @@ func TestRecvMMsgDispatcherCapLength(t *testing.T) { } } + +// fakeNetworkDispatcher delivers packets to pkts. 
+type fakeNetworkDispatcher struct { + pkts []*stack.PacketBuffer +} + +func (d *fakeNetworkDispatcher) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + d.pkts = append(d.pkts, pkt) +} + +func (d *fakeNetworkDispatcher) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + +func TestDispatchPacketFormat(t *testing.T) { + for _, test := range []struct { + name string + newDispatcher func(fd int, e *endpoint) (linkDispatcher, error) + }{ + { + name: "readVDispatcher", + newDispatcher: newReadVDispatcher, + }, + { + name: "recvMMsgDispatcher", + newDispatcher: newRecvMMsgDispatcher, + }, + } { + t.Run(test.name, func(t *testing.T) { + // Create a socket pair to send/recv. + fds, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0) + if err != nil { + t.Fatal(err) + } + defer syscall.Close(fds[0]) + defer syscall.Close(fds[1]) + + data := []byte{ + // Ethernet header. + 1, 2, 3, 4, 5, 60, + 1, 2, 3, 4, 5, 61, + 8, 0, + // Mock network header. + 40, 41, 42, 43, + } + err = syscall.Sendmsg(fds[1], data, nil, nil, 0) + if err != nil { + t.Fatal(err) + } + + // Create and run dispatcher once. + sink := &fakeNetworkDispatcher{} + d, err := test.newDispatcher(fds[0], &endpoint{ + hdrSize: header.EthernetMinimumSize, + dispatcher: sink, + }) + if err != nil { + t.Fatal(err) + } + if ok, err := d.dispatch(); !ok || err != nil { + t.Fatalf("d.dispatch() = %v, %v", ok, err) + } + + // Verify packet. + if got, want := len(sink.pkts), 1; got != want { + t.Fatalf("len(sink.pkts) = %d, want %d", got, want) + } + pkt := sink.pkts[0] + if got, want := pkt.LinkHeader().View().Size(), header.EthernetMinimumSize; got != want { + t.Errorf("pkt.LinkHeader().View().Size() = %d, want %d", got, want) + } + if got, want := pkt.Data.Size(), 4; got != want { + t.Errorf("pkt.Data.Size() = %d, want %d", got, want) + } + }) + } +} diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go index fe2bf3b0b..c475dda20 100644 --- a/pkg/tcpip/link/fdbased/mmap.go +++ b/pkg/tcpip/link/fdbased/mmap.go @@ -18,6 +18,7 @@ package fdbased import ( "encoding/binary" + "fmt" "syscall" "golang.org/x/sys/unix" @@ -170,10 +171,9 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) { var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress - eth header.Ethernet ) if d.e.hdrSize > 0 { - eth = header.Ethernet(pkt) + eth := header.Ethernet(pkt) p = eth.Type() remote = eth.SourceAddress() local = eth.DestinationAddress() @@ -190,10 +190,14 @@ func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) { } } - pkt = pkt[d.e.hdrSize:] - d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, stack.PacketBuffer{ - Data: buffer.View(pkt).ToVectorisedView(), - LinkHeader: buffer.View(eth), + pbuf := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.View(pkt).ToVectorisedView(), }) + if d.e.hdrSize > 0 { + if _, ok := pbuf.LinkHeader().Consume(d.e.hdrSize); !ok { + panic(fmt.Sprintf("LinkHeader().Consume(%d) must succeed", d.e.hdrSize)) + } + } + d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pbuf) return true, nil } diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go index cb4cbea69..8c3ca86d6 100644 --- a/pkg/tcpip/link/fdbased/packet_dispatchers.go +++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go @@ -103,7 +103,7 @@ func (d *readVDispatcher) 
dispatch() (bool, *tcpip.Error) { d.allocateViews(BufConfig) n, err := rawfile.BlockingReadv(d.fd, d.iovecs) - if err != nil { + if n == 0 || err != nil { return false, err } if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { @@ -111,17 +111,22 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { // isn't used and it isn't in a view. n -= virtioNetHdrSize } - if n <= d.e.hdrSize { - return false, nil - } + + used := d.capViews(n, BufConfig) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)), + }) var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress - eth header.Ethernet ) if d.e.hdrSize > 0 { - eth = header.Ethernet(d.views[0][:header.EthernetMinimumSize]) + hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize) + if !ok { + return false, nil + } + eth := header.Ethernet(hdr) p = eth.Type() remote = eth.SourceAddress() local = eth.DestinationAddress() @@ -138,14 +143,7 @@ func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) { } } - used := d.capViews(n, BufConfig) - pkt := stack.PacketBuffer{ - Data: buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)), - LinkHeader: buffer.View(eth), - } - pkt.Data.TrimFront(d.e.hdrSize) - - d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt) + d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt) // Prepare e.views for another packet: release used views. for i := 0; i < used; i++ { @@ -169,7 +167,7 @@ type recvMMsgDispatcher struct { // iovecs is an array of array of iovec records where each iovec base // pointer and length are initialzed to the corresponding view above, - // except when GSO is neabled then the first iovec in each array of + // except when GSO is enabled then the first iovec in each array of // iovecs points to a buffer for the vnet header which is stripped // before the views are passed up the stack for further processing. iovecs [][]syscall.Iovec @@ -268,17 +266,22 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 { n -= virtioNetHdrSize } - if n <= d.e.hdrSize { - return false, nil - } + + used := d.capViews(k, int(n), BufConfig) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)), + }) var ( p tcpip.NetworkProtocolNumber remote, local tcpip.LinkAddress - eth header.Ethernet ) if d.e.hdrSize > 0 { - eth = header.Ethernet(d.views[k][0]) + hdr, ok := pkt.LinkHeader().Consume(d.e.hdrSize) + if !ok { + return false, nil + } + eth := header.Ethernet(hdr) p = eth.Type() remote = eth.SourceAddress() local = eth.DestinationAddress() @@ -295,13 +298,7 @@ func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) { } } - used := d.capViews(k, int(n), BufConfig) - pkt := stack.PacketBuffer{ - Data: buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)), - LinkHeader: buffer.View(eth), - } - pkt.Data.TrimFront(d.e.hdrSize) - d.e.dispatcher.DeliverNetworkPacket(d.e, remote, local, p, pkt) + d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt) // Prepare e.views for another packet: release used views. 
for i := 0; i < used; i++ { diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go index 073c84ef9..38aa694e4 100644 --- a/pkg/tcpip/link/loopback/loopback.go +++ b/pkg/tcpip/link/loopback/loopback.go @@ -76,17 +76,17 @@ func (*endpoint) Wait() {} // WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound // packets to the network-layer dispatcher. -func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { - views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) - views[0] = pkt.Header.View() - views = append(views, pkt.Data.Views()...) +func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + // Construct data as the unparsed portion for the loopback packet. + data := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) // Because we're immediately turning around and writing the packet back // to the rx path, we intentionally don't preserve the remote and local // link addresses from the stack.Route we're passed. - e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, protocol, stack.PacketBuffer{ - Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views), + newPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: data, }) + e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, newPkt) return nil } @@ -98,18 +98,25 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: vv, + }) // There should be an ethernet header at the beginning of vv. - hdr, ok := vv.PullUp(header.EthernetMinimumSize) + hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) if !ok { // Reject the packet if it's shorter than an ethernet header. return tcpip.ErrBadAddress } linkHeader := header.Ethernet(hdr) - vv.TrimFront(len(linkHeader)) - e.dispatcher.DeliverNetworkPacket(e, "" /* remote */, "" /* local */, linkHeader.Type(), stack.PacketBuffer{ - Data: vv, - LinkHeader: buffer.View(linkHeader), - }) + e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), pkt) return nil } + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*endpoint) ARPHardwareType() header.ARPHardwareType { + return header.ARPHardwareLoopback +} + +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +} diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD index 82b441b79..e7493e5c5 100644 --- a/pkg/tcpip/link/muxed/BUILD +++ b/pkg/tcpip/link/muxed/BUILD @@ -9,6 +9,7 @@ go_library( deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go index a5478ce17..56a611825 100644 --- a/pkg/tcpip/link/muxed/injectable.go +++ b/pkg/tcpip/link/muxed/injectable.go @@ -18,6 +18,7 @@ package muxed import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -80,8 +81,8 @@ func (m *InjectableEndpoint) IsAttached() bool { } // InjectInbound implements stack.InjectableLinkEndpoint. 
-func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { - m.dispatcher.DeliverNetworkPacket(m, "" /* remote */, "" /* local */, protocol, pkt) +func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + m.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt) } // WritePackets writes outbound packets to the appropriate @@ -98,7 +99,7 @@ func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts s // WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint // based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a // route registered in this endpoint. -func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { if endpoint, ok := m.routes[r.RemoteAddress]; ok { return endpoint.WritePacket(r, gso, protocol, pkt) } @@ -129,6 +130,15 @@ func (m *InjectableEndpoint) Wait() { } } +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*InjectableEndpoint) ARPHardwareType() header.ARPHardwareType { + panic("unsupported operation") +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (*InjectableEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { +} + // NewInjectableEndpoint creates a new multi-endpoint injectable endpoint. func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) *InjectableEndpoint { return &InjectableEndpoint{ diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go index 87c734c1f..3e4afcdad 100644 --- a/pkg/tcpip/link/muxed/injectable_test.go +++ b/pkg/tcpip/link/muxed/injectable_test.go @@ -46,14 +46,14 @@ func TestInjectableEndpointRawDispatch(t *testing.T) { func TestInjectableEndpointDispatch(t *testing.T) { endpoint, sock, dstIP := makeTestInjectableEndpoint(t) - hdr := buffer.NewPrependable(1) - hdr.Prepend(1)[0] = 0xFA + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: 1, + Data: buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(), + }) + pkt.TransportHeader().Push(1)[0] = 0xFA packetRoute := stack.Route{RemoteAddress: dstIP} - endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(), - }) + endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, pkt) buf := make([]byte, 6500) bytesRead, err := sock.Read(buf) @@ -67,13 +67,14 @@ func TestInjectableEndpointDispatch(t *testing.T) { func TestInjectableEndpointDispatchHdrOnly(t *testing.T) { endpoint, sock, dstIP := makeTestInjectableEndpoint(t) - hdr := buffer.NewPrependable(1) - hdr.Prepend(1)[0] = 0xFA - packetRoute := stack.Route{RemoteAddress: dstIP} - endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buffer.NewView(0).ToVectorisedView(), + + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: 1, + Data: buffer.NewView(0).ToVectorisedView(), }) + pkt.TransportHeader().Push(1)[0] = 0xFA + packetRoute := stack.Route{RemoteAddress: dstIP} + endpoint.WritePacket(&packetRoute, nil /* gso */, 
ipv4.ProtocolNumber, pkt) buf := make([]byte, 6500) bytesRead, err := sock.Read(buf) if err != nil { diff --git a/pkg/tcpip/link/nested/BUILD b/pkg/tcpip/link/nested/BUILD new file mode 100644 index 000000000..2cdb23475 --- /dev/null +++ b/pkg/tcpip/link/nested/BUILD @@ -0,0 +1,32 @@ +load("//tools:defs.bzl", "go_library", "go_test") + +package(licenses = ["notice"]) + +go_library( + name = "nested", + srcs = [ + "nested.go", + ], + visibility = ["//visibility:public"], + deps = [ + "//pkg/sync", + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/stack", + ], +) + +go_test( + name = "nested_test", + size = "small", + srcs = [ + "nested_test.go", + ], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/header", + "//pkg/tcpip/link/nested", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/link/nested/nested.go b/pkg/tcpip/link/nested/nested.go new file mode 100644 index 000000000..d40de54df --- /dev/null +++ b/pkg/tcpip/link/nested/nested.go @@ -0,0 +1,152 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package nested provides helpers to implement the pattern of nested +// stack.LinkEndpoints. +package nested + +import ( + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +// Endpoint is a wrapper around stack.LinkEndpoint and stack.NetworkDispatcher +// that can be used to implement nesting safely by providing lifecycle +// concurrency guards. +// +// See the tests in this package for example usage. +type Endpoint struct { + child stack.LinkEndpoint + embedder stack.NetworkDispatcher + + // mu protects dispatcher. + mu sync.RWMutex + dispatcher stack.NetworkDispatcher +} + +var _ stack.GSOEndpoint = (*Endpoint)(nil) +var _ stack.LinkEndpoint = (*Endpoint)(nil) +var _ stack.NetworkDispatcher = (*Endpoint)(nil) + +// Init initializes a nested.Endpoint that uses embedder as the dispatcher for +// child on Attach. +// +// See the tests in this package for example usage. +func (e *Endpoint) Init(child stack.LinkEndpoint, embedder stack.NetworkDispatcher) { + e.child = child + e.embedder = embedder +} + +// DeliverNetworkPacket implements stack.NetworkDispatcher. +func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.mu.RLock() + d := e.dispatcher + e.mu.RUnlock() + if d != nil { + d.DeliverNetworkPacket(remote, local, protocol, pkt) + } +} + +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. +func (e *Endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.mu.RLock() + d := e.dispatcher + e.mu.RUnlock() + if d != nil { + d.DeliverOutboundPacket(remote, local, protocol, pkt) + } +} + +// Attach implements stack.LinkEndpoint. 
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) { + e.mu.Lock() + e.dispatcher = dispatcher + e.mu.Unlock() + // If we're attaching to a valid dispatcher, pass embedder as the dispatcher + // to our child, otherwise detach the child by giving it a nil dispatcher. + var pass stack.NetworkDispatcher + if dispatcher != nil { + pass = e.embedder + } + e.child.Attach(pass) +} + +// IsAttached implements stack.LinkEndpoint. +func (e *Endpoint) IsAttached() bool { + e.mu.RLock() + isAttached := e.dispatcher != nil + e.mu.RUnlock() + return isAttached +} + +// MTU implements stack.LinkEndpoint. +func (e *Endpoint) MTU() uint32 { + return e.child.MTU() +} + +// Capabilities implements stack.LinkEndpoint. +func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities { + return e.child.Capabilities() +} + +// MaxHeaderLength implements stack.LinkEndpoint. +func (e *Endpoint) MaxHeaderLength() uint16 { + return e.child.MaxHeaderLength() +} + +// LinkAddress implements stack.LinkEndpoint. +func (e *Endpoint) LinkAddress() tcpip.LinkAddress { + return e.child.LinkAddress() +} + +// WritePacket implements stack.LinkEndpoint. +func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + return e.child.WritePacket(r, gso, protocol, pkt) +} + +// WritePackets implements stack.LinkEndpoint. +func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + return e.child.WritePackets(r, gso, pkts, protocol) +} + +// WriteRawPacket implements stack.LinkEndpoint. +func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { + return e.child.WriteRawPacket(vv) +} + +// Wait implements stack.LinkEndpoint. +func (e *Endpoint) Wait() { + e.child.Wait() +} + +// GSOMaxSize implements stack.GSOEndpoint. +func (e *Endpoint) GSOMaxSize() uint32 { + if e, ok := e.child.(stack.GSOEndpoint); ok { + return e.GSOMaxSize() + } + return 0 +} + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType +func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { + return e.child.ARPHardwareType() +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.child.AddHeader(local, remote, protocol, pkt) +} diff --git a/pkg/tcpip/link/nested/nested_test.go b/pkg/tcpip/link/nested/nested_test.go new file mode 100644 index 000000000..c1f9d308c --- /dev/null +++ b/pkg/tcpip/link/nested/nested_test.go @@ -0,0 +1,109 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package nested_test + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/nested" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +type parentEndpoint struct { + nested.Endpoint +} + +var _ stack.LinkEndpoint = (*parentEndpoint)(nil) +var _ stack.NetworkDispatcher = (*parentEndpoint)(nil) + +type childEndpoint struct { + stack.LinkEndpoint + dispatcher stack.NetworkDispatcher +} + +var _ stack.LinkEndpoint = (*childEndpoint)(nil) + +func (c *childEndpoint) Attach(dispatcher stack.NetworkDispatcher) { + c.dispatcher = dispatcher +} + +func (c *childEndpoint) IsAttached() bool { + return c.dispatcher != nil +} + +type counterDispatcher struct { + count int +} + +var _ stack.NetworkDispatcher = (*counterDispatcher)(nil) + +func (d *counterDispatcher) DeliverNetworkPacket(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) { + d.count++ +} + +func (d *counterDispatcher) DeliverOutboundPacket(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) { + panic("unimplemented") +} + +func TestNestedLinkEndpoint(t *testing.T) { + const emptyAddress = tcpip.LinkAddress("") + + var ( + childEP childEndpoint + nestedEP parentEndpoint + disp counterDispatcher + ) + nestedEP.Endpoint.Init(&childEP, &nestedEP) + + if childEP.IsAttached() { + t.Error("On init, childEP.IsAttached() = true, want = false") + } + if nestedEP.IsAttached() { + t.Error("On init, nestedEP.IsAttached() = true, want = false") + } + + nestedEP.Attach(&disp) + if disp.count != 0 { + t.Fatalf("After attach, got disp.count = %d, want = 0", disp.count) + } + if !childEP.IsAttached() { + t.Error("After attach, childEP.IsAttached() = false, want = true") + } + if !nestedEP.IsAttached() { + t.Error("After attach, nestedEP.IsAttached() = false, want = true") + } + + nestedEP.DeliverNetworkPacket(emptyAddress, emptyAddress, header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{})) + if disp.count != 1 { + t.Errorf("After first packet with dispatcher attached, got disp.count = %d, want = 1", disp.count) + } + + nestedEP.Attach(nil) + if childEP.IsAttached() { + t.Error("After detach, childEP.IsAttached() = true, want = false") + } + if nestedEP.IsAttached() { + t.Error("After detach, nestedEP.IsAttached() = true, want = false") + } + + disp.count = 0 + nestedEP.DeliverNetworkPacket(emptyAddress, emptyAddress, header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{})) + if disp.count != 0 { + t.Errorf("After second packet with dispatcher detached, got disp.count = %d, want = 0", disp.count) + } + +} diff --git a/pkg/tcpip/link/packetsocket/BUILD b/pkg/tcpip/link/packetsocket/BUILD new file mode 100644 index 000000000..6fff160ce --- /dev/null +++ b/pkg/tcpip/link/packetsocket/BUILD @@ -0,0 +1,14 @@ +load("//tools:defs.bzl", "go_library") + +package(licenses = ["notice"]) + +go_library( + name = "packetsocket", + srcs = ["endpoint.go"], + visibility = ["//visibility:public"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/link/nested", + "//pkg/tcpip/stack", + ], +) diff --git a/pkg/tcpip/link/packetsocket/endpoint.go b/pkg/tcpip/link/packetsocket/endpoint.go new file mode 100644 index 000000000..3922c2a04 --- /dev/null +++ b/pkg/tcpip/link/packetsocket/endpoint.go @@ -0,0 +1,50 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package packetsocket provides a link layer endpoint that provides the ability +// to loop outbound packets to any AF_PACKET sockets that may be interested in +// the outgoing packet. +package packetsocket + +import ( + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/nested" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +type endpoint struct { + nested.Endpoint +} + +// New creates a new packetsocket LinkEndpoint. +func New(lower stack.LinkEndpoint) stack.LinkEndpoint { + e := &endpoint{} + e.Endpoint.Init(lower, e) + return e +} + +// WritePacket implements stack.LinkEndpoint.WritePacket. +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + e.Endpoint.DeliverOutboundPacket(r.RemoteLinkAddress, r.LocalLinkAddress, protocol, pkt) + return e.Endpoint.WritePacket(r, gso, protocol, pkt) +} + +// WritePackets implements stack.LinkEndpoint.WritePackets. +func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, proto tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { + e.Endpoint.DeliverOutboundPacket(pkt.EgressRoute.RemoteLinkAddress, pkt.EgressRoute.LocalLinkAddress, pkt.NetworkProtocolNumber, pkt) + } + + return e.Endpoint.WritePackets(r, gso, pkts, proto) +} diff --git a/pkg/tcpip/link/qdisc/fifo/BUILD b/pkg/tcpip/link/qdisc/fifo/BUILD index 054c213bc..1d0079bd6 100644 --- a/pkg/tcpip/link/qdisc/fifo/BUILD +++ b/pkg/tcpip/link/qdisc/fifo/BUILD @@ -14,6 +14,7 @@ go_library( "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go index 54432194d..fc1e34fc7 100644 --- a/pkg/tcpip/link/qdisc/fifo/endpoint.go +++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -102,8 +103,13 @@ func (q *queueDispatcher) dispatchLoop() { } // DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket. -func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { - e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt) +func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt) +} + +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. +func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.dispatcher.DeliverOutboundPacket(remote, local, protocol, pkt) } // Attach implements stack.LinkEndpoint.Attach. 
@@ -146,7 +152,7 @@ func (e *endpoint) GSOMaxSize() uint32 { } // WritePacket implements stack.LinkEndpoint.WritePacket. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { // WritePacket caller's do not set the following fields in PacketBuffer // so we populate them here. newRoute := r.Clone() @@ -154,7 +160,7 @@ func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.Ne pkt.GSOOptions = gso pkt.NetworkProtocolNumber = protocol d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)] - if !d.q.enqueue(&pkt) { + if !d.q.enqueue(pkt) { return tcpip.ErrNoBufferSpace } d.newPacketWaker.Assert() @@ -193,6 +199,8 @@ func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketB // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { + // TODO(gvisor.dev/issue/3267): Queue these packets as well once + // WriteRawPacket takes PacketBuffer instead of VectorisedView. return e.lower.WriteRawPacket(vv) } @@ -207,3 +215,13 @@ func (e *endpoint) Wait() { e.wg.Wait() } + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType +func (e *endpoint) ARPHardwareType() header.ARPHardwareType { + return e.lower.ARPHardwareType() +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.lower.AddHeader(local, remote, protocol, pkt) +} diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go index 0b5a6cf49..99313ee25 100644 --- a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go +++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go @@ -14,7 +14,7 @@ // +build linux,amd64 linux,arm64 // +build go1.12 -// +build !go1.15 +// +build !go1.16 // Check go:linkname function signatures when updating Go version. diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go index 44e25d475..f4c32c2da 100644 --- a/pkg/tcpip/link/rawfile/rawfile_unsafe.go +++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go @@ -66,39 +66,14 @@ func NonBlockingWrite(fd int, buf []byte) *tcpip.Error { return nil } -// NonBlockingWrite3 writes up to three byte slices to a file descriptor in a -// single syscall. It fails if partial data is written. -func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error { - // If the is no second buffer, issue a regular write. - if len(b2) == 0 { - return NonBlockingWrite(fd, b1) - } - - // We have two buffers. Build the iovec that represents them and issue - // a writev syscall. - iovec := [3]syscall.Iovec{ - { - Base: &b1[0], - Len: uint64(len(b1)), - }, - { - Base: &b2[0], - Len: uint64(len(b2)), - }, - } - iovecLen := uintptr(2) - - if len(b3) > 0 { - iovecLen++ - iovec[2].Base = &b3[0] - iovec[2].Len = uint64(len(b3)) - } - +// NonBlockingWriteIovec writes iovec to a file descriptor in a single syscall. +// It fails if partial data is written. 
+func NonBlockingWriteIovec(fd int, iovec []syscall.Iovec) *tcpip.Error { + iovecLen := uintptr(len(iovec)) _, _, e := syscall.RawSyscall(syscall.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&iovec[0])), iovecLen) if e != 0 { return TranslateErrno(e) } - return nil } diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go index 0796d717e..7fb8a6c49 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem.go +++ b/pkg/tcpip/link/sharedmem/sharedmem.go @@ -183,27 +183,33 @@ func (e *endpoint) LinkAddress() tcpip.LinkAddress { return e.addr } -// WritePacket writes outbound packets to the file descriptor. If it is not -// currently writable, the packet is dropped. -func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { - // Add the ethernet header here. - eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize)) - pkt.LinkHeader = buffer.View(eth) +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + // Add ethernet header if needed. + eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) ethHdr := &header.EthernetFields{ - DstAddr: r.RemoteLinkAddress, + DstAddr: remote, Type: protocol, } - if r.LocalLinkAddress != "" { - ethHdr.SrcAddr = r.LocalLinkAddress + + // Preserve the src address if it's set in the route. + if local != "" { + ethHdr.SrcAddr = local } else { ethHdr.SrcAddr = e.addr } eth.Encode(ethHdr) +} + +// WritePacket writes outbound packets to the file descriptor. If it is not +// currently writable, the packet is dropped. +func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + e.AddHeader(r.LocalLinkAddress, r.RemoteLinkAddress, protocol, pkt) - v := pkt.Data.ToView() + views := pkt.Views() // Transmit the packet. e.mu.Lock() - ok := e.tx.transmit(pkt.Header.View(), v) + ok := e.tx.transmit(views...) e.mu.Unlock() if !ok { @@ -220,10 +226,10 @@ func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketB // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - v := vv.ToView() + views := vv.Views() // Transmit the packet. e.mu.Lock() - ok := e.tx.transmit(v, buffer.View{}) + ok := e.tx.transmit(views...) e.mu.Unlock() if !ok { @@ -269,16 +275,18 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) { rxb[i].Size = e.bufferSize } - if n < header.EthernetMinimumSize { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.View(b).ToVectorisedView(), + }) + + hdr, ok := pkt.LinkHeader().Consume(header.EthernetMinimumSize) + if !ok { continue } + eth := header.Ethernet(hdr) // Send packet up the stack. - eth := header.Ethernet(b[:header.EthernetMinimumSize]) - d.DeliverNetworkPacket(e, eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), stack.PacketBuffer{ - Data: buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(), - LinkHeader: buffer.View(eth), - }) + d.DeliverNetworkPacket(eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), pkt) } // Clean state. 
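Editor's note: with NonBlockingWrite3 removed, callers now build the syscall.Iovec slice themselves and hand it to NonBlockingWriteIovec, so a packet spread across any number of views can be written in a single writev(2). The following is a small hypothetical helper showing that pattern (field sizes as on linux/amd64, where Iovec.Len is a uint64); empty slices are skipped because each iovec entry must reference at least one byte, and the write target (stdout) is arbitrary.

package main

import (
	"syscall"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
)

// writeBuffers issues a single writev(2) covering all non-empty buffers.
func writeBuffers(fd int, bufs ...[]byte) *tcpip.Error {
	iovecs := make([]syscall.Iovec, 0, len(bufs))
	for _, b := range bufs {
		if len(b) == 0 {
			continue // An iovec entry must reference at least one byte.
		}
		iovecs = append(iovecs, syscall.Iovec{
			Base: &b[0],
			Len:  uint64(len(b)),
		})
	}
	// Callers are expected to pass at least one non-empty buffer.
	return rawfile.NonBlockingWriteIovec(fd, iovecs)
}

func main() {
	// Write a "header" and a "payload" to stdout with one syscall.
	_ = writeBuffers(1, []byte("header "), []byte("payload\n"))
}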
@@ -287,3 +295,8 @@ func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) { e.completed.Done() } + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType +func (*endpoint) ARPHardwareType() header.ARPHardwareType { + return header.ARPHardwareEther +} diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go index 33f640b85..22d5c97f1 100644 --- a/pkg/tcpip/link/sharedmem/sharedmem_test.go +++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go @@ -131,7 +131,7 @@ func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress return c } -func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { +func (c *testContext) DeliverNetworkPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { c.mu.Lock() c.packets = append(c.packets, packetInfo{ addr: remoteLinkAddr, @@ -143,6 +143,10 @@ func (c *testContext) DeliverNetworkPacket(_ stack.LinkEndpoint, remoteLinkAddr, c.packetCh <- struct{}{} } +func (c *testContext) DeliverOutboundPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func (c *testContext) cleanup() { c.ep.Close() closeFDs(&c.txCfg) @@ -262,21 +266,23 @@ func TestSimpleSend(t *testing.T) { for iters := 1000; iters > 0; iters-- { func() { + hdrLen, dataLen := rand.Intn(10000), rand.Intn(10000) + // Prepare and send packet. - n := rand.Intn(10000) - hdr := buffer.NewPrependable(n + int(c.ep.MaxHeaderLength())) - hdrBuf := hdr.Prepend(n) + hdrBuf := buffer.NewView(hdrLen) randomFill(hdrBuf) - n = rand.Intn(10000) - buf := buffer.NewView(n) - randomFill(buf) + data := buffer.NewView(dataLen) + randomFill(data) + + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: hdrLen + int(c.ep.MaxHeaderLength()), + Data: data.ToVectorisedView(), + }) + copy(pkt.NetworkHeader().Push(hdrLen), hdrBuf) proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000)) - if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != nil { + if err := c.ep.WritePacket(&r, nil /* gso */, proto, pkt); err != nil { t.Fatalf("WritePacket failed: %v", err) } @@ -313,7 +319,7 @@ func TestSimpleSend(t *testing.T) { // Compare contents skipping the ethernet header added by the // endpoint. - merged := append(hdrBuf, buf...) + merged := append(hdrBuf, data...) if uint32(len(contents)) < pi.Size { t.Fatalf("Sum of buffers is less than packet size: %v < %v", len(contents), pi.Size) } @@ -340,14 +346,14 @@ func TestPreserveSrcAddressInSend(t *testing.T) { LocalLinkAddress: newLocalLinkAddress, } - // WritePacket panics given a prependable with anything less than - // the minimum size of the ethernet header. - hdr := buffer.NewPrependable(header.EthernetMinimumSize) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + // WritePacket panics given a prependable with anything less than + // the minimum size of the ethernet header. 
+ ReserveHeaderBytes: header.EthernetMinimumSize, + }) proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000)) - if err := c.ep.WritePacket(&r, nil /* gso */, proto, stack.PacketBuffer{ - Header: hdr, - }); err != nil { + if err := c.ep.WritePacket(&r, nil /* gso */, proto, pkt); err != nil { t.Fatalf("WritePacket failed: %v", err) } @@ -399,12 +405,12 @@ func TestFillTxQueue(t *testing.T) { // until the tx queue if full. ids := make(map[uint64]struct{}) for i := queuePipeSize / 40; i > 0; i-- { - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) - if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != nil { + if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != nil { t.Fatalf("WritePacket failed unexpectedly: %v", err) } @@ -418,11 +424,11 @@ func TestFillTxQueue(t *testing.T) { } // Next attempt to write must fail. - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != want { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) + if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != want { t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want) } } @@ -446,11 +452,11 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) { // Send two packets so that the id slice has at least two slots. for i := 2; i > 0; i-- { - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != nil { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) + if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != nil { t.Fatalf("WritePacket failed unexpectedly: %v", err) } } @@ -469,11 +475,11 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) { // until the tx queue if full. ids := make(map[uint64]struct{}) for i := queuePipeSize / 40; i > 0; i-- { - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != nil { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) + if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != nil { t.Fatalf("WritePacket failed unexpectedly: %v", err) } @@ -487,11 +493,11 @@ func TestFillTxQueueAfterBadCompletion(t *testing.T) { } // Next attempt to write must fail. 
- hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != want { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) + if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != want { t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want) } } @@ -513,11 +519,11 @@ func TestFillTxMemory(t *testing.T) { // we fill the memory. ids := make(map[uint64]struct{}) for i := queueDataSize / bufferSize; i > 0; i-- { - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != nil { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) + if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != nil { t.Fatalf("WritePacket failed unexpectedly: %v", err) } @@ -532,11 +538,11 @@ func TestFillTxMemory(t *testing.T) { } // Next attempt to write must fail. - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), }) + err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt) if want := tcpip.ErrWouldBlock; err != want { t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want) } @@ -560,11 +566,11 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) { // Each packet is uses up one buffer, so write as many as possible // until there is only one buffer left. for i := queueDataSize/bufferSize - 1; i > 0; i-- { - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != nil { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) + if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != nil { t.Fatalf("WritePacket failed unexpectedly: %v", err) } @@ -575,23 +581,22 @@ func TestFillTxMemoryWithMultiBuffer(t *testing.T) { // Attempt to write a two-buffer packet. It must fail. { - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - uu := buffer.NewView(bufferSize).ToVectorisedView() - if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: uu, - }); err != want { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buffer.NewView(bufferSize).ToVectorisedView(), + }) + if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != want { t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want) } } // Attempt to write the one-buffer packet again. It must succeed. 
{ - hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength())) - if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - Data: buf.ToVectorisedView(), - }); err != nil { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(c.ep.MaxHeaderLength()), + Data: buf.ToVectorisedView(), + }) + if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, pkt); err != nil { t.Fatalf("WritePacket failed unexpectedly: %v", err) } } diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go index 6b8d7859d..44f421c2d 100644 --- a/pkg/tcpip/link/sharedmem/tx.go +++ b/pkg/tcpip/link/sharedmem/tx.go @@ -18,6 +18,7 @@ import ( "math" "syscall" + "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue" ) @@ -76,9 +77,9 @@ func (t *tx) cleanup() { syscall.Munmap(t.data) } -// transmit sends a packet made up of up to two buffers. Returns a boolean that -// specifies whether the packet was successfully transmitted. -func (t *tx) transmit(a, b []byte) bool { +// transmit sends a packet made of bufs. Returns a boolean that specifies +// whether the packet was successfully transmitted. +func (t *tx) transmit(bufs ...buffer.View) bool { // Pull completions from the tx queue and add their buffers back to the // pool so that we can reuse them. for { @@ -93,7 +94,10 @@ func (t *tx) transmit(a, b []byte) bool { } bSize := t.bufs.entrySize - total := uint32(len(a) + len(b)) + total := uint32(0) + for _, data := range bufs { + total += uint32(len(data)) + } bufCount := (total + bSize - 1) / bSize // Allocate enough buffers to hold all the data. @@ -115,7 +119,7 @@ func (t *tx) transmit(a, b []byte) bool { // Copy data into allocated buffers. nBuf := buf var dBuf []byte - for _, data := range [][]byte{a, b} { + for _, data := range bufs { for len(data) > 0 { if len(dBuf) == 0 { dBuf = t.data[nBuf.Offset:][:nBuf.Size] diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD index 230a8d53a..7cbc305e7 100644 --- a/pkg/tcpip/link/sniffer/BUILD +++ b/pkg/tcpip/link/sniffer/BUILD @@ -14,6 +14,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/link/nested", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go index da1c520ae..4fb127978 100644 --- a/pkg/tcpip/link/sniffer/sniffer.go +++ b/pkg/tcpip/link/sniffer/sniffer.go @@ -31,6 +31,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/nested" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -47,26 +48,22 @@ var LogPackets uint32 = 1 // LogPacketsToPCAP must be accessed atomically. var LogPacketsToPCAP uint32 = 1 -var transportProtocolMinSizes map[tcpip.TransportProtocolNumber]int = map[tcpip.TransportProtocolNumber]int{ - header.ICMPv4ProtocolNumber: header.IPv4MinimumSize, - header.ICMPv6ProtocolNumber: header.IPv6MinimumSize, - header.UDPProtocolNumber: header.UDPMinimumSize, - header.TCPProtocolNumber: header.TCPMinimumSize, -} - type endpoint struct { - dispatcher stack.NetworkDispatcher - lower stack.LinkEndpoint + nested.Endpoint writer io.Writer maxPCAPLen uint32 } +var _ stack.GSOEndpoint = (*endpoint)(nil) +var _ stack.LinkEndpoint = (*endpoint)(nil) +var _ stack.NetworkDispatcher = (*endpoint)(nil) + // New creates a new sniffer link-layer endpoint. 
It wraps around another // endpoint and logs packets and they traverse the endpoint. func New(lower stack.LinkEndpoint) stack.LinkEndpoint { - return &endpoint{ - lower: lower, - } + sniffer := &endpoint{} + sniffer.Endpoint.Init(lower, sniffer) + return sniffer } func zoneOffset() (int32, error) { @@ -110,62 +107,25 @@ func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) ( if err := writePCAPHeader(writer, snapLen); err != nil { return nil, err } - return &endpoint{ - lower: lower, + sniffer := &endpoint{ writer: writer, maxPCAPLen: snapLen, - }, nil + } + sniffer.Endpoint.Init(lower, sniffer) + return sniffer, nil } // DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is // called by the link-layer endpoint being wrapped when a packet arrives, and // logs the packet before forwarding to the actual dispatcher. -func (e *endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { - e.dumpPacket("recv", nil, protocol, &pkt) - e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt) -} - -// Attach implements the stack.LinkEndpoint interface. It saves the dispatcher -// and registers with the lower endpoint as its dispatcher so that "e" is called -// for inbound packets. -func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) { - e.dispatcher = dispatcher - e.lower.Attach(e) -} - -// IsAttached implements stack.LinkEndpoint.IsAttached. -func (e *endpoint) IsAttached() bool { - return e.dispatcher != nil +func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.dumpPacket("recv", nil, protocol, pkt) + e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt) } -// MTU implements stack.LinkEndpoint.MTU. It just forwards the request to the -// lower endpoint. -func (e *endpoint) MTU() uint32 { - return e.lower.MTU() -} - -// Capabilities implements stack.LinkEndpoint.Capabilities. It just forwards the -// request to the lower endpoint. -func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { - return e.lower.Capabilities() -} - -// MaxHeaderLength implements the stack.LinkEndpoint interface. It just forwards -// the request to the lower endpoint. -func (e *endpoint) MaxHeaderLength() uint16 { - return e.lower.MaxHeaderLength() -} - -func (e *endpoint) LinkAddress() tcpip.LinkAddress { - return e.lower.LinkAddress() -} - -// GSOMaxSize returns the maximum GSO packet size. -func (e *endpoint) GSOMaxSize() uint32 { - if gso, ok := e.lower.(stack.GSOEndpoint); ok { - return gso.GSOMaxSize() - } - return 0 +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. 
+func (e *endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.Endpoint.DeliverOutboundPacket(remote, local, protocol, pkt) } func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { @@ -174,7 +134,7 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw logPacket(prefix, protocol, pkt, gso) } if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 { - totalLength := pkt.Header.UsedLength() + pkt.Data.Size() + totalLength := pkt.Size() length := totalLength if max := int(e.maxPCAPLen); length > max { length = max @@ -195,12 +155,11 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw length -= n } } - write(pkt.Header.View()) - for _, view := range pkt.Data.Views() { + for _, v := range pkt.Views() { if length == 0 { break } - write(view) + write(v) } } } @@ -208,9 +167,9 @@ func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.Netw // WritePacket implements the stack.LinkEndpoint interface. It is called by // higher-level protocols to write packets; it just logs the packet and // forwards the request to the lower endpoint. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { - e.dumpPacket("send", gso, protocol, &pkt) - return e.lower.WritePacket(r, gso, protocol, pkt) +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { + e.dumpPacket("send", gso, protocol, pkt) + return e.Endpoint.WritePacket(r, gso, protocol, pkt) } // WritePackets implements the stack.LinkEndpoint interface. It is called by @@ -220,20 +179,17 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { e.dumpPacket("send", gso, protocol, pkt) } - return e.lower.WritePackets(r, gso, pkts, protocol) + return e.Endpoint.WritePackets(r, gso, pkts, protocol) } // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { - e.dumpPacket("send", nil, 0, &stack.PacketBuffer{ + e.dumpPacket("send", nil, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: vv, - }) - return e.lower.WriteRawPacket(vv) + })) + return e.Endpoint.WriteRawPacket(vv) } -// Wait implements stack.LinkEndpoint.Wait. -func (e *endpoint) Wait() { e.lower.Wait() } - func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) { // Figure out the network layer info. var transProto uint8 @@ -244,12 +200,8 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P var fragmentOffset uint16 var moreFragments bool - // Create a clone of pkt, including any headers if present. Avoid allocating - // backing memory for the clone. - views := [8]buffer.View{} - vv := buffer.NewVectorisedView(0, views[:0]) - vv.AppendView(pkt.Header.View()) - vv.Append(pkt.Data) + // Examine the packet using a new VV. Backing storage must not be written. 
+ vv := buffer.NewVectorisedView(pkt.Size(), pkt.Views()) switch protocol { case header.IPv4ProtocolNumber: @@ -287,7 +239,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P vv.TrimFront(header.ARPSize) arp := header.ARP(hdr) log.Infof( - "%s arp %v (%v) -> %v (%v) valid:%v", + "%s arp %s (%s) -> %s (%s) valid:%t", prefix, tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()), tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()), @@ -299,13 +251,6 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P return } - // We aren't guaranteed to have a transport header - it's possible for - // writes via raw endpoints to contain only network headers. - if minSize, ok := transportProtocolMinSizes[tcpip.TransportProtocolNumber(transProto)]; ok && vv.Size() < minSize { - log.Infof("%s %v -> %v transport protocol: %d, but no transport header found (possible raw packet)", prefix, src, dst, transProto) - return - } - // Figure out the transport layer info. transName := "unknown" srcPort := uint16(0) @@ -346,7 +291,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P icmpType = "info reply" } } - log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) + log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.ICMPv6ProtocolNumber: @@ -381,7 +326,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P case header.ICMPv6RedirectMsg: icmpType = "redirect message" } - log.Infof("%s %s %v -> %v %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) + log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code()) return case header.UDPProtocolNumber: @@ -428,7 +373,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P flagsStr[i] = ' ' } } - details = fmt.Sprintf("flags:0x%02x (%v) seqnum: %v ack: %v win: %v xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum()) + details = fmt.Sprintf("flags:0x%02x (%s) seqnum: %d ack: %d win: %d xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum()) if flags&header.TCPFlagSyn != 0 { details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0)) } else { @@ -437,7 +382,7 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P } default: - log.Infof("%s %v -> %v unknown transport protocol: %d", prefix, src, dst, transProto) + log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto) return } @@ -445,5 +390,5 @@ func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.P details += fmt.Sprintf(" gso: %+v", gso) } - log.Infof("%s %s %v:%v -> %v:%v len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details) + log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details) } diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD index e0db6cf54..6c137f693 100644 --- a/pkg/tcpip/link/tun/BUILD +++ b/pkg/tcpip/link/tun/BUILD @@ -12,6 +12,7 @@ go_library( visibility = ["//visibility:public"], deps = [ 
"//pkg/abi/linux", + "//pkg/context", "//pkg/refs", "//pkg/sync", "//pkg/syserror", diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go index 617446ea2..3b1510a33 100644 --- a/pkg/tcpip/link/tun/device.go +++ b/pkg/tcpip/link/tun/device.go @@ -18,6 +18,7 @@ import ( "fmt" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" @@ -64,14 +65,14 @@ func (d *Device) beforeSave() { } // Release implements fs.FileOperations.Release. -func (d *Device) Release() { +func (d *Device) Release(ctx context.Context) { d.mu.Lock() defer d.mu.Unlock() // Decrease refcount if there is an endpoint associated with this file. if d.endpoint != nil { d.endpoint.RemoveNotify(d.notifyHandle) - d.endpoint.DecRef() + d.endpoint.DecRef(ctx) d.endpoint = nil } } @@ -139,6 +140,7 @@ func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkE stack: s, nicID: id, name: name, + isTap: prefix == "tap", } endpoint.Endpoint.LinkEPCapabilities = linkCaps if endpoint.name == "" { @@ -213,12 +215,11 @@ func (d *Device) Write(data []byte) (int64, error) { remote = tcpip.LinkAddress(zeroMAC[:]) } - pkt := stack.PacketBuffer{ - Data: buffer.View(data).ToVectorisedView(), - } - if ethHdr != nil { - pkt.LinkHeader = buffer.View(ethHdr) - } + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: len(ethHdr), + Data: buffer.View(data).ToVectorisedView(), + }) + copy(pkt.LinkHeader().Push(len(ethHdr)), ethHdr) endpoint.InjectLinkAddr(protocol, remote, pkt) return dataLen, nil } @@ -263,33 +264,22 @@ func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) { // If the packet does not already have link layer header, and the route // does not exist, we can't compute it. This is possibly a raw packet, tun // device doesn't support this at the moment. - if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" { + if info.Pkt.LinkHeader().View().IsEmpty() && info.Route.RemoteLinkAddress == "" { return nil, false } // Ethernet header (TAP only). if d.hasFlags(linux.IFF_TAP) { // Add ethernet header if not provided. - if info.Pkt.LinkHeader == nil { - hdr := &header.EthernetFields{ - SrcAddr: info.Route.LocalLinkAddress, - DstAddr: info.Route.RemoteLinkAddress, - Type: info.Proto, - } - if hdr.SrcAddr == "" { - hdr.SrcAddr = d.endpoint.LinkAddress() - } - - eth := make(header.Ethernet, header.EthernetMinimumSize) - eth.Encode(hdr) - vv.AppendView(buffer.View(eth)) - } else { - vv.AppendView(info.Pkt.LinkHeader) + if info.Pkt.LinkHeader().View().IsEmpty() { + d.endpoint.AddHeader(info.Route.LocalLinkAddress, info.Route.RemoteLinkAddress, info.Proto, info.Pkt) } + vv.AppendView(info.Pkt.LinkHeader().View()) } // Append upper headers. - vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):])) + vv.AppendView(info.Pkt.NetworkHeader().View()) + vv.AppendView(info.Pkt.TransportHeader().View()) // Append data payload. vv.Append(info.Pkt.Data) @@ -348,11 +338,46 @@ type tunEndpoint struct { stack *stack.Stack nicID tcpip.NICID name string + isTap bool } // DecRef decrements refcount of e, removes NIC if refcount goes to 0. -func (e *tunEndpoint) DecRef() { - e.DecRefWithDestructor(func() { +func (e *tunEndpoint) DecRef(ctx context.Context) { + e.DecRefWithDestructor(ctx, func(context.Context) { e.stack.RemoveNIC(e.nicID) }) } + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. 
+func (e *tunEndpoint) ARPHardwareType() header.ARPHardwareType { + if e.isTap { + return header.ARPHardwareEther + } + return header.ARPHardwareNone +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *tunEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + if !e.isTap { + return + } + eth := header.Ethernet(pkt.LinkHeader().Push(header.EthernetMinimumSize)) + hdr := &header.EthernetFields{ + SrcAddr: local, + DstAddr: remote, + Type: protocol, + } + if hdr.SrcAddr == "" { + hdr.SrcAddr = e.LinkAddress() + } + + eth.Encode(hdr) +} + +// MaxHeaderLength returns the maximum size of the link layer header. +func (e *tunEndpoint) MaxHeaderLength() uint16 { + if e.isTap { + return header.EthernetMinimumSize + } + return 0 +} diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD index 0956d2c65..ee84c3d96 100644 --- a/pkg/tcpip/link/waitable/BUILD +++ b/pkg/tcpip/link/waitable/BUILD @@ -12,6 +12,7 @@ go_library( "//pkg/gate", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/stack", ], ) @@ -25,6 +26,7 @@ go_test( deps = [ "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", "//pkg/tcpip/stack", ], ) diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go index 2b3741276..b152a0f26 100644 --- a/pkg/tcpip/link/waitable/waitable.go +++ b/pkg/tcpip/link/waitable/waitable.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/gate" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -50,12 +51,21 @@ func New(lower stack.LinkEndpoint) *Endpoint { // It is called by the link-layer endpoint being wrapped when a packet arrives, // and only forwards to the actual dispatcher if Wait or WaitDispatch haven't // been called. -func (e *Endpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { +func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { if !e.dispatchGate.Enter() { return } - e.dispatcher.DeliverNetworkPacket(e, remote, local, protocol, pkt) + e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt) + e.dispatchGate.Leave() +} + +// DeliverOutboundPacket implements stack.NetworkDispatcher.DeliverOutboundPacket. +func (e *Endpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + if !e.dispatchGate.Enter() { + return + } + e.dispatcher.DeliverOutboundPacket(remote, local, protocol, pkt) e.dispatchGate.Leave() } @@ -99,7 +109,7 @@ func (e *Endpoint) LinkAddress() tcpip.LinkAddress { // WritePacket implements stack.LinkEndpoint.WritePacket. It is called by // higher-level protocols to write packets. It only forwards packets to the // lower endpoint if Wait or WaitWrite haven't been called. -func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { if !e.writeGate.Enter() { return nil } @@ -147,3 +157,13 @@ func (e *Endpoint) WaitDispatch() { // Wait implements stack.LinkEndpoint.Wait. 
func (e *Endpoint) Wait() {} + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (e *Endpoint) ARPHardwareType() header.ARPHardwareType { + return e.lower.ARPHardwareType() +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *Endpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + e.lower.AddHeader(local, remote, protocol, pkt) +} diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go index 54eb5322b..94827fc56 100644 --- a/pkg/tcpip/link/waitable/waitable_test.go +++ b/pkg/tcpip/link/waitable/waitable_test.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -35,10 +36,14 @@ type countedEndpoint struct { dispatcher stack.NetworkDispatcher } -func (e *countedEndpoint) DeliverNetworkPacket(linkEP stack.LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { +func (e *countedEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { e.dispatchCount++ } +func (e *countedEndpoint) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func (e *countedEndpoint) Attach(dispatcher stack.NetworkDispatcher) { e.attachCount++ e.dispatcher = dispatcher @@ -65,7 +70,7 @@ func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress { return e.linkAddr } -func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { e.writeCount++ return nil } @@ -81,29 +86,39 @@ func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { return nil } +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*countedEndpoint) ARPHardwareType() header.ARPHardwareType { + panic("unimplemented") +} + // Wait implements stack.LinkEndpoint.Wait. func (*countedEndpoint) Wait() {} +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *countedEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("unimplemented") +} + func TestWaitWrite(t *testing.T) { ep := &countedEndpoint{} wep := New(ep) // Write and check that it goes through. - wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{}) + wep.WritePacket(nil, nil /* gso */, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{})) if want := 1; ep.writeCount != want { t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want) } // Wait on dispatches, then try to write. It must go through. wep.WaitDispatch() - wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{}) + wep.WritePacket(nil, nil /* gso */, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{})) if want := 2; ep.writeCount != want { t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want) } // Wait on writes, then try to write. It must not go through. 
wep.WaitWrite() - wep.WritePacket(nil, nil /* gso */, 0, stack.PacketBuffer{}) + wep.WritePacket(nil, nil /* gso */, 0, stack.NewPacketBuffer(stack.PacketBufferOptions{})) if want := 2; ep.writeCount != want { t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want) } @@ -120,21 +135,21 @@ func TestWaitDispatch(t *testing.T) { } // Dispatch and check that it goes through. - ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{}) + ep.dispatcher.DeliverNetworkPacket("", "", 0, stack.NewPacketBuffer(stack.PacketBufferOptions{})) if want := 1; ep.dispatchCount != want { t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want) } // Wait on writes, then try to dispatch. It must go through. wep.WaitWrite() - ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{}) + ep.dispatcher.DeliverNetworkPacket("", "", 0, stack.NewPacketBuffer(stack.PacketBufferOptions{})) if want := 2; ep.dispatchCount != want { t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want) } // Wait on dispatches, then try to dispatch. It must not go through. wep.WaitDispatch() - ep.dispatcher.DeliverNetworkPacket(ep, "", "", 0, stack.PacketBuffer{}) + ep.dispatcher.DeliverNetworkPacket("", "", 0, stack.NewPacketBuffer(stack.PacketBufferOptions{})) if want := 2; ep.dispatchCount != want { t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want) } diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go index 9d0797af7..1ad788a17 100644 --- a/pkg/tcpip/network/arp/arp.go +++ b/pkg/tcpip/network/arp/arp.go @@ -80,7 +80,7 @@ func (e *endpoint) MaxHeaderLength() uint16 { func (e *endpoint) Close() {} -func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, stack.PacketBuffer) *tcpip.Error { +func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, *stack.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported } @@ -94,16 +94,12 @@ func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList return 0, tcpip.ErrNotSupported } -func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error { +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported } -func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { - v, ok := pkt.Data.PullUp(header.ARPSize) - if !ok { - return - } - h := header.ARP(v) +func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { + h := header.ARP(pkt.NetworkHeader().View()) if !h.IsValid() { return } @@ -114,17 +110,17 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { if e.linkAddrCache.CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, localAddr) == 0 { return // we have no useful answer, ignore the request } - hdr := buffer.NewPrependable(int(e.linkEP.MaxHeaderLength()) + header.ARPSize) - packet := header.ARP(hdr.Prepend(header.ARPSize)) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(e.linkEP.MaxHeaderLength()) + header.ARPSize, + }) + packet := header.ARP(pkt.NetworkHeader().Push(header.ARPSize)) packet.SetIPv4OverEthernet() packet.SetOp(header.ARPReply) copy(packet.HardwareAddressSender(), r.LocalLinkAddress[:]) copy(packet.ProtocolAddressSender(), h.ProtocolAddressTarget()) copy(packet.HardwareAddressTarget(), h.HardwareAddressSender()) copy(packet.ProtocolAddressTarget(), h.ProtocolAddressSender()) - 
e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - }) + _ = e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt) fallthrough // also fill the cache from requests case header.ARPReply: addr := tcpip.Address(h.ProtocolAddressSender()) @@ -164,28 +160,31 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { } // LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest. -func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { +func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error { r := &stack.Route{ - RemoteLinkAddress: broadcastMAC, + RemoteLinkAddress: remoteLinkAddr, + } + if len(r.RemoteLinkAddress) == 0 { + r.RemoteLinkAddress = header.EthernetBroadcastAddress } - hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.ARPSize) - h := header.ARP(hdr.Prepend(header.ARPSize)) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.ARPSize, + }) + h := header.ARP(pkt.NetworkHeader().Push(header.ARPSize)) h.SetIPv4OverEthernet() h.SetOp(header.ARPRequest) copy(h.HardwareAddressSender(), linkEP.LinkAddress()) copy(h.ProtocolAddressSender(), localAddr) copy(h.ProtocolAddressTarget(), addr) - return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - }) + return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt) } // ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress. func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { if addr == header.IPv4Broadcast { - return broadcastMAC, true + return header.EthernetBroadcastAddress, true } if header.IsV4MulticastAddress(addr) { return header.EthernetAddressFromMulticastIPv4Address(addr), true @@ -209,7 +208,14 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} -var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) +// Parse implements stack.NetworkProtocol.Parse. +func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { + _, ok = pkt.NetworkHeader().Consume(header.ARPSize) + if !ok { + return 0, false, false + } + return 0, false, true +} // NewProtocol returns an ARP network protocol. 
func NewProtocol() stack.NetworkProtocol { diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go index 1646d9cde..c2c3e6891 100644 --- a/pkg/tcpip/network/arp/arp_test.go +++ b/pkg/tcpip/network/arp/arp_test.go @@ -32,10 +32,14 @@ import ( ) const ( - stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") - stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") - stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") - stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") + stackLinkAddr1 = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c") + stackLinkAddr2 = tcpip.LinkAddress("\x0b\x0b\x0c\x0c\x0d\x0d") + stackAddr1 = tcpip.Address("\x0a\x00\x00\x01") + stackAddr2 = tcpip.Address("\x0a\x00\x00\x02") + stackAddrBad = tcpip.Address("\x0a\x00\x00\x03") + + defaultChannelSize = 1 + defaultMTU = 65536 ) type testContext struct { @@ -50,8 +54,7 @@ func newTestContext(t *testing.T) *testContext { TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4()}, }) - const defaultMTU = 65536 - ep := channel.New(256, defaultMTU, stackLinkAddr) + ep := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1) wep := stack.LinkEndpoint(ep) if testing.Verbose() { @@ -103,9 +106,9 @@ func TestDirectRequest(t *testing.T) { inject := func(addr tcpip.Address) { copy(h.ProtocolAddressTarget(), addr) - c.linkEP.InjectInbound(arp.ProtocolNumber, stack.PacketBuffer{ + c.linkEP.InjectInbound(arp.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: v.ToVectorisedView(), - }) + })) } for i, address := range []tcpip.Address{stackAddr1, stackAddr2} { @@ -115,11 +118,11 @@ func TestDirectRequest(t *testing.T) { if pi.Proto != arp.ProtocolNumber { t.Fatalf("expected ARP response, got network protocol number %d", pi.Proto) } - rep := header.ARP(pi.Pkt.Header.View()) + rep := header.ARP(pi.Pkt.NetworkHeader().View()) if !rep.IsValid() { - t.Fatalf("invalid ARP response pi.Pkt.Header.UsedLength()=%d", pi.Pkt.Header.UsedLength()) + t.Fatalf("invalid ARP response: len = %d; response = %x", len(rep), rep) } - if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want { + if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr1; got != want { t.Errorf("got HardwareAddressSender = %s, want = %s", got, want) } if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want { @@ -144,3 +147,44 @@ func TestDirectRequest(t *testing.T) { t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto) } } + +func TestLinkAddressRequest(t *testing.T) { + tests := []struct { + name string + remoteLinkAddr tcpip.LinkAddress + expectLinkAddr tcpip.LinkAddress + }{ + { + name: "Unicast", + remoteLinkAddr: stackLinkAddr2, + expectLinkAddr: stackLinkAddr2, + }, + { + name: "Multicast", + remoteLinkAddr: "", + expectLinkAddr: header.EthernetBroadcastAddress, + }, + } + + for _, test := range tests { + p := arp.NewProtocol() + linkRes, ok := p.(stack.LinkAddressResolver) + if !ok { + t.Fatal("expected ARP protocol to implement stack.LinkAddressResolver") + } + + linkEP := channel.New(defaultChannelSize, defaultMTU, stackLinkAddr1) + if err := linkRes.LinkAddressRequest(stackAddr1, stackAddr2, test.remoteLinkAddr, linkEP); err != nil { + t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", stackAddr1, stackAddr2, test.remoteLinkAddr, err) + } + + pkt, ok := linkEP.Read() + if !ok { + t.Fatal("expected to send a link address request") + } + + if got, want := pkt.Route.RemoteLinkAddress, 
test.expectLinkAddr; got != want { + t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", got, want) + } + } +} diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go index f42abc4bb..1827666c5 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation.go +++ b/pkg/tcpip/network/fragmentation/fragmentation.go @@ -17,28 +17,58 @@ package fragmentation import ( + "errors" "fmt" "log" "time" "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) -// DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time. -const DefaultReassembleTimeout = 30 * time.Second +const ( + // DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time. + DefaultReassembleTimeout = 30 * time.Second -// HighFragThreshold is the threshold at which we start trimming old -// fragmented packets. Linux uses a default value of 4 MB. See -// net.ipv4.ipfrag_high_thresh for more information. -const HighFragThreshold = 4 << 20 // 4MB + // HighFragThreshold is the threshold at which we start trimming old + // fragmented packets. Linux uses a default value of 4 MB. See + // net.ipv4.ipfrag_high_thresh for more information. + HighFragThreshold = 4 << 20 // 4MB -// LowFragThreshold is the threshold we reach to when we start dropping -// older fragmented packets. It's important that we keep enough room for newer -// packets to be re-assembled. Hence, this needs to be lower than -// HighFragThreshold enough. Linux uses a default value of 3 MB. See -// net.ipv4.ipfrag_low_thresh for more information. -const LowFragThreshold = 3 << 20 // 3MB + // LowFragThreshold is the threshold we reach to when we start dropping + // older fragmented packets. It's important that we keep enough room for newer + // packets to be re-assembled. Hence, this needs to be lower than + // HighFragThreshold enough. Linux uses a default value of 3 MB. See + // net.ipv4.ipfrag_low_thresh for more information. + LowFragThreshold = 3 << 20 // 3MB + + // minBlockSize is the minimum block size for fragments. + minBlockSize = 1 +) + +var ( + // ErrInvalidArgs indicates to the caller that that an invalid argument was + // provided. + ErrInvalidArgs = errors.New("invalid args") +) + +// FragmentID is the identifier for a fragment. +type FragmentID struct { + // Source is the source address of the fragment. + Source tcpip.Address + + // Destination is the destination address of the fragment. + Destination tcpip.Address + + // ID is the identification value of the fragment. + // + // This is a uint32 because IPv6 uses a 32-bit identification value. + ID uint32 + + // The protocol for the packet. + Protocol uint8 +} // Fragmentation is the main structure that other modules // of the stack should use to implement IP Fragmentation. @@ -46,14 +76,17 @@ type Fragmentation struct { mu sync.Mutex highLimit int lowLimit int - reassemblers map[uint32]*reassembler + reassemblers map[FragmentID]*reassembler rList reassemblerList size int timeout time.Duration + blockSize uint16 } // NewFragmentation creates a new Fragmentation. // +// blockSize specifies the fragment block size, in bytes. +// // highMemoryLimit specifies the limit on the memory consumed // by the fragments stored by Fragmentation (overhead of internal data-structures // is not accounted). Fragments are dropped when the limit is reached. @@ -64,7 +97,7 @@ type Fragmentation struct { // reassemblingTimeout specifies the maximum time allowed to reassemble a packet. 
// Fragments are lazily evicted only when a new a packet with an // already existing fragmentation-id arrives after the timeout. -func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { +func NewFragmentation(blockSize uint16, highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation { if lowMemoryLimit >= highMemoryLimit { lowMemoryLimit = highMemoryLimit } @@ -73,17 +106,46 @@ func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout t lowMemoryLimit = 0 } + if blockSize < minBlockSize { + blockSize = minBlockSize + } + return &Fragmentation{ - reassemblers: make(map[uint32]*reassembler), + reassemblers: make(map[FragmentID]*reassembler), highLimit: highMemoryLimit, lowLimit: lowMemoryLimit, timeout: reassemblingTimeout, + blockSize: blockSize, } } -// Process processes an incoming fragment belonging to an ID -// and returns a complete packet when all the packets belonging to that ID have been received. -func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) { +// Process processes an incoming fragment belonging to an ID and returns a +// complete packet when all the packets belonging to that ID have been received. +// +// [first, last] is the range of the fragment bytes. +// +// first must be a multiple of the block size f is configured with. The size +// of the fragment data must be a multiple of the block size, unless there are +// no fragments following this fragment (more set to false). +func (f *Fragmentation) Process(id FragmentID, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) { + if first > last { + return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is greater than last=%d: %w", first, last, ErrInvalidArgs) + } + + if first%f.blockSize != 0 { + return buffer.VectorisedView{}, false, fmt.Errorf("first=%d is not a multiple of block size=%d: %w", first, f.blockSize, ErrInvalidArgs) + } + + fragmentSize := last - first + 1 + if more && fragmentSize%f.blockSize != 0 { + return buffer.VectorisedView{}, false, fmt.Errorf("fragment size=%d bytes is not a multiple of block size=%d on non-final fragment: %w", fragmentSize, f.blockSize, ErrInvalidArgs) + } + + if l := vv.Size(); l < int(fragmentSize) { + return buffer.VectorisedView{}, false, fmt.Errorf("got fragment size=%d bytes less than the expected fragment size=%d bytes (first=%d last=%d): %w", l, fragmentSize, first, last, ErrInvalidArgs) + } + vv.CapLength(int(fragmentSize)) + f.mu.Lock() r, ok := f.reassemblers[id] if ok && r.tooOld(f.timeout) { diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go index 72c0f53be..9eedd33c4 100644 --- a/pkg/tcpip/network/fragmentation/fragmentation_test.go +++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go @@ -15,6 +15,7 @@ package fragmentation import ( + "errors" "reflect" "testing" "time" @@ -33,7 +34,7 @@ func vv(size int, pieces ...string) buffer.VectorisedView { } type processInput struct { - id uint32 + id FragmentID first uint16 last uint16 more bool @@ -53,8 +54,8 @@ var processTestCases = []struct { { comment: "One ID", in: []processInput{ - {id: 0, first: 0, last: 1, more: true, vv: vv(2, "01")}, - {id: 0, first: 2, last: 3, more: false, vv: vv(2, "23")}, + {id: FragmentID{ID: 0}, first: 0, last: 1, more: true, vv: vv(2, "01")}, + {id: FragmentID{ID: 0}, first: 2, last: 
3, more: false, vv: vv(2, "23")}, }, out: []processOutput{ {vv: buffer.VectorisedView{}, done: false}, @@ -64,10 +65,10 @@ var processTestCases = []struct { { comment: "Two IDs", in: []processInput{ - {id: 0, first: 0, last: 1, more: true, vv: vv(2, "01")}, - {id: 1, first: 0, last: 1, more: true, vv: vv(2, "ab")}, - {id: 1, first: 2, last: 3, more: false, vv: vv(2, "cd")}, - {id: 0, first: 2, last: 3, more: false, vv: vv(2, "23")}, + {id: FragmentID{ID: 0}, first: 0, last: 1, more: true, vv: vv(2, "01")}, + {id: FragmentID{ID: 1}, first: 0, last: 1, more: true, vv: vv(2, "ab")}, + {id: FragmentID{ID: 1}, first: 2, last: 3, more: false, vv: vv(2, "cd")}, + {id: FragmentID{ID: 0}, first: 2, last: 3, more: false, vv: vv(2, "23")}, }, out: []processOutput{ {vv: buffer.VectorisedView{}, done: false}, @@ -81,7 +82,7 @@ var processTestCases = []struct { func TestFragmentationProcess(t *testing.T) { for _, c := range processTestCases { t.Run(c.comment, func(t *testing.T) { - f := NewFragmentation(1024, 512, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 1024, 512, DefaultReassembleTimeout) for i, in := range c.in { vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv) if err != nil { @@ -110,14 +111,14 @@ func TestFragmentationProcess(t *testing.T) { func TestReassemblingTimeout(t *testing.T) { timeout := time.Millisecond - f := NewFragmentation(1024, 512, timeout) + f := NewFragmentation(minBlockSize, 1024, 512, timeout) // Send first fragment with id = 0, first = 0, last = 0, and more = true. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) // Sleep more than the timeout. time.Sleep(2 * timeout) // Send another fragment that completes a packet. // However, no packet should be reassembled because the fragment arrived after the timeout. - _, done, err := f.Process(0, 1, 1, false, vv(1, "1")) + _, done, err := f.Process(FragmentID{}, 1, 1, false, vv(1, "1")) if err != nil { t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err) } @@ -127,35 +128,35 @@ func TestReassemblingTimeout(t *testing.T) { } func TestMemoryLimits(t *testing.T) { - f := NewFragmentation(3, 1, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 3, 1, DefaultReassembleTimeout) // Send first fragment with id = 0. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{ID: 0}, 0, 0, true, vv(1, "0")) // Send first fragment with id = 1. - f.Process(1, 0, 0, true, vv(1, "1")) + f.Process(FragmentID{ID: 1}, 0, 0, true, vv(1, "1")) // Send first fragment with id = 2. - f.Process(2, 0, 0, true, vv(1, "2")) + f.Process(FragmentID{ID: 2}, 0, 0, true, vv(1, "2")) // Send first fragment with id = 3. This should caused id = 0 and id = 1 to be // evicted. 
- f.Process(3, 0, 0, true, vv(1, "3")) + f.Process(FragmentID{ID: 3}, 0, 0, true, vv(1, "3")) - if _, ok := f.reassemblers[0]; ok { + if _, ok := f.reassemblers[FragmentID{ID: 0}]; ok { t.Errorf("Memory limits are not respected: id=0 has not been evicted.") } - if _, ok := f.reassemblers[1]; ok { + if _, ok := f.reassemblers[FragmentID{ID: 1}]; ok { t.Errorf("Memory limits are not respected: id=1 has not been evicted.") } - if _, ok := f.reassemblers[3]; !ok { + if _, ok := f.reassemblers[FragmentID{ID: 3}]; !ok { t.Errorf("Implementation of memory limits is wrong: id=3 is not present.") } } func TestMemoryLimitsIgnoresDuplicates(t *testing.T) { - f := NewFragmentation(1, 0, DefaultReassembleTimeout) + f := NewFragmentation(minBlockSize, 1, 0, DefaultReassembleTimeout) // Send first fragment with id = 0. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) // Send the same packet again. - f.Process(0, 0, 0, true, vv(1, "0")) + f.Process(FragmentID{}, 0, 0, true, vv(1, "0")) got := f.size want := 1 @@ -163,3 +164,97 @@ func TestMemoryLimitsIgnoresDuplicates(t *testing.T) { t.Errorf("Wrong size, duplicates are not handled correctly: got=%d, want=%d.", got, want) } } + +func TestErrors(t *testing.T) { + tests := []struct { + name string + blockSize uint16 + first uint16 + last uint16 + more bool + data string + err error + }{ + { + name: "exact block size without more", + blockSize: 2, + first: 2, + last: 3, + more: false, + data: "01", + }, + { + name: "exact block size with more", + blockSize: 2, + first: 2, + last: 3, + more: true, + data: "01", + }, + { + name: "exact block size with more and extra data", + blockSize: 2, + first: 2, + last: 3, + more: true, + data: "012", + }, + { + name: "exact block size with more and too little data", + blockSize: 2, + first: 2, + last: 3, + more: true, + data: "0", + err: ErrInvalidArgs, + }, + { + name: "not exact block size with more", + blockSize: 2, + first: 2, + last: 2, + more: true, + data: "0", + err: ErrInvalidArgs, + }, + { + name: "not exact block size without more", + blockSize: 2, + first: 2, + last: 2, + more: false, + data: "0", + }, + { + name: "first not a multiple of block size", + blockSize: 2, + first: 3, + last: 4, + more: true, + data: "01", + err: ErrInvalidArgs, + }, + { + name: "first more than last", + blockSize: 2, + first: 4, + last: 3, + more: true, + data: "01", + err: ErrInvalidArgs, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + f := NewFragmentation(test.blockSize, HighFragThreshold, LowFragThreshold, DefaultReassembleTimeout) + _, done, err := f.Process(FragmentID{}, test.first, test.last, test.more, vv(len(test.data), test.data)) + if !errors.Is(err, test.err) { + t.Errorf("got Process(_, %d, %d, %t, %q) = (_, _, %v), want = (_, _, %v)", test.first, test.last, test.more, test.data, err, test.err) + } + if done { + t.Errorf("got Process(_, %d, %d, %t, %q) = (_, true, _), want = (_, false, _)", test.first, test.last, test.more, test.data) + } + }) + } +} diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go index 0a83d81f2..50d30bbf0 100644 --- a/pkg/tcpip/network/fragmentation/reassembler.go +++ b/pkg/tcpip/network/fragmentation/reassembler.go @@ -32,7 +32,7 @@ type hole struct { type reassembler struct { reassemblerEntry - id uint32 + id FragmentID size int mu sync.Mutex holes []hole @@ -42,7 +42,7 @@ type reassembler struct { creationTime time.Time } -func newReassembler(id uint32) 
*reassembler { +func newReassembler(id FragmentID) *reassembler { r := &reassembler{ id: id, holes: make([]hole, 0, 16), diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go index 7eee0710d..dff7c9dcb 100644 --- a/pkg/tcpip/network/fragmentation/reassembler_test.go +++ b/pkg/tcpip/network/fragmentation/reassembler_test.go @@ -94,7 +94,7 @@ var holesTestCases = []struct { func TestUpdateHoles(t *testing.T) { for _, c := range holesTestCases { - r := newReassembler(0) + r := newReassembler(FragmentID{}) for _, i := range c.in { r.updateHoles(i.first, i.last, i.more) } diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go index 4c20301c6..e6768258a 100644 --- a/pkg/tcpip/network/ip_test.go +++ b/pkg/tcpip/network/ip_test.go @@ -96,7 +96,7 @@ func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buff // DeliverTransportPacket is called by network endpoints after parsing incoming // packets. This is used by the test object to verify that the results of the // parsing are expected. -func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt stack.PacketBuffer) { +func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) { t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress) t.dataCalls++ } @@ -104,7 +104,7 @@ func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.Trans // DeliverTransportControlPacket is called by network endpoints after parsing // incoming control (ICMP) packets. This is used by the test object to verify // that the results of the parsing are expected. -func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) { +func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { t.checkValues(trans, pkt.Data, remote, local) if typ != t.typ { t.t.Errorf("typ = %v, want %v", typ, t.typ) @@ -150,19 +150,19 @@ func (*testObject) Wait() {} // WritePacket is called by network endpoints after producing a packet and // writing it to the link endpoint. This is used by the test object to verify // that the produced packet is as expected. -func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { var prot tcpip.TransportProtocolNumber var srcAddr tcpip.Address var dstAddr tcpip.Address if t.v4 { - h := header.IPv4(pkt.Header.View()) + h := header.IPv4(pkt.NetworkHeader().View()) prot = tcpip.TransportProtocolNumber(h.Protocol()) srcAddr = h.SourceAddress() dstAddr = h.DestinationAddress() } else { - h := header.IPv6(pkt.Header.View()) + h := header.IPv6(pkt.NetworkHeader().View()) prot = tcpip.TransportProtocolNumber(h.NextHeader()) srcAddr = h.SourceAddress() dstAddr = h.DestinationAddress() @@ -172,14 +172,24 @@ func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.Ne } // WritePackets implements stack.LinkEndpoint.WritePackets. 
-func (t *testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { +func (*testObject) WritePackets(_ *stack.Route, _ *stack.GSO, pkt stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { panic("not implemented") } -func (t *testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { +func (*testObject) WriteRawPacket(_ buffer.VectorisedView) *tcpip.Error { return tcpip.ErrNotSupported } +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*testObject) ARPHardwareType() header.ARPHardwareType { + panic("not implemented") +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (*testObject) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { + panic("not implemented") +} + func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) { s := stack.New(stack.Options{ NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()}, @@ -233,8 +243,11 @@ func TestIPv4Send(t *testing.T) { payload[i] = uint8(i) } - // Allocate the header buffer. - hdr := buffer.NewPrependable(int(ep.MaxHeaderLength())) + // Setup the packet buffer. + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(ep.MaxHeaderLength()), + Data: payload.ToVectorisedView(), + }) // Issue the write. o.protocol = 123 @@ -246,10 +259,11 @@ func TestIPv4Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: payload.ToVectorisedView(), - }); err != nil { + if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{ + Protocol: 123, + TTL: 123, + TOS: stack.DefaultTOS, + }, pkt); err != nil { t.Fatalf("WritePacket failed: %v", err) } } @@ -289,9 +303,13 @@ func TestIPv4Receive(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - ep.HandlePacket(&r, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: view.ToVectorisedView(), }) + if _, _, ok := proto.Parse(pkt); !ok { + t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) + } + ep.HandlePacket(&r, pkt) if o.dataCalls != 1 { t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls) } @@ -378,10 +396,7 @@ func TestIPv4ReceiveControl(t *testing.T) { o.typ = c.expectedTyp o.extra = c.expectedExtra - vv := view[:len(view)-c.trunc].ToVectorisedView() - ep.HandlePacket(&r, stack.PacketBuffer{ - Data: vv, - }) + ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv4MinimumSize)) if want := c.expectedCount; o.controlCalls != want { t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want) } @@ -444,17 +459,25 @@ func TestIPv4FragmentationReceive(t *testing.T) { } // Send first segment. - ep.HandlePacket(&r, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: frag1.ToVectorisedView(), }) + if _, _, ok := proto.Parse(pkt); !ok { + t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) + } + ep.HandlePacket(&r, pkt) if o.dataCalls != 0 { t.Fatalf("Bad number of data calls: got %x, want 0", o.dataCalls) } // Send second segment. 
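// The second segment below goes through the same three steps as the first
// one above; this is the receive-side pattern the rewritten tests rely on,
// since HandlePacket now reads the IP header from pkt.NetworkHeader() rather
// than pulling it off the front of pkt.Data, so the protocol's Parse must run
// first. Sketched with an illustrative "raw" view standing in for whichever
// fragment is being injected:
//
//	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{Data: raw.ToVectorisedView()})
//	if _, _, ok := proto.Parse(pkt); !ok { /* malformed packet */ }
//	ep.HandlePacket(&r, pkt)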
- ep.HandlePacket(&r, stack.PacketBuffer{ + pkt = stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: frag2.ToVectorisedView(), }) + if _, _, ok := proto.Parse(pkt); !ok { + t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) + } + ep.HandlePacket(&r, pkt) if o.dataCalls != 1 { t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls) } @@ -474,8 +497,11 @@ func TestIPv6Send(t *testing.T) { payload[i] = uint8(i) } - // Allocate the header buffer. - hdr := buffer.NewPrependable(int(ep.MaxHeaderLength())) + // Setup the packet buffer. + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(ep.MaxHeaderLength()), + Data: payload.ToVectorisedView(), + }) // Issue the write. o.protocol = 123 @@ -487,10 +513,11 @@ func TestIPv6Send(t *testing.T) { if err != nil { t.Fatalf("could not find route: %v", err) } - if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{Protocol: 123, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: payload.ToVectorisedView(), - }); err != nil { + if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{ + Protocol: 123, + TTL: 123, + TOS: stack.DefaultTOS, + }, pkt); err != nil { t.Fatalf("WritePacket failed: %v", err) } } @@ -530,9 +557,13 @@ func TestIPv6Receive(t *testing.T) { t.Fatalf("could not find route: %v", err) } - ep.HandlePacket(&r, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: view.ToVectorisedView(), }) + if _, _, ok := proto.Parse(pkt); !ok { + t.Fatalf("failed to parse packet: %x", pkt.Data.ToView()) + } + ep.HandlePacket(&r, pkt) if o.dataCalls != 1 { t.Fatalf("Bad number of data calls: got %x, want 1", o.dataCalls) } @@ -644,12 +675,23 @@ func TestIPv6ReceiveControl(t *testing.T) { // Set ICMPv6 checksum. icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{})) - ep.HandlePacket(&r, stack.PacketBuffer{ - Data: view[:len(view)-c.trunc].ToVectorisedView(), - }) + ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv6MinimumSize)) if want := c.expectedCount; o.controlCalls != want { t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want) } }) } } + +// truncatedPacket returns a PacketBuffer based on a truncated view. If view, +// after truncation, is large enough to hold a network header, it makes part of +// view the packet's NetworkHeader and the rest its Data. Otherwise all of view +// becomes Data. +func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer { + v := view[:len(view)-trunc] + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: v.ToVectorisedView(), + }) + _, _ = pkt.NetworkHeader().Consume(netHdrLen) + return pkt +} diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD index 78420d6e6..d142b4ffa 100644 --- a/pkg/tcpip/network/ipv4/BUILD +++ b/pkg/tcpip/network/ipv4/BUILD @@ -34,6 +34,6 @@ go_test( "//pkg/tcpip/transport/tcp", "//pkg/tcpip/transport/udp", "//pkg/waiter", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go index 4cbefe5ab..067d770f3 100644 --- a/pkg/tcpip/network/ipv4/icmp.go +++ b/pkg/tcpip/network/ipv4/icmp.go @@ -24,7 +24,7 @@ import ( // the original packet that caused the ICMP one to be sent. This information is // used to find out which transport endpoint must be notified about the ICMP // packet. 
-func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) { +func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { h, ok := pkt.Data.PullUp(header.IPv4MinimumSize) if !ok { return @@ -56,9 +56,12 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack. e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt) } -func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) { +func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) { stats := r.Stats() received := stats.ICMP.V4PacketsReceived + // TODO(gvisor.dev/issue/170): ICMP packets don't have their + // TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a + // full explanation. v, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize) if !ok { received.Invalid.Increment() @@ -86,27 +89,55 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) { return } + // Make a copy of data before pkt gets sent to raw socket. + // DeliverTransportPacket will take ownership of pkt. + replyData := pkt.Data.Clone(nil) + replyData.TrimFront(header.ICMPv4MinimumSize) + // It's possible that a raw socket expects to receive this. h.SetChecksum(wantChecksum) - e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, stack.PacketBuffer{ - Data: pkt.Data.Clone(nil), - NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...), + e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt) + + remoteLinkAddr := r.RemoteLinkAddress + + // As per RFC 1122 section 3.2.1.3, when a host sends any datagram, the IP + // source address MUST be one of its own IP addresses (but not a broadcast + // or multicast address). + localAddr := r.LocalAddress + if r.IsInboundBroadcast() || header.IsV4MulticastAddress(r.LocalAddress) { + localAddr = "" + } + + r, err := r.Stack().FindRoute(e.NICID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */) + if err != nil { + // If we cannot find a route to the destination, silently drop the packet. + return + } + defer r.Release() + + // Use the remote link address from the incoming packet. + r.ResolveWith(remoteLinkAddr) + + // Prepare a reply packet. + icmpHdr := make(header.ICMPv4, header.ICMPv4MinimumSize) + copy(icmpHdr, h) + icmpHdr.SetType(header.ICMPv4EchoReply) + icmpHdr.SetChecksum(0) + icmpHdr.SetChecksum(^header.Checksum(icmpHdr, header.ChecksumVV(replyData, 0))) + dataVV := buffer.View(icmpHdr).ToVectorisedView() + dataVV.Append(replyData) + replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(r.MaxHeaderLength()), + Data: dataVV, }) - vv := pkt.Data.Clone(nil) - vv.TrimFront(header.ICMPv4MinimumSize) - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize) - pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) - copy(pkt, h) - pkt.SetType(header.ICMPv4EchoReply) - pkt.SetChecksum(0) - pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0))) + // Send out the reply packet. 
sent := stats.ICMP.V4PacketsSent - if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: vv, - TransportHeader: buffer.View(pkt), - }); err != nil { + if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{ + Protocol: header.ICMPv4ProtocolNumber, + TTL: r.DefaultTTL(), + TOS: stack.DefaultTOS, + }, replyPkt); err != nil { sent.Dropped.Increment() return } @@ -122,6 +153,9 @@ func (e *endpoint) handleICMP(r *stack.Route, pkt stack.PacketBuffer) { pkt.Data.TrimFront(header.ICMPv4MinimumSize) switch h.Code() { + case header.ICMPv4HostUnreachable: + e.handleControl(stack.ControlNoRoute, 0, pkt) + case header.ICMPv4PortUnreachable: e.handleControl(stack.ControlPortUnreachable, 0, pkt) diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go index 64046cbbf..3cd48ceb3 100644 --- a/pkg/tcpip/network/ipv4/ipv4.go +++ b/pkg/tcpip/network/ipv4/ipv4.go @@ -44,30 +44,32 @@ const ( // buckets is the number of identifier buckets. buckets = 2048 + + // The size of a fragment block, in bytes, as per RFC 791 section 3.1, + // page 14. + fragmentblockSize = 8 ) type endpoint struct { - nicID tcpip.NICID - id stack.NetworkEndpointID - prefixLen int - linkEP stack.LinkEndpoint - dispatcher stack.TransportDispatcher - fragmentation *fragmentation.Fragmentation - protocol *protocol - stack *stack.Stack + nicID tcpip.NICID + id stack.NetworkEndpointID + prefixLen int + linkEP stack.LinkEndpoint + dispatcher stack.TransportDispatcher + protocol *protocol + stack *stack.Stack } // NewEndpoint creates a new ipv4 endpoint. func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) { e := &endpoint{ - nicID: nicID, - id: stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address}, - prefixLen: addrWithPrefix.PrefixLen, - linkEP: linkEP, - dispatcher: dispatcher, - fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), - protocol: p, - stack: st, + nicID: nicID, + id: stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address}, + prefixLen: addrWithPrefix.PrefixLen, + linkEP: linkEP, + dispatcher: dispatcher, + protocol: p, + stack: st, } return e, nil @@ -124,14 +126,12 @@ func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { } // writePacketFragments calls e.linkEP.WritePacket with each packet fragment to -// write. It assumes that the IP header is entirely in pkt.Header but does not -// assume that only the IP header is in pkt.Header. It assumes that the input -// packet's stated length matches the length of the header+payload. mtu -// includes the IP header and options. This does not support the DontFragment -// IP flag. -func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt stack.PacketBuffer) *tcpip.Error { +// write. It assumes that the IP header is already present in pkt.NetworkHeader. +// pkt.TransportHeader may be set. mtu includes the IP header and options. This +// does not support the DontFragment IP flag. +func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt *stack.PacketBuffer) *tcpip.Error { // This packet is too big, it needs to be fragmented. 
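// As a worked example of the fragmentation arithmetic in this function (the
// figures are illustrative, not taken from this code): with a 1500-byte link
// MTU and a 20-byte IPv4 header, each fragment can carry at most 1480 bytes
// of payload, which is already a multiple of the 8-byte fragment block size.
// A 4000-byte payload therefore becomes three fragments carrying 1480, 1480
// and 1040 bytes at fragment offsets 0, 1480 and 2960, with MoreFragments set
// on all but the last fragment.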
- ip := header.IPv4(pkt.Header.View()) + ip := header.IPv4(pkt.NetworkHeader().View()) flags := ip.Flags() // Update mtu to take into account the header, which will exist in all @@ -145,91 +145,88 @@ func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, outerMTU := innerMTU + int(ip.HeaderLength()) offset := ip.FragmentOffset() - originalAvailableLength := pkt.Header.AvailableLength() + + // Keep the length reserved for link-layer, we need to create fragments with + // the same reserved length. + reservedForLink := pkt.AvailableHeaderBytes() + + // Destroy the packet, pull all payloads out for fragmentation. + transHeader, data := pkt.TransportHeader().View(), pkt.Data + + // Where possible, the first fragment that is sent has the same + // number of bytes reserved for header as the input packet. The link-layer + // endpoint may depend on this for looking at, eg, L4 headers. + transFitsFirst := len(transHeader) <= innerMTU + for i := 0; i < n; i++ { - // Where possible, the first fragment that is sent has the same - // pkt.Header.UsedLength() as the input packet. The link-layer - // endpoint may depend on this for looking at, eg, L4 headers. - h := ip - if i > 0 { - pkt.Header = buffer.NewPrependable(int(ip.HeaderLength()) + originalAvailableLength) - h = header.IPv4(pkt.Header.Prepend(int(ip.HeaderLength()))) - copy(h, ip[:ip.HeaderLength()]) + reserve := reservedForLink + int(ip.HeaderLength()) + if i == 0 && transFitsFirst { + // Reserve for transport header if it's going to be put in the first + // fragment. + reserve += len(transHeader) } + fragPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: reserve, + }) + fragPkt.NetworkProtocolNumber = header.IPv4ProtocolNumber + + // Copy data for the fragment. + avail := innerMTU + + if n := len(transHeader); n > 0 { + if n > avail { + n = avail + } + if i == 0 && transFitsFirst { + copy(fragPkt.TransportHeader().Push(n), transHeader) + } else { + fragPkt.Data.AppendView(transHeader[:n:n]) + } + transHeader = transHeader[n:] + avail -= n + } + + if avail > 0 { + n := data.Size() + if n > avail { + n = avail + } + data.ReadToVV(&fragPkt.Data, n) + avail -= n + } + + copied := uint16(innerMTU - avail) + + // Set lengths in header and calculate checksum. + h := header.IPv4(fragPkt.NetworkHeader().Push(len(ip))) + copy(h, ip) if i != n-1 { h.SetTotalLength(uint16(outerMTU)) h.SetFlagsFragmentOffset(flags|header.IPv4FlagMoreFragments, offset) } else { - h.SetTotalLength(uint16(h.HeaderLength()) + uint16(pkt.Data.Size())) + h.SetTotalLength(uint16(h.HeaderLength()) + copied) h.SetFlagsFragmentOffset(flags, offset) } h.SetChecksum(0) h.SetChecksum(^h.CalculateChecksum()) - offset += uint16(innerMTU) - if i > 0 { - newPayload := pkt.Data.Clone(nil) - newPayload.CapLength(innerMTU) - if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{ - Header: pkt.Header, - Data: newPayload, - NetworkHeader: buffer.View(h), - }); err != nil { - return err - } - r.Stats().IP.PacketsSent.Increment() - pkt.Data.TrimFront(newPayload.Size()) - continue - } - // Special handling for the first fragment because it comes - // from the header. - if outerMTU >= pkt.Header.UsedLength() { - // This fragment can fit all of pkt.Header and possibly - // some of pkt.Data, too. 
- newPayload := pkt.Data.Clone(nil) - newPayloadLength := outerMTU - pkt.Header.UsedLength() - newPayload.CapLength(newPayloadLength) - if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{ - Header: pkt.Header, - Data: newPayload, - NetworkHeader: buffer.View(h), - }); err != nil { - return err - } - r.Stats().IP.PacketsSent.Increment() - pkt.Data.TrimFront(newPayloadLength) - } else { - // The fragment is too small to fit all of pkt.Header. - startOfHdr := pkt.Header - startOfHdr.TrimBack(pkt.Header.UsedLength() - outerMTU) - emptyVV := buffer.NewVectorisedView(0, []buffer.View{}) - if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, stack.PacketBuffer{ - Header: startOfHdr, - Data: emptyVV, - NetworkHeader: buffer.View(h), - }); err != nil { - return err - } - r.Stats().IP.PacketsSent.Increment() - // Add the unused bytes of pkt.Header into the pkt.Data - // that remains to be sent. - restOfHdr := pkt.Header.View()[outerMTU:] - tmp := buffer.NewVectorisedView(len(restOfHdr), []buffer.View{buffer.NewViewFromBytes(restOfHdr)}) - tmp.Append(pkt.Data) - pkt.Data = tmp + offset += copied + + // Send out the fragment. + if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, fragPkt); err != nil { + return err } + r.Stats().IP.PacketsSent.Increment() } return nil } -func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv4 { - ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) - length := uint16(hdr.UsedLength() + payloadSize) - id := uint32(0) - if length > header.IPv4MaximumHeaderSize+8 { - // Packets of 68 bytes or less are required by RFC 791 to not be - // fragmented, so we only assign ids to larger packets. - id = atomic.AddUint32(&e.protocol.ids[hashRoute(r, params.Protocol, e.protocol.hashIV)%buckets], 1) - } +func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) { + ip := header.IPv4(pkt.NetworkHeader().Push(header.IPv4MinimumSize)) + length := uint16(pkt.Size()) + // RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic + // datagrams. Since the DF bit is never being set here, all datagrams + // are non-atomic and need an ID. + id := atomic.AddUint32(&e.protocol.ids[hashRoute(r, params.Protocol, e.protocol.hashIV)%buckets], 1) ip.Encode(&header.IPv4Fields{ IHL: header.IPv4MinimumSize, TotalLength: length, @@ -241,61 +238,46 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS DstAddr: r.RemoteAddress, }) ip.SetChecksum(^ip.CalculateChecksum()) - return ip + pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber } // WritePacket writes a packet to the given destination address and protocol. -func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error { - ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) - pkt.NetworkHeader = buffer.View(ip) +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { + e.addIPHeader(r, pkt, params) - nicName := e.stack.FindNICNameFromID(e.NICID()) // iptables filtering. All packets that reach here are locally // generated. + nicName := e.stack.FindNICNameFromID(e.NICID()) ipt := e.stack.IPTables() - if ok := ipt.Check(stack.Output, &pkt, gso, r, "", nicName); !ok { + if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok { // iptables is telling us to drop the packet. 
return nil } + // If the packet is manipulated as per NAT Output rules, handle packet + // based on destination address and do not send the packet to link layer. + // TODO(gvisor.dev/issue/170): We should do this for every packet, rather than + // only NATted packets, but removing this check short circuits broadcasts + // before they are sent out to other hosts. if pkt.NatDone { - // If the packet is manipulated as per NAT Ouput rules, handle packet - // based on destination address and do not send the packet to link layer. - netHeader := header.IPv4(pkt.NetworkHeader) + netHeader := header.IPv4(pkt.NetworkHeader().View()) ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()) if err == nil { - src := netHeader.SourceAddress() - dst := netHeader.DestinationAddress() - route := r.ReverseRoute(src, dst) - - views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) - views[0] = pkt.Header.View() - views = append(views, pkt.Data.Views()...) - packet := stack.PacketBuffer{ - Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views)} - ep.HandlePacket(&route, packet) + route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress()) + ep.HandlePacket(&route, pkt) return nil } } if r.Loop&stack.PacketLoop != 0 { - // The inbound path expects the network header to still be in - // the PacketBuffer's Data field. - views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) - views[0] = pkt.Header.View() - views = append(views, pkt.Data.Views()...) loopedR := r.MakeLoopedRoute() - - e.HandlePacket(&loopedR, stack.PacketBuffer{ - Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views), - }) - + e.HandlePacket(&loopedR, pkt) loopedR.Release() } if r.Loop&stack.PacketOut == 0 { return nil } - if pkt.Header.UsedLength()+pkt.Data.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) { + if pkt.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) { return e.writePacketFragments(r, gso, int(e.linkEP.MTU()), pkt) } if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil { @@ -315,8 +297,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe } for pkt := pkts.Front(); pkt != nil; { - ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) - pkt.NetworkHeader = buffer.View(ip) + e.addIPHeader(r, pkt, params) pkt = pkt.Next() } @@ -341,24 +322,17 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe continue } if _, ok := natPkts[pkt]; ok { - netHeader := header.IPv4(pkt.NetworkHeader) - ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()) - if err == nil { + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress()); err == nil { src := netHeader.SourceAddress() dst := netHeader.DestinationAddress() route := r.ReverseRoute(src, dst) - - views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) - views[0] = pkt.Header.View() - views = append(views, pkt.Data.Views()...)
- packet := stack.PacketBuffer{ - Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views)} - ep.HandlePacket(&route, packet) + ep.HandlePacket(&route, pkt) n++ continue } } - if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, *pkt); err != nil { + if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil { r.Stats().IP.PacketsSent.IncrementBy(uint64(n)) return n, err } @@ -370,7 +344,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. -func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error { +func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error { // The packet already has an IP header, but there are a few required // checks. h, ok := pkt.Data.PullUp(header.IPv4MinimumSize) @@ -396,13 +370,12 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf // Set the packet ID when zero. if ip.ID() == 0 { - id := uint32(0) - if pkt.Data.Size() > header.IPv4MaximumHeaderSize+8 { - // Packets of 68 bytes or less are required by RFC 791 to not be - // fragmented, so we only assign ids to larger packets. - id = atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1) + // RFC 6864 section 4.3 mandates uniqueness of ID values for + // non-atomic datagrams, so assign an ID to all such datagrams + // according to the definition given in RFC 6864 section 4. + if ip.Flags()&header.IPv4FlagDontFragment == 0 || ip.Flags()&header.IPv4FlagMoreFragments != 0 || ip.FragmentOffset() > 0 { + ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1))) } - ip.SetID(uint16(id)) } // Always set the checksum. @@ -418,43 +391,28 @@ func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuf r.Stats().IP.PacketsSent.Increment() - ip = ip[:ip.HeaderLength()] - pkt.Header = buffer.NewPrependableFromView(buffer.View(ip)) - pkt.Data.TrimFront(int(ip.HeaderLength())) return e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt) } // HandlePacket is called by the link layer when new ipv4 packets arrive for // this endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { - headerView, ok := pkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { - r.Stats().IP.MalformedPacketsReceived.Increment() - return - } - h := header.IPv4(headerView) - if !h.IsValid(pkt.Data.Size()) { +func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { + h := header.IPv4(pkt.NetworkHeader().View()) + if !h.IsValid(pkt.Data.Size() + pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size()) { r.Stats().IP.MalformedPacketsReceived.Increment() return } - pkt.NetworkHeader = headerView[:h.HeaderLength()] - - hlen := int(h.HeaderLength()) - tlen := int(h.TotalLength()) - pkt.Data.TrimFront(hlen) - pkt.Data.CapLength(tlen - hlen) // iptables filtering. All packets that reach here are intended for // this machine and will not be forwarded. ipt := e.stack.IPTables() - if ok := ipt.Check(stack.Input, &pkt, nil, nil, "", ""); !ok { + if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok { // iptables is telling us to drop the packet. 
return } - more := (h.Flags() & header.IPv4FlagMoreFragments) != 0 - if more || h.FragmentOffset() != 0 { - if pkt.Data.Size() == 0 { + if h.More() || h.FragmentOffset() != 0 { + if pkt.Data.Size()+pkt.TransportHeader().View().Size() == 0 { // Drop the packet as it's marked as a fragment but has // no payload. r.Stats().IP.MalformedPacketsReceived.Increment() @@ -473,7 +431,20 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { } var ready bool var err error - pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, more, pkt.Data) + pkt.Data, ready, err = e.protocol.fragmentation.Process( + // As per RFC 791 section 2.3, the identification value is unique + // for a source-destination pair and protocol. + fragmentation.FragmentID{ + Source: h.SourceAddress(), + Destination: h.DestinationAddress(), + ID: uint32(h.ID()), + Protocol: h.Protocol(), + }, + h.FragmentOffset(), + last, + h.More(), + pkt.Data, + ) if err != nil { r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() @@ -485,7 +456,6 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { } p := h.TransportProtocol() if p == header.ICMPv4ProtocolNumber { - headerView.CapLength(hlen) e.handleICMP(r, pkt) return } @@ -504,6 +474,8 @@ type protocol struct { // uint8 portion of it is meaningful and it must be accessed // atomically. defaultTTL uint32 + + fragmentation *fragmentation.Fragmentation } // Number returns the ipv4 protocol number. @@ -565,6 +537,39 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} +// Parse implements stack.TransportProtocol.Parse. +func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { + hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize) + if !ok { + return 0, false, false + } + ipHdr := header.IPv4(hdr) + + // Header may have options, determine the true header length. + headerLen := int(ipHdr.HeaderLength()) + if headerLen < header.IPv4MinimumSize { + // TODO(gvisor.dev/issue/2404): Per RFC 791, IHL needs to be at least 5 in + // order for the packet to be valid. Figure out if we want to reject this + // case. + headerLen = header.IPv4MinimumSize + } + hdr, ok = pkt.NetworkHeader().Consume(headerLen) + if !ok { + return 0, false, false + } + ipHdr = header.IPv4(hdr) + + // If this is a fragment, don't bother parsing the transport header. + parseTransportHeader := true + if ipHdr.More() || ipHdr.FragmentOffset() != 0 { + parseTransportHeader = false + } + + pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber + pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr)) + return ipHdr.TransportProtocol(), parseTransportHeader, true +} + // calculateMTU calculates the network-layer payload MTU based on the link-layer // payload mtu. 
func calculateMTU(mtu uint32) uint32 { @@ -596,5 +601,10 @@ func NewProtocol() stack.NetworkProtocol { } hashIV := r[buckets] - return &protocol{ids: ids, hashIV: hashIV, defaultTTL: DefaultTTL} + return &protocol{ + ids: ids, + hashIV: hashIV, + defaultTTL: DefaultTTL, + fragmentation: fragmentation.NewFragmentation(fragmentblockSize, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), + } } diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go index 36035c820..afd3ac06d 100644 --- a/pkg/tcpip/network/ipv4/ipv4_test.go +++ b/pkg/tcpip/network/ipv4/ipv4_test.go @@ -17,6 +17,7 @@ package ipv4_test import ( "bytes" "encoding/hex" + "fmt" "math/rand" "testing" @@ -91,15 +92,11 @@ func TestExcludeBroadcast(t *testing.T) { }) } -// makeHdrAndPayload generates a randomize packet. hdrLength indicates how much +// makeRandPkt generates a randomize packet. hdrLength indicates how much // data should already be in the header before WritePacket. extraLength // indicates how much extra space should be in the header. The payload is made // from many Views of the sizes listed in viewSizes. -func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer.Prependable, buffer.VectorisedView) { - hdr := buffer.NewPrependable(hdrLength + extraLength) - hdr.Prepend(hdrLength) - rand.Read(hdr.View()) - +func makeRandPkt(hdrLength int, extraLength int, viewSizes []int) *stack.PacketBuffer { var views []buffer.View totalLength := 0 for _, s := range viewSizes { @@ -108,18 +105,26 @@ func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer. views = append(views, newView) totalLength += s } - payload := buffer.NewVectorisedView(totalLength, views) - return hdr, payload + + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: hdrLength + extraLength, + Data: buffer.NewVectorisedView(totalLength, views), + }) + pkt.NetworkProtocolNumber = header.IPv4ProtocolNumber + if _, err := rand.Read(pkt.TransportHeader().Push(hdrLength)); err != nil { + panic(fmt.Sprintf("rand.Read: %s", err)) + } + return pkt } // comparePayloads compared the contents of all the packets against the contents // of the source packet. -func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketInfo stack.PacketBuffer, mtu uint32) { +func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) { t.Helper() // Make a complete array of the sourcePacketInfo packet. - source := header.IPv4(packets[0].Header.View()[:header.IPv4MinimumSize]) - source = append(source, sourcePacketInfo.Header.View()...) - source = append(source, sourcePacketInfo.Data.ToView()...) + source := header.IPv4(packets[0].NetworkHeader().View()[:header.IPv4MinimumSize]) + vv := buffer.NewVectorisedView(sourcePacketInfo.Size(), sourcePacketInfo.Views()) + source = append(source, vv.ToView()...) // Make a copy of the IP header, which will be modified in some fields to make // an expected header. @@ -132,8 +137,7 @@ func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketIn var reassembledPayload []byte for i, packet := range packets { // Confirm that the packet is valid. 
- allBytes := packet.Header.View().ToVectorisedView() - allBytes.Append(packet.Data) + allBytes := buffer.NewVectorisedView(packet.Size(), packet.Views()) ip := header.IPv4(allBytes.ToView()) if !ip.IsValid(len(ip)) { t.Errorf("IP packet is invalid:\n%s", hex.Dump(ip)) @@ -144,12 +148,22 @@ func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketIn if got, want := len(ip), int(mtu); got > want { t.Errorf("fragment is too large, got %d want %d", got, want) } - if got, want := packet.Header.UsedLength(), sourcePacketInfo.Header.UsedLength()+header.IPv4MinimumSize; i == 0 && want < int(mtu) && got != want { - t.Errorf("first fragment hdr parts should have unmodified length if possible: got %d, want %d", got, want) + if i == 0 { + got := packet.NetworkHeader().View().Size() + packet.TransportHeader().View().Size() + // sourcePacketInfo does not have NetworkHeader added, simulate one. + want := header.IPv4MinimumSize + sourcePacketInfo.TransportHeader().View().Size() + // Check that it kept the transport header in packet.TransportHeader if + // it fits in the first fragment. + if want < int(mtu) && got != want { + t.Errorf("first fragment hdr parts should have unmodified length if possible: got %d, want %d", got, want) + } } - if got, want := packet.Header.AvailableLength(), sourcePacketInfo.Header.AvailableLength()-header.IPv4MinimumSize; got != want { + if got, want := packet.AvailableHeaderBytes(), sourcePacketInfo.AvailableHeaderBytes()-header.IPv4MinimumSize; got != want { t.Errorf("fragment #%d should have the same available space for prepending as source: got %d, want %d", i, got, want) } + if got, want := packet.NetworkProtocolNumber, sourcePacketInfo.NetworkProtocolNumber; got != want { + t.Errorf("fragment #%d has wrong network protocol number: got %d, want %d", i, got, want) + } if i < len(packets)-1 { sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, offset) } else { @@ -174,7 +188,7 @@ func compareFragments(t *testing.T, packets []stack.PacketBuffer, sourcePacketIn type errorChannel struct { *channel.Endpoint - Ch chan stack.PacketBuffer + Ch chan *stack.PacketBuffer packetCollectorErrors []*tcpip.Error } @@ -184,7 +198,7 @@ type errorChannel struct { func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel { return &errorChannel{ Endpoint: channel.New(size, mtu, linkAddr), - Ch: make(chan stack.PacketBuffer, size), + Ch: make(chan *stack.PacketBuffer, size), packetCollectorErrors: packetCollectorErrors, } } @@ -203,7 +217,7 @@ func (e *errorChannel) Drain() int { } // WritePacket stores outbound packets into the channel. -func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) *tcpip.Error { +func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error { select { case e.Ch <- pkt: default: @@ -281,22 +295,19 @@ func TestFragmentation(t *testing.T) { for _, ft := range fragTests { t.Run(ft.description, func(t *testing.T) { - hdr, payload := makeHdrAndPayload(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes) - source := stack.PacketBuffer{ - Header: hdr, - // Save the source payload because WritePacket will modify it. 
- Data: payload.Clone(nil), - } + pkt := makeRandPkt(ft.hdrLength, ft.extraLength, ft.payloadViewsSizes) + source := pkt.Clone() c := buildContext(t, nil, ft.mtu) - err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: payload, - }) + err := c.Route.WritePacket(ft.gso, stack.NetworkHeaderParams{ + Protocol: tcp.ProtocolNumber, + TTL: 42, + TOS: stack.DefaultTOS, + }, pkt) if err != nil { t.Errorf("err got %v, want %v", err, nil) } - var results []stack.PacketBuffer + var results []*stack.PacketBuffer L: for { select { @@ -336,12 +347,13 @@ func TestFragmentationErrors(t *testing.T) { for _, ft := range fragTests { t.Run(ft.description, func(t *testing.T) { - hdr, payload := makeHdrAndPayload(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes) + pkt := makeRandPkt(ft.hdrLength, header.IPv4MinimumSize, ft.payloadViewsSizes) c := buildContext(t, ft.packetCollectorErrors, ft.mtu) - err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{Protocol: tcp.ProtocolNumber, TTL: 42, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: payload, - }) + err := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{ + Protocol: tcp.ProtocolNumber, + TTL: 42, + TOS: stack.DefaultTOS, + }, pkt) for i := 0; i < len(ft.packetCollectorErrors)-1; i++ { if got, want := ft.packetCollectorErrors[i], (*tcpip.Error)(nil); got != want { t.Errorf("ft.packetCollectorErrors[%d] got %v, want %v", i, got, want) @@ -460,9 +472,9 @@ func TestInvalidFragments(t *testing.T) { s.CreateNIC(nicID, sniffer.New(ep)) for _, pkt := range tc.packets { - ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, stack.PacketBuffer{ + ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}), - }) + })) } if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), tc.wantMalformedIPPackets; got != want { @@ -478,12 +490,16 @@ func TestInvalidFragments(t *testing.T) { // TestReceiveFragments feeds fragments in through the incoming packet path to // test reassembly func TestReceiveFragments(t *testing.T) { - const addr1 = "\x0c\xa8\x00\x01" // 192.168.0.1 - const addr2 = "\x0c\xa8\x00\x02" // 192.168.0.2 - const nicID = 1 + const ( + nicID = 1 + + addr1 = "\x0c\xa8\x00\x01" // 192.168.0.1 + addr2 = "\x0c\xa8\x00\x02" // 192.168.0.2 + addr3 = "\x0c\xa8\x00\x03" // 192.168.0.3 + ) // Build and return a UDP header containing payload. 
- udpGen := func(payloadLen int, multiplier uint8) buffer.View { + udpGen := func(payloadLen int, multiplier uint8, src, dst tcpip.Address) buffer.View { payload := buffer.NewView(payloadLen) for i := 0; i < len(payload); i++ { payload[i] = uint8(i) * multiplier @@ -499,20 +515,29 @@ func TestReceiveFragments(t *testing.T) { Length: uint16(udpLength), }) copy(u.Payload(), payload) - sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength)) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, src, dst, uint16(udpLength)) sum = header.Checksum(payload, sum) u.SetChecksum(^u.CalculateChecksum(sum)) return hdr.View() } // UDP header plus a payload of 0..256 - ipv4Payload1 := udpGen(256, 1) - udpPayload1 := ipv4Payload1[header.UDPMinimumSize:] + ipv4Payload1Addr1ToAddr2 := udpGen(256, 1, addr1, addr2) + udpPayload1Addr1ToAddr2 := ipv4Payload1Addr1ToAddr2[header.UDPMinimumSize:] + ipv4Payload1Addr3ToAddr2 := udpGen(256, 1, addr3, addr2) + udpPayload1Addr3ToAddr2 := ipv4Payload1Addr3ToAddr2[header.UDPMinimumSize:] // UDP header plus a payload of 0..256 in increments of 2. - ipv4Payload2 := udpGen(128, 2) - udpPayload2 := ipv4Payload2[header.UDPMinimumSize:] + ipv4Payload2Addr1ToAddr2 := udpGen(128, 2, addr1, addr2) + udpPayload2Addr1ToAddr2 := ipv4Payload2Addr1ToAddr2[header.UDPMinimumSize:] + // UDP header plus a payload of 0..256 in increments of 3. + // Used to test cases where the fragment blocks are not a multiple of + // the fragment block size of 8 (RFC 791 section 3.1 page 14). + ipv4Payload3Addr1ToAddr2 := udpGen(127, 3, addr1, addr2) + udpPayload3Addr1ToAddr2 := ipv4Payload3Addr1ToAddr2[header.UDPMinimumSize:] type fragmentData struct { + srcAddr tcpip.Address + dstAddr tcpip.Address id uint16 flags uint8 fragmentOffset uint16 @@ -528,22 +553,40 @@ func TestReceiveFragments(t *testing.T) { name: "No fragmentation", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 0, + payload: ipv4Payload1Addr1ToAddr2, + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "No fragmentation with size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 0, - payload: ipv4Payload1, + payload: ipv4Payload3Addr1ToAddr2, }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, }, { name: "More fragments without payload", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1, + payload: ipv4Payload1Addr1ToAddr2, }, }, expectedPayloads: nil, @@ -552,10 +595,12 @@ func TestReceiveFragments(t *testing.T) { name: "Non-zero fragment offset without payload", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 8, - payload: ipv4Payload1, + payload: ipv4Payload1Addr1ToAddr2, }, }, expectedPayloads: nil, @@ -564,34 +609,86 @@ func TestReceiveFragments(t *testing.T) { name: "Two fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload1Addr1ToAddr2[64:], + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "Two fragments 
with last fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload3Addr1ToAddr2[:64], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload3Addr1ToAddr2[64:], + }, + }, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, + }, + { + name: "Two fragments with first fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload3Addr1ToAddr2[:63], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 63, + payload: ipv4Payload3Addr1ToAddr2[63:], }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: nil, }, { name: "Second fragment has MoreFlags set", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload1Addr1ToAddr2[64:], }, }, expectedPayloads: nil, @@ -600,16 +697,20 @@ func TestReceiveFragments(t *testing.T) { name: "Two fragments with different IDs", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 2, flags: 0, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload1Addr1ToAddr2[64:], }, }, expectedPayloads: nil, @@ -618,31 +719,91 @@ func TestReceiveFragments(t *testing.T) { name: "Two interleaved fragmented packets", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload1[:64], + payload: ipv4Payload1Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 2, flags: header.IPv4FlagMoreFragments, fragmentOffset: 0, - payload: ipv4Payload2[:64], + payload: ipv4Payload2Addr1ToAddr2[:64], }, { + srcAddr: addr1, + dstAddr: addr2, id: 1, flags: 0, fragmentOffset: 64, - payload: ipv4Payload1[64:], + payload: ipv4Payload1Addr1ToAddr2[64:], }, { + srcAddr: addr1, + dstAddr: addr2, id: 2, flags: 0, fragmentOffset: 64, - payload: ipv4Payload2[64:], + payload: ipv4Payload2Addr1ToAddr2[64:], + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, udpPayload2Addr1ToAddr2}, + }, + { + name: "Two interleaved fragmented packets from different sources but with same ID", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1Addr1ToAddr2[:64], + }, + { + srcAddr: addr3, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1Addr3ToAddr2[:32], + }, + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 64, + payload: ipv4Payload1Addr1ToAddr2[64:], + }, + { + srcAddr: addr3, + dstAddr: addr2, + id: 1, + flags: 0, + fragmentOffset: 32, + payload: ipv4Payload1Addr3ToAddr2[32:], }, }, - expectedPayloads: [][]byte{udpPayload1, udpPayload2}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, 
udpPayload1Addr3ToAddr2}, + }, + { + name: "Fragment without followup", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + id: 1, + flags: header.IPv4FlagMoreFragments, + fragmentOffset: 0, + payload: ipv4Payload1Addr1ToAddr2[:64], + }, + }, + expectedPayloads: nil, }, } @@ -691,16 +852,16 @@ func TestReceiveFragments(t *testing.T) { FragmentOffset: frag.fragmentOffset, TTL: 64, Protocol: uint8(header.UDPProtocolNumber), - SrcAddr: addr1, - DstAddr: addr2, + SrcAddr: frag.srcAddr, + DstAddr: frag.dstAddr, }) vv := hdr.View().ToVectorisedView() vv.AppendView(frag.payload) - e.InjectInbound(header.IPv4ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: vv, - }) + })) } if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want { diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD index 3f71fc520..bcc64994e 100644 --- a/pkg/tcpip/network/ipv6/BUILD +++ b/pkg/tcpip/network/ipv6/BUILD @@ -14,7 +14,6 @@ go_library( "//pkg/tcpip/buffer", "//pkg/tcpip/header", "//pkg/tcpip/network/fragmentation", - "//pkg/tcpip/network/hash", "//pkg/tcpip/stack", ], ) @@ -39,6 +38,6 @@ go_test( "//pkg/tcpip/transport/icmp", "//pkg/tcpip/transport/udp", "//pkg/waiter", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", ], ) diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go index bdf3a0d25..39ae19295 100644 --- a/pkg/tcpip/network/ipv6/icmp.go +++ b/pkg/tcpip/network/ipv6/icmp.go @@ -27,7 +27,7 @@ import ( // the original packet that caused the ICMP one to be sent. This information is // used to find out which transport endpoint must be notified about the ICMP // packet. -func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) { +func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { h, ok := pkt.Data.PullUp(header.IPv6MinimumSize) if !ok { return @@ -70,17 +70,20 @@ func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt stack. e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt) } -func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.PacketBuffer, hasFragmentHeader bool) { +func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) { stats := r.Stats().ICMP sent := stats.V6PacketsSent received := stats.V6PacketsReceived + // TODO(gvisor.dev/issue/170): ICMP packets don't have their + // TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a + // full explanation. v, ok := pkt.Data.PullUp(header.ICMPv6HeaderSize) if !ok { received.Invalid.Increment() return } h := header.ICMPv6(v) - iph := header.IPv6(netHeader) + iph := header.IPv6(pkt.NetworkHeader().View()) // Validate ICMPv6 checksum before processing the packet. 
// @@ -125,6 +128,8 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P } pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize) switch header.ICMPv6(hdr).Code() { + case header.ICMPv6NetworkUnreachable: + e.handleControl(stack.ControlNetworkUnreachable, 0, pkt) case header.ICMPv6PortUnreachable: e.handleControl(stack.ControlPortUnreachable, 0, pkt) } @@ -271,8 +276,10 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P optsSerializer := header.NDPOptionsSerializer{ header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress), } - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length())) - packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()), + }) + packet := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize)) packet.SetType(header.ICMPv6NeighborAdvert) na := header.NDPNeighborAdvert(packet.NDPPayload()) na.SetSolicitedFlag(solicited) @@ -288,9 +295,7 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P // // The IP Hop Limit field has a value of 255, i.e., the packet // could not possibly have been forwarded by a router. - if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - }); err != nil { + if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, pkt); err != nil { sent.Dropped.Increment() return } @@ -379,21 +384,40 @@ func (e *endpoint) handleICMP(r *stack.Route, netHeader buffer.View, pkt stack.P case header.ICMPv6EchoRequest: received.EchoRequest.Increment() - icmpHdr, ok := pkt.Data.PullUp(header.ICMPv6EchoMinimumSize) + icmpHdr, ok := pkt.TransportHeader().Consume(header.ICMPv6EchoMinimumSize) if !ok { received.Invalid.Increment() return } - pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize) - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize) - packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize)) + + remoteLinkAddr := r.RemoteLinkAddress + + // As per RFC 4291 section 2.7, multicast addresses must not be used as + // source addresses in IPv6 packets. + localAddr := r.LocalAddress + if header.IsV6MulticastAddress(r.LocalAddress) { + localAddr = "" + } + + r, err := r.Stack().FindRoute(e.NICID(), localAddr, r.RemoteAddress, ProtocolNumber, false /* multicastLoop */) + if err != nil { + // If we cannot find a route to the destination, silently drop the packet. + return + } + defer r.Release() + + // Use the link address from the source of the original packet. 
+ r.ResolveWith(remoteLinkAddr) + + replyPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize, + Data: pkt.Data, + }) + packet := header.ICMPv6(replyPkt.TransportHeader().Push(header.ICMPv6EchoMinimumSize)) copy(packet, icmpHdr) packet.SetType(header.ICMPv6EchoReply) packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data)) - if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: pkt.Data, - }); err != nil { + if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, replyPkt); err != nil { sent.Dropped.Increment() return } @@ -491,8 +515,6 @@ const ( icmpV6LengthOffset = 25 ) -var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) - var _ stack.LinkAddressResolver = (*protocol)(nil) // LinkAddressProtocol implements stack.LinkAddressResolver. @@ -501,7 +523,7 @@ func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { } // LinkAddressRequest implements stack.LinkAddressResolver. -func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error { +func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP stack.LinkEndpoint) *tcpip.Error { snaddr := header.SolicitedNodeAddr(addr) // TODO(b/148672031): Use stack.FindRoute instead of manually creating the @@ -510,19 +532,25 @@ func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack. r := &stack.Route{ LocalAddress: localAddr, RemoteAddress: snaddr, - RemoteLinkAddress: header.EthernetAddressFromMulticastIPv6Address(snaddr), + RemoteLinkAddress: remoteLinkAddr, + } + if len(r.RemoteLinkAddress) == 0 { + r.RemoteLinkAddress = header.EthernetAddressFromMulticastIPv6Address(snaddr) } - hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize)) - pkt.SetType(header.ICMPv6NeighborSolicit) - copy(pkt[icmpV6OptOffset-len(addr):], addr) - pkt[icmpV6OptOffset] = ndpOptSrcLinkAddr - pkt[icmpV6LengthOffset] = 1 - copy(pkt[icmpV6LengthOffset+1:], linkEP.LinkAddress()) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) - - length := uint16(hdr.UsedLength()) - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize, + }) + icmpHdr := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6NeighborAdvertSize)) + icmpHdr.SetType(header.ICMPv6NeighborSolicit) + copy(icmpHdr[icmpV6OptOffset-len(addr):], addr) + icmpHdr[icmpV6OptOffset] = ndpOptSrcLinkAddr + icmpHdr[icmpV6LengthOffset] = 1 + copy(icmpHdr[icmpV6LengthOffset+1:], linkEP.LinkAddress()) + icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + + length := uint16(pkt.Size()) + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ PayloadLength: length, NextHeader: uint8(header.ICMPv6ProtocolNumber), @@ -532,9 +560,7 @@ func (*protocol) LinkAddressRequest(addr, localAddr 
tcpip.Address, linkEP stack. }) // TODO(stijlist): count this in ICMP stats. - return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, stack.PacketBuffer{ - Header: hdr, - }) + return linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt) } // ResolveStaticAddress implements stack.LinkAddressResolver. diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go index d412ff688..2a2f7de01 100644 --- a/pkg/tcpip/network/ipv6/icmp_test.go +++ b/pkg/tcpip/network/ipv6/icmp_test.go @@ -34,6 +34,9 @@ const ( linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f") + + defaultChannelSize = 1 + defaultMTU = 65536 ) var ( @@ -57,7 +60,7 @@ func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress { return "" } -func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, stack.PacketBuffer) *tcpip.Error { +func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error { return nil } @@ -67,7 +70,7 @@ type stubDispatcher struct { stack.TransportDispatcher } -func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, stack.PacketBuffer) { +func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) { } type stubLinkAddressCache struct { @@ -179,36 +182,33 @@ func TestICMPCounts(t *testing.T) { }, } - handleIPv6Payload := func(hdr buffer.Prependable) { - payloadLength := hdr.UsedLength() - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + handleIPv6Payload := func(icmp header.ICMPv6) { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.IPv6MinimumSize, + Data: buffer.View(icmp).ToVectorisedView(), + }) + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), + PayloadLength: uint16(len(icmp)), NextHeader: uint8(header.ICMPv6ProtocolNumber), HopLimit: header.NDPHopLimit, SrcAddr: r.LocalAddress, DstAddr: r.RemoteAddress, }) - ep.HandlePacket(&r, stack.PacketBuffer{ - Data: hdr.View().ToVectorisedView(), - }) + ep.HandlePacket(&r, pkt) } for _, typ := range types { - extraDataLen := len(typ.extraData) - hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen) - extraData := buffer.View(hdr.Prepend(extraDataLen)) - copy(extraData, typ.extraData) - pkt := header.ICMPv6(hdr.Prepend(typ.size)) - pkt.SetType(typ.typ) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView())) - - handleIPv6Payload(hdr) + icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) + copy(icmp[typ.size:], typ.extraData) + icmp.SetType(typ.typ) + icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView())) + handleIPv6Payload(icmp) } // Construct an empty ICMP packet so that // Stats().ICMP.ICMPv6ReceivedPacketStats.Invalid is incremented. 
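
The handleIPv6Payload rewrite above is the pattern this change applies throughout the ICMPv6 and NDP tests: the ICMP bytes become pkt.Data and the IPv6 header is pushed into header space reserved up front. A minimal sketch of that construction, using the PacketBufferOptions/Push API introduced in this diff (the helper name and its placement are hypothetical):

    package ipv6_test

    import (
        "gvisor.dev/gvisor/pkg/tcpip"
        "gvisor.dev/gvisor/pkg/tcpip/buffer"
        "gvisor.dev/gvisor/pkg/tcpip/header"
        "gvisor.dev/gvisor/pkg/tcpip/stack"
    )

    // newInboundICMPv6Packet wraps raw ICMPv6 bytes the way the updated tests
    // do: the payload goes in Data, and the IPv6 header is pushed into the
    // reserved header space rather than prepended to the payload view.
    func newInboundICMPv6Packet(icmp header.ICMPv6, src, dst tcpip.Address) *stack.PacketBuffer {
        pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
            ReserveHeaderBytes: header.IPv6MinimumSize,
            Data:               buffer.View(icmp).ToVectorisedView(),
        })
        ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize))
        ip.Encode(&header.IPv6Fields{
            PayloadLength: uint16(len(icmp)),
            NextHeader:    uint8(header.ICMPv6ProtocolNumber),
            HopLimit:      header.NDPHopLimit,
            SrcAddr:       src,
            DstAddr:       dst,
        })
        return pkt
    }
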
- handleIPv6Payload(buffer.NewPrependable(header.IPv6MinimumSize)) + handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize))) icmpv6Stats := s.Stats().ICMP.V6PacketsReceived visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) { @@ -261,8 +261,7 @@ func newTestContext(t *testing.T) *testContext { }), } - const defaultMTU = 65536 - c.linkEP0 = channel.New(256, defaultMTU, linkAddr0) + c.linkEP0 = channel.New(defaultChannelSize, defaultMTU, linkAddr0) wrappedEP0 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP0}) if testing.Verbose() { @@ -275,7 +274,7 @@ func newTestContext(t *testing.T) *testContext { t.Fatalf("AddAddress lladdr0: %v", err) } - c.linkEP1 = channel.New(256, defaultMTU, linkAddr1) + c.linkEP1 = channel.New(defaultChannelSize, defaultMTU, linkAddr1) wrappedEP1 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP1}) if err := c.s1.CreateNIC(1, wrappedEP1); err != nil { t.Fatalf("CreateNIC failed: %v", err) @@ -325,12 +324,10 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header. pi, _ := args.src.ReadContext(context.Background()) { - views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()} - size := pi.Pkt.Header.UsedLength() + pi.Pkt.Data.Size() - vv := buffer.NewVectorisedView(size, views) - args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), stack.PacketBuffer{ - Data: vv, + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.NewVectorisedView(pi.Pkt.Size(), pi.Pkt.Views()), }) + args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), pkt) } if pi.Proto != ProtocolNumber { @@ -342,7 +339,9 @@ func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header. t.Errorf("got remote link address = %s, want = %s", pi.Route.RemoteLinkAddress, args.remoteLinkAddr) } - ipv6 := header.IPv6(pi.Pkt.Header.View()) + // Pull the full payload since network header. Needed for header.IPv6 to + // extract its payload. 
+ ipv6 := header.IPv6(stack.PayloadSince(pi.Pkt.NetworkHeader())) transProto := tcpip.TransportProtocolNumber(ipv6.NextHeader()) if transProto != header.ICMPv6ProtocolNumber { t.Errorf("unexpected transport protocol number %d", transProto) @@ -546,26 +545,24 @@ func TestICMPChecksumValidationSimple(t *testing.T) { } handleIPv6Payload := func(checksum bool) { - extraDataLen := len(typ.extraData) - hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen) - extraData := buffer.View(hdr.Prepend(extraDataLen)) - copy(extraData, typ.extraData) - pkt := header.ICMPv6(hdr.Prepend(typ.size)) - pkt.SetType(typ.typ) + icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) + copy(icmp[typ.size:], typ.extraData) + icmp.SetType(typ.typ) if checksum { - pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, extraData.ToVectorisedView())) + icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView())) } - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(typ.size + extraDataLen), + PayloadLength: uint16(len(icmp)), NextHeader: uint8(header.ICMPv6ProtocolNumber), HopLimit: header.NDPHopLimit, SrcAddr: lladdr1, DstAddr: lladdr0, }) - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ - Data: hdr.View().ToVectorisedView(), + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}), }) + e.InjectInbound(ProtocolNumber, pkt) } stats := s.Stats().ICMP.V6PacketsReceived @@ -724,12 +721,12 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) { handleIPv6Payload := func(typ header.ICMPv6Type, size, payloadSize int, payloadFn func(buffer.View), checksum bool) { icmpSize := size + payloadSize hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize) - pkt := header.ICMPv6(hdr.Prepend(icmpSize)) - pkt.SetType(typ) - payloadFn(pkt.Payload()) + icmpHdr := header.ICMPv6(hdr.Prepend(icmpSize)) + icmpHdr.SetType(typ) + payloadFn(icmpHdr.Payload()) if checksum { - pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{})) + icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, lladdr1, lladdr0, buffer.VectorisedView{})) } ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) @@ -740,9 +737,10 @@ func TestICMPChecksumValidationWithPayload(t *testing.T) { SrcAddr: lladdr1, DstAddr: lladdr0, }) - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), }) + e.InjectInbound(ProtocolNumber, pkt) } stats := s.Stats().ICMP.V6PacketsReceived @@ -900,14 +898,14 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { handleIPv6Payload := func(typ header.ICMPv6Type, size, payloadSize int, payloadFn func(buffer.View), checksum bool) { hdr := buffer.NewPrependable(header.IPv6MinimumSize + size) - pkt := header.ICMPv6(hdr.Prepend(size)) - pkt.SetType(typ) + icmpHdr := header.ICMPv6(hdr.Prepend(size)) + icmpHdr.SetType(typ) payload := buffer.NewView(payloadSize) payloadFn(payload) if checksum { - pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, payload.ToVectorisedView())) + icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, lladdr1, lladdr0, payload.ToVectorisedView())) } ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) @@ -918,9 +916,10 @@ func 
TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { SrcAddr: lladdr1, DstAddr: lladdr0, }) - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}), }) + e.InjectInbound(ProtocolNumber, pkt) } stats := s.Stats().ICMP.V6PacketsReceived @@ -958,3 +957,47 @@ func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { }) } } + +func TestLinkAddressRequest(t *testing.T) { + snaddr := header.SolicitedNodeAddr(lladdr0) + mcaddr := header.EthernetAddressFromMulticastIPv6Address(snaddr) + + tests := []struct { + name string + remoteLinkAddr tcpip.LinkAddress + expectLinkAddr tcpip.LinkAddress + }{ + { + name: "Unicast", + remoteLinkAddr: linkAddr1, + expectLinkAddr: linkAddr1, + }, + { + name: "Multicast", + remoteLinkAddr: "", + expectLinkAddr: mcaddr, + }, + } + + for _, test := range tests { + p := NewProtocol() + linkRes, ok := p.(stack.LinkAddressResolver) + if !ok { + t.Fatalf("expected IPv6 protocol to implement stack.LinkAddressResolver") + } + + linkEP := channel.New(defaultChannelSize, defaultMTU, linkAddr0) + if err := linkRes.LinkAddressRequest(lladdr0, lladdr1, test.remoteLinkAddr, linkEP); err != nil { + t.Errorf("got p.LinkAddressRequest(%s, %s, %s, _) = %s", lladdr0, lladdr1, test.remoteLinkAddr, err) + } + + pkt, ok := linkEP.Read() + if !ok { + t.Fatal("expected to send a link address request") + } + + if got, want := pkt.Route.RemoteLinkAddress, test.expectLinkAddr; got != want { + t.Errorf("got pkt.Route.RemoteLinkAddress = %s, want = %s", got, want) + } + } +} diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go index daf1fcbc6..0ade655b2 100644 --- a/pkg/tcpip/network/ipv6/ipv6.go +++ b/pkg/tcpip/network/ipv6/ipv6.go @@ -28,7 +28,6 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/network/fragmentation" - "gvisor.dev/gvisor/pkg/tcpip/network/hash" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -52,7 +51,6 @@ type endpoint struct { linkEP stack.LinkEndpoint linkAddrCache stack.LinkAddressCache dispatcher stack.TransportDispatcher - fragmentation *fragmentation.Fragmentation protocol *protocol } @@ -101,9 +99,9 @@ func (e *endpoint) GSOMaxSize() uint32 { return 0 } -func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv6 { - length := uint16(hdr.UsedLength() + payloadSize) - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) +func (e *endpoint) addIPHeader(r *stack.Route, pkt *stack.PacketBuffer, params stack.NetworkHeaderParams) { + length := uint16(pkt.Size()) + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize)) ip.Encode(&header.IPv6Fields{ PayloadLength: length, NextHeader: uint8(params.Protocol), @@ -112,25 +110,20 @@ func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadS SrcAddr: r.LocalAddress, DstAddr: r.RemoteAddress, }) - return ip + pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber } // WritePacket writes a packet to the given destination address and protocol. 
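
With addIPHeader now pushing the IPv6 header onto the PacketBuffer itself (hunk above) and recording pkt.NetworkProtocolNumber, a sender no longer builds a separate Prependable: it reserves header space, places its payload in Data, and hands the buffer to the route. A sketch of that calling pattern (helper and payload are hypothetical, the payload is assumed to already carry its transport header, and imports match the earlier sketch):

    // Sketch: sending through the refactored WritePacket. All header bytes
    // (link plus IPv6) are pushed into the reserved space on the way down,
    // so only the payload is placed in Data here.
    func writePayload(r *stack.Route, payload buffer.View) *tcpip.Error {
        pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
            ReserveHeaderBytes: int(r.MaxHeaderLength()),
            Data:               payload.ToVectorisedView(),
        })
        return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
            Protocol: header.UDPProtocolNumber,
            TTL:      r.DefaultTTL(),
            TOS:      stack.DefaultTOS,
        }, pkt)
    }
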
-func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error { - ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params) - pkt.NetworkHeader = buffer.View(ip) +func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { + e.addIPHeader(r, pkt, params) if r.Loop&stack.PacketLoop != 0 { - // The inbound path expects the network header to still be in - // the PacketBuffer's Data field. - views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) - views[0] = pkt.Header.View() - views = append(views, pkt.Data.Views()...) loopedR := r.MakeLoopedRoute() - e.HandlePacket(&loopedR, stack.PacketBuffer{ - Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views), - }) + e.HandlePacket(&loopedR, stack.NewPacketBuffer(stack.PacketBufferOptions{ + // The inbound path expects an unparsed packet. + Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), + })) loopedR.Release() } @@ -152,8 +145,7 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe } for pb := pkts.Front(); pb != nil; pb = pb.Next() { - ip := e.addIPHeader(r, &pb.Header, pb.Data.Size(), params) - pb.NetworkHeader = buffer.View(ip) + e.addIPHeader(r, pb, params) } n, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber) @@ -163,30 +155,28 @@ func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.Packe // WriteHeaderIncludedPacker implements stack.NetworkEndpoint. It is not yet // supported by IPv6. -func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error { +func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error { // TODO(b/146666412): Support IPv6 header-included packets. return tcpip.ErrNotSupported } // HandlePacket is called by the link layer when new ipv6 packets arrive for // this endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { - headerView, ok := pkt.Data.PullUp(header.IPv6MinimumSize) - if !ok { - r.Stats().IP.MalformedPacketsReceived.Increment() - return - } - h := header.IPv6(headerView) - if !h.IsValid(pkt.Data.Size()) { +func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { + h := header.IPv6(pkt.NetworkHeader().View()) + if !h.IsValid(pkt.Data.Size() + pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size()) { r.Stats().IP.MalformedPacketsReceived.Increment() return } - pkt.NetworkHeader = headerView[:header.IPv6MinimumSize] - pkt.Data.TrimFront(header.IPv6MinimumSize) - pkt.Data.CapLength(int(h.PayloadLength())) - - it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), pkt.Data) + // vv consists of: + // - Any IPv6 header bytes after the first 40 (i.e. extensions). + // - The transport header, if present. + // - Any other payload data. 
+ vv := pkt.NetworkHeader().View()[header.IPv6MinimumSize:].ToVectorisedView() + vv.AppendView(pkt.TransportHeader().View()) + vv.Append(pkt.Data) + it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv) hasFragmentHeader := false for firstHeader := true; ; firstHeader = false { @@ -262,9 +252,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { case header.IPv6FragmentExtHdr: hasFragmentHeader = true - fragmentOffset := extHdr.FragmentOffset() - more := extHdr.More() - if !more && fragmentOffset == 0 { + if extHdr.IsAtomic() { // This fragment extension header indicates that this packet is an // atomic fragment. An atomic fragment is a fragment that contains // all the data required to reassemble a full packet. As per RFC 6946, @@ -277,9 +265,9 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { // Don't consume the iterator if we have the first fragment because we // will use it to validate that the first fragment holds the upper layer // header. - rawPayload := it.AsRawHeader(fragmentOffset != 0 /* consume */) + rawPayload := it.AsRawHeader(extHdr.FragmentOffset() != 0 /* consume */) - if fragmentOffset == 0 { + if extHdr.FragmentOffset() == 0 { // Check that the iterator ends with a raw payload as the first fragment // should include all headers up to and including any upper layer // headers, as per RFC 8200 section 4.5; only upper layer data @@ -332,7 +320,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { } // The packet is a fragment, let's try to reassemble it. - start := fragmentOffset * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit + start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit last := start + uint16(fragmentPayloadLen) - 1 // Drop the packet if the fragmentOffset is incorrect. i.e the @@ -345,7 +333,21 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { } var ready bool - pkt.Data, ready, err = e.fragmentation.Process(hash.IPv6FragmentHash(h, extHdr.ID()), start, last, more, rawPayload.Buf) + // Note that pkt doesn't have its transport header set after reassembly, + // and won't until DeliverNetworkPacket sets it. + pkt.Data, ready, err = e.protocol.fragmentation.Process( + // IPv6 ignores the Protocol field since the ID only needs to be unique + // across source-destination pairs, as per RFC 8200 section 4.5. + fragmentation.FragmentID{ + Source: h.SourceAddress(), + Destination: h.DestinationAddress(), + ID: extHdr.ID(), + }, + start, + last, + extHdr.More(), + rawPayload.Buf, + ) if err != nil { r.Stats().IP.MalformedPacketsReceived.Increment() r.Stats().IP.MalformedFragmentsReceived.Increment() @@ -394,10 +396,17 @@ func (e *endpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { case header.IPv6RawPayloadHeader: // If the last header in the payload isn't a known IPv6 extension header, // handle it as if it is transport layer data. + + // For unfragmented packets, extHdr still contains the transport header. + // Get rid of it. + // + // For reassembled fragments, pkt.TransportHeader is unset, so this is a + // no-op and pkt.Data begins with the transport header. 
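
The Process call above replaces the hash-based reassembly key with an explicit fragmentation.FragmentID, so state is segregated per (source, destination, ID) tuple; the Protocol field is deliberately left zero for IPv6. This is what makes the new "different sources but with same ID" test further down pass, as this sketch illustrates (addr1/addr2/addr3 refer to the addresses used by the tests in this file):

    // Sketch: FragmentID is an ordinary comparable struct, so fragments that
    // share an ID but come from different sources select different
    // reassembly queues and are never mixed together.
    func fragmentKeysDiffer() bool {
        a := fragmentation.FragmentID{Source: addr1, Destination: addr2, ID: 1}
        b := fragmentation.FragmentID{Source: addr3, Destination: addr2, ID: 1}
        return a != b // true: two separate queues
    }
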
+ extHdr.Buf.TrimFront(pkt.TransportHeader().View().Size()) pkt.Data = extHdr.Buf if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber { - e.handleICMP(r, headerView, pkt, hasFragmentHeader) + e.handleICMP(r, pkt, hasFragmentHeader) } else { r.Stats().IP.PacketsDelivered.Increment() // TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error @@ -429,7 +438,8 @@ type protocol struct { // defaultTTL is the current default TTL for the protocol. Only the // uint8 portion of it is meaningful and it must be accessed // atomically. - defaultTTL uint32 + defaultTTL uint32 + fragmentation *fragmentation.Fragmentation } // Number returns the ipv6 protocol number. @@ -462,7 +472,6 @@ func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWi linkEP: linkEP, linkAddrCache: linkAddrCache, dispatcher: dispatcher, - fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), protocol: p, }, nil } @@ -505,6 +514,77 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} +// Parse implements stack.TransportProtocol.Parse. +func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) { + hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize) + if !ok { + return 0, false, false + } + ipHdr := header.IPv6(hdr) + + // dataClone consists of: + // - Any IPv6 header bytes after the first 40 (i.e. extensions). + // - The transport header, if present. + // - Any other payload data. + views := [8]buffer.View{} + dataClone := pkt.Data.Clone(views[:]) + dataClone.TrimFront(header.IPv6MinimumSize) + it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone) + + // Iterate over the IPv6 extensions to find their length. + // + // Parsing occurs again in HandlePacket because we don't track the + // extensions in PacketBuffer. Unfortunately, that means HandlePacket + // has to do the parsing work again. + var nextHdr tcpip.TransportProtocolNumber + foundNext := true + extensionsSize := 0 +traverseExtensions: + for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() { + if err != nil { + break + } + // If we exhaust the extension list, the entire packet is the IPv6 header + // and (possibly) extensions. + if done { + extensionsSize = dataClone.Size() + foundNext = false + break + } + + switch extHdr := extHdr.(type) { + case header.IPv6FragmentExtHdr: + // If this is an atomic fragment, we don't have to treat it specially. + if !extHdr.More() && extHdr.FragmentOffset() == 0 { + continue + } + // This is a non-atomic fragment and has to be re-assembled before we can + // examine the payload for a transport header. + foundNext = false + + case header.IPv6RawPayloadHeader: + // We've found the payload after any extensions. + extensionsSize = dataClone.Size() - extHdr.Buf.Size() + nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier) + break traverseExtensions + + default: + // Any other extension is a no-op, keep looping until we find the payload. + } + } + + // Put the IPv6 header with extensions in pkt.NetworkHeader(). 
+ hdr, ok = pkt.NetworkHeader().Consume(header.IPv6MinimumSize + extensionsSize) + if !ok { + panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size())) + } + ipHdr = header.IPv6(hdr) + pkt.Data.CapLength(int(ipHdr.PayloadLength())) + pkt.NetworkProtocolNumber = header.IPv6ProtocolNumber + + return nextHdr, foundNext, true +} + // calculateMTU calculates the network-layer payload MTU based on the link-layer // payload mtu. func calculateMTU(mtu uint32) uint32 { @@ -517,5 +597,8 @@ func calculateMTU(mtu uint32) uint32 { // NewProtocol returns an IPv6 network protocol. func NewProtocol() stack.NetworkProtocol { - return &protocol{defaultTTL: DefaultTTL} + return &protocol{ + defaultTTL: DefaultTTL, + fragmentation: fragmentation.NewFragmentation(header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit, fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout), + } } diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go index 841a0cb7a..081afb051 100644 --- a/pkg/tcpip/network/ipv6/ipv6_test.go +++ b/pkg/tcpip/network/ipv6/ipv6_test.go @@ -65,9 +65,9 @@ func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst DstAddr: dst, }) - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) stats := s.Stats().ICMP.V6PacketsReceived @@ -123,9 +123,9 @@ func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst DstAddr: dst, }) - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) stat := s.Stats().UDP.PacketsReceived @@ -637,9 +637,9 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { DstAddr: addr2, }) - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) stats := s.Stats().UDP.PacketsReceived @@ -673,20 +673,27 @@ func TestReceiveIPv6ExtHdrs(t *testing.T) { // fragmentData holds the IPv6 payload for a fragmented IPv6 packet. type fragmentData struct { + srcAddr tcpip.Address + dstAddr tcpip.Address nextHdr uint8 data buffer.VectorisedView } func TestReceiveIPv6Fragments(t *testing.T) { - const nicID = 1 - const udpPayload1Length = 256 - const udpPayload2Length = 128 - const fragmentExtHdrLen = 8 - // Note, not all routing extension headers will be 8 bytes but this test - // uses 8 byte routing extension headers for most sub tests. - const routingExtHdrLen = 8 - - udpGen := func(payload []byte, multiplier uint8) buffer.View { + const ( + nicID = 1 + udpPayload1Length = 256 + udpPayload2Length = 128 + // Used to test cases where the fragment blocks are not a multiple of + // the fragment block size of 8 (RFC 8200 section 4.5). + udpPayload3Length = 127 + fragmentExtHdrLen = 8 + // Note, not all routing extension headers will be 8 bytes but this test + // uses 8 byte routing extension headers for most sub tests. 
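
NewProtocol above now seeds the shared reassembler with header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit as its block size, which is what the new "not a multiple of fragment block size" cases below exercise: only the final fragment may carry a payload that is not a whole number of 8-byte blocks (RFC 8200 section 4.5). A one-function sketch of that acceptance rule:

    // Sketch: the rule implied by the 8-byte fragment block size. A non-final
    // fragment whose payload is not a whole number of blocks is rejected,
    // which is why the 63-byte first fragment below never reassembles.
    func fragmentSizeOK(payloadLen int, more bool) bool {
        return !more || payloadLen%header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit == 0
    }
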
+ routingExtHdrLen = 8 + ) + + udpGen := func(payload []byte, multiplier uint8, src, dst tcpip.Address) buffer.View { payloadLen := len(payload) for i := 0; i < payloadLen; i++ { payload[i] = uint8(i) * multiplier @@ -702,19 +709,27 @@ func TestReceiveIPv6Fragments(t *testing.T) { Length: uint16(udpLength), }) copy(u.Payload(), payload) - sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength)) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, src, dst, uint16(udpLength)) sum = header.Checksum(payload, sum) u.SetChecksum(^u.CalculateChecksum(sum)) return hdr.View() } - var udpPayload1Buf [udpPayload1Length]byte - udpPayload1 := udpPayload1Buf[:] - ipv6Payload1 := udpGen(udpPayload1, 1) + var udpPayload1Addr1ToAddr2Buf [udpPayload1Length]byte + udpPayload1Addr1ToAddr2 := udpPayload1Addr1ToAddr2Buf[:] + ipv6Payload1Addr1ToAddr2 := udpGen(udpPayload1Addr1ToAddr2, 1, addr1, addr2) + + var udpPayload1Addr3ToAddr2Buf [udpPayload1Length]byte + udpPayload1Addr3ToAddr2 := udpPayload1Addr3ToAddr2Buf[:] + ipv6Payload1Addr3ToAddr2 := udpGen(udpPayload1Addr3ToAddr2, 4, addr3, addr2) - var udpPayload2Buf [udpPayload2Length]byte - udpPayload2 := udpPayload2Buf[:] - ipv6Payload2 := udpGen(udpPayload2, 2) + var udpPayload2Addr1ToAddr2Buf [udpPayload2Length]byte + udpPayload2Addr1ToAddr2 := udpPayload2Addr1ToAddr2Buf[:] + ipv6Payload2Addr1ToAddr2 := udpGen(udpPayload2Addr1ToAddr2, 2, addr1, addr2) + + var udpPayload3Addr1ToAddr2Buf [udpPayload3Length]byte + udpPayload3Addr1ToAddr2 := udpPayload3Addr1ToAddr2Buf[:] + ipv6Payload3Addr1ToAddr2 := udpGen(udpPayload3Addr1ToAddr2, 3, addr1, addr2) tests := []struct { name string @@ -726,34 +741,98 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "No fragmentation", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: uint8(header.UDPProtocolNumber), - data: ipv6Payload1.ToVectorisedView(), + data: ipv6Payload1Addr1ToAddr2.ToVectorisedView(), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, }, { name: "Atomic fragment", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1), + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2), []buffer.View{ // Fragment extension header. buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 0}), - ipv6Payload1, + ipv6Payload1Addr1ToAddr2, }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "Atomic fragment with size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload3Addr1ToAddr2), + []buffer.View{ + // Fragment extension header. + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 0}), + + ipv6Payload3Addr1ToAddr2, + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, }, { name: "Two fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+64, + []buffer.View{ + // Fragment extension header. 
+ // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[:64], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 8, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[64:], + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, + }, + { + name: "Two fragments with last fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -763,31 +842,73 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload3Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload3Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload3Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload3Addr1ToAddr2}, + }, + { + name: "Two fragments with first fragment size not a multiple of fragment block size", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+63, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload3Addr1ToAddr2[:63], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload3Addr1ToAddr2)-63, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 8, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), + + ipv6Payload3Addr1ToAddr2[63:], + }, + ), + }, + }, + expectedPayloads: nil, }, { name: "Two fragments with different IDs", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -797,21 +918,23 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. 
// // Fragment offset = 8, More = false, ID = 2 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 2}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, @@ -822,6 +945,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with per-fragment routing header with zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -836,14 +961,16 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( - routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64, + routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Routing extension header. // @@ -855,17 +982,19 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, }, { name: "Two fragments with per-fragment routing header with non-zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -880,14 +1009,16 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: routingExtHdrID, data: buffer.NewVectorisedView( - routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64, + routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Routing extension header. // @@ -899,7 +1030,7 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 9, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 72, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, @@ -910,6 +1041,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with routing header with zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -924,31 +1057,35 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Segments left = 0. buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 0, 2, 3, 4, 5}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. 
// // Fragment offset = 9, More = false, ID = 1 buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2}, }, { name: "Two fragments with routing header with non-zero segments left", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( routingExtHdrLen+fragmentExtHdrLen+64, @@ -963,21 +1100,23 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Segments left = 1. buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 1, 2, 3, 4, 5}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 9, More = false, ID = 1 buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, @@ -988,6 +1127,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with routing header with zero segments left across fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is fragmentExtHdrLen+8 because the @@ -1008,12 +1149,14 @@ func TestReceiveIPv6Fragments(t *testing.T) { ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is - // fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of + // fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2) because the last 8 bytes of // the 16 byte routing extension header is in this fagment. - fragmentExtHdrLen+8+len(ipv6Payload1), + fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2), []buffer.View{ // Fragment extension header. // @@ -1023,7 +1166,7 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Routing extension header (part 2) buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}), - ipv6Payload1, + ipv6Payload1Addr1ToAddr2, }, ), }, @@ -1034,6 +1177,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with routing header with non-zero segments left across fragments", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is fragmentExtHdrLen+8 because the @@ -1054,12 +1199,14 @@ func TestReceiveIPv6Fragments(t *testing.T) { ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( // The length of this payload is - // fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of + // fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2) because the last 8 bytes of // the 16 byte routing extension header is in this fagment. - fragmentExtHdrLen+8+len(ipv6Payload1), + fragmentExtHdrLen+8+len(ipv6Payload1Addr1ToAddr2), []buffer.View{ // Fragment extension header. 
// @@ -1069,7 +1216,7 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Routing extension header (part 2) buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}), - ipv6Payload1, + ipv6Payload1Addr1ToAddr2, }, ), }, @@ -1082,6 +1229,8 @@ func TestReceiveIPv6Fragments(t *testing.T) { name: "Two fragments with atomic", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -1091,47 +1240,53 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, // This fragment has the same ID as the other fragments but is an atomic // fragment. It should not interfere with the other fragments. { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload2), + fragmentExtHdrLen+len(ipv6Payload2Addr1ToAddr2), []buffer.View{ // Fragment extension header. // // Fragment offset = 0, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 1}), - ipv6Payload2, + ipv6Payload2Addr1ToAddr2, }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload2, udpPayload1}, + expectedPayloads: [][]byte{udpPayload2Addr1ToAddr2, udpPayload1Addr1ToAddr2}, }, { name: "Two interleaved fragmented packets", fragments: []fragmentData{ { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+64, @@ -1141,11 +1296,13 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), - ipv6Payload1[:64], + ipv6Payload1Addr1ToAddr2[:64], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( fragmentExtHdrLen+32, @@ -1155,40 +1312,114 @@ func TestReceiveIPv6Fragments(t *testing.T) { // Fragment offset = 0, More = true, ID = 2 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 2}), - ipv6Payload2[:32], + ipv6Payload2Addr1ToAddr2[:32], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload1)-64, + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, []buffer.View{ // Fragment extension header. // // Fragment offset = 8, More = false, ID = 1 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), - ipv6Payload1[64:], + ipv6Payload1Addr1ToAddr2[64:], }, ), }, { + srcAddr: addr1, + dstAddr: addr2, nextHdr: fragmentExtHdrID, data: buffer.NewVectorisedView( - fragmentExtHdrLen+len(ipv6Payload2)-32, + fragmentExtHdrLen+len(ipv6Payload2Addr1ToAddr2)-32, []buffer.View{ // Fragment extension header. 
// // Fragment offset = 4, More = false, ID = 2 buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 32, 0, 0, 0, 2}), - ipv6Payload2[32:], + ipv6Payload2Addr1ToAddr2[32:], }, ), }, }, - expectedPayloads: [][]byte{udpPayload1, udpPayload2}, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, udpPayload2Addr1ToAddr2}, + }, + { + name: "Two interleaved fragmented packets from different sources but with same ID", + fragments: []fragmentData{ + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[:64], + }, + ), + }, + { + srcAddr: addr3, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+32, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 0, More = true, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}), + + ipv6Payload1Addr3ToAddr2[:32], + }, + ), + }, + { + srcAddr: addr1, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-64, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 8, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}), + + ipv6Payload1Addr1ToAddr2[64:], + }, + ), + }, + { + srcAddr: addr3, + dstAddr: addr2, + nextHdr: fragmentExtHdrID, + data: buffer.NewVectorisedView( + fragmentExtHdrLen+len(ipv6Payload1Addr1ToAddr2)-32, + []buffer.View{ + // Fragment extension header. + // + // Fragment offset = 4, More = false, ID = 1 + buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 32, 0, 0, 0, 1}), + + ipv6Payload1Addr3ToAddr2[32:], + }, + ), + }, + }, + expectedPayloads: [][]byte{udpPayload1Addr1ToAddr2, udpPayload1Addr3ToAddr2}, }, } @@ -1231,16 +1462,16 @@ func TestReceiveIPv6Fragments(t *testing.T) { PayloadLength: uint16(f.data.Size()), NextHeader: f.nextHdr, HopLimit: 255, - SrcAddr: addr1, - DstAddr: addr2, + SrcAddr: f.srcAddr, + DstAddr: f.dstAddr, }) vv := hdr.View().ToVectorisedView() vv.Append(f.data) - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: vv, - }) + })) } if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want { diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go index 12b70f7e9..fe159b24f 100644 --- a/pkg/tcpip/network/ipv6/ndp_test.go +++ b/pkg/tcpip/network/ipv6/ndp_test.go @@ -136,9 +136,9 @@ func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) { t.Fatalf("got invalid = %d, want = 0", got) } - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) if linkAddr != test.expectedLinkAddr { @@ -380,9 +380,9 @@ func TestNeighorSolicitationResponse(t *testing.T) { t.Fatalf("got invalid = %d, want = 0", got) } - e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.PacketBuffer{ + e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + 
})) if test.nsInvalid { if got := invalid.Value(); got != 1 { @@ -410,7 +410,7 @@ func TestNeighorSolicitationResponse(t *testing.T) { t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr) } - checker.IPv6(t, p.Pkt.Header.View(), + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(test.naSrc), checker.DstAddr(test.naDst), checker.TTL(header.NDPHopLimit), @@ -497,9 +497,9 @@ func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) { t.Fatalf("got invalid = %d, want = 0", got) } - e.InjectInbound(ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil) if linkAddr != test.expectedLinkAddr { @@ -551,26 +551,31 @@ func TestNDPValidation(t *testing.T) { return s, ep, r } - handleIPv6Payload := func(hdr buffer.Prependable, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) { + handleIPv6Payload := func(payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) { nextHdr := uint8(header.ICMPv6ProtocolNumber) + var extensions buffer.View if atomicFragment { - bytes := hdr.Prepend(header.IPv6FragmentExtHdrLength) - bytes[0] = nextHdr + extensions = buffer.NewView(header.IPv6FragmentExtHdrLength) + extensions[0] = nextHdr nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier) } - payloadLength := hdr.UsedLength() - ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.IPv6MinimumSize + len(extensions), + Data: payload.ToVectorisedView(), + }) + ip := header.IPv6(pkt.NetworkHeader().Push(header.IPv6MinimumSize + len(extensions))) ip.Encode(&header.IPv6Fields{ - PayloadLength: uint16(payloadLength), + PayloadLength: uint16(len(payload) + len(extensions)), NextHeader: nextHdr, HopLimit: hopLimit, SrcAddr: r.LocalAddress, DstAddr: r.RemoteAddress, }) - ep.HandlePacket(r, stack.PacketBuffer{ - Data: hdr.View().ToVectorisedView(), - }) + if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) { + t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n) + } + ep.HandlePacket(r, pkt) } var tllData [header.NDPLinkLayerAddressSize]byte @@ -676,14 +681,11 @@ func TestNDPValidation(t *testing.T) { invalid := stats.Invalid typStat := typ.statCounter(stats) - extraDataLen := len(typ.extraData) - hdr := buffer.NewPrependable(header.IPv6MinimumSize + typ.size + extraDataLen + header.IPv6FragmentExtHdrLength) - extraData := buffer.View(hdr.Prepend(extraDataLen)) - copy(extraData, typ.extraData) - pkt := header.ICMPv6(hdr.Prepend(typ.size)) - pkt.SetType(typ.typ) - pkt.SetCode(test.code) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, extraData.ToVectorisedView())) + icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData))) + copy(icmp[typ.size:], typ.extraData) + icmp.SetType(typ.typ) + icmp.SetCode(test.code) + icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView())) // Rx count of the NDP message should initially be 0. 
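
The NDP tests above now read outgoing packets through stack.PayloadSince(pkt.NetworkHeader()) rather than a Header view, because the PacketBuffer keeps its headers in separate views. A sketch of that read-and-check pattern (the helper and its arguments are placeholders for the test's endpoint and addresses):

    // Sketch: checking an egress packet from a channel endpoint under the new
    // header layout. PayloadSince flattens the network header and everything
    // after it into a single view, which is the shape checker.IPv6 expects.
    func checkEgress(t *testing.T, e *channel.Endpoint, src, dst tcpip.Address) {
        p, ok := e.Read()
        if !ok {
            t.Fatal("expected a packet to be written")
        }
        checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()),
            checker.SrcAddr(src),
            checker.DstAddr(dst),
        )
    }
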
if got := typStat.Value(); got != 0 { @@ -699,7 +701,7 @@ func TestNDPValidation(t *testing.T) { t.FailNow() } - handleIPv6Payload(hdr, test.hopLimit, test.atomicFragment, ep, &r) + handleIPv6Payload(buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r) // Rx count of the NDP packet should have increased. if got := typStat.Value(); got != 1 { @@ -884,9 +886,9 @@ func TestRouterAdvertValidation(t *testing.T) { t.Fatalf("got rxRA = %d, want = 0", got) } - e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{ + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), - }) + })) if got := rxRA.Value(); got != 1 { t.Fatalf("got rxRA = %d, want = 1", got) diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go index b937cb84b..f6d592eb5 100644 --- a/pkg/tcpip/ports/ports.go +++ b/pkg/tcpip/ports/ports.go @@ -52,19 +52,35 @@ type Flags struct { // // LoadBalanced takes precidence over MostRecent. LoadBalanced bool + + // TupleOnly represents TCP SO_REUSEADDR. + TupleOnly bool } -func (f Flags) bits() reuseFlag { - var rf reuseFlag +// Bits converts the Flags to their bitset form. +func (f Flags) Bits() BitFlags { + var rf BitFlags if f.MostRecent { - rf |= mostRecentFlag + rf |= MostRecentFlag } if f.LoadBalanced { - rf |= loadBalancedFlag + rf |= LoadBalancedFlag + } + if f.TupleOnly { + rf |= TupleOnlyFlag } return rf } +// Effective returns the effective behavior of a flag config. +func (f Flags) Effective() Flags { + e := f + if e.LoadBalanced && e.MostRecent { + e.MostRecent = false + } + return e +} + // PortManager manages allocating, reserving and releasing ports. type PortManager struct { mu sync.RWMutex @@ -78,83 +94,166 @@ type PortManager struct { hint uint32 } -type reuseFlag int +// BitFlags is a bitset representation of Flags. +type BitFlags uint32 const ( - mostRecentFlag reuseFlag = 1 << iota - loadBalancedFlag + // MostRecentFlag represents Flags.MostRecent. + MostRecentFlag BitFlags = 1 << iota + + // LoadBalancedFlag represents Flags.LoadBalanced. + LoadBalancedFlag + + // TupleOnlyFlag represents Flags.TupleOnly. + TupleOnlyFlag + + // nextFlag is the value that the next added flag will have. + // + // It is used to calculate FlagMask below. It is also the number of + // valid flag states. nextFlag - flagMask = nextFlag - 1 + // FlagMask is a bit mask for BitFlags. + FlagMask = nextFlag - 1 + + // MultiBindFlagMask contains the flags that allow binding the same + // tuple multiple times. + MultiBindFlagMask = MostRecentFlag | LoadBalancedFlag ) -type portNode struct { - // refs stores the count for each possible flag combination. +// ToFlags converts the bitset into a Flags struct. +func (f BitFlags) ToFlags() Flags { + return Flags{ + MostRecent: f&MostRecentFlag != 0, + LoadBalanced: f&LoadBalancedFlag != 0, + TupleOnly: f&TupleOnlyFlag != 0, + } +} + +// FlagCounter counts how many references each flag combination has. +type FlagCounter struct { + // refs stores the count for each possible flag combination, (0 though + // FlagMask). refs [nextFlag]int } -func (p portNode) totalRefs() int { +// AddRef increases the reference count for a specific flag combination. +func (c *FlagCounter) AddRef(flags BitFlags) { + c.refs[flags]++ +} + +// DropRef decreases the reference count for a specific flag combination. +func (c *FlagCounter) DropRef(flags BitFlags) { + c.refs[flags]-- +} + +// TotalRefs calculates the total number of references for all flag +// combinations. 
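
The ports hunk above introduces BitFlags as the dense form of Flags so that FlagCounter can index reference counts by flag combination; TupleOnly is the new bit backing TCP SO_REUSEADDR. A small sketch of the round trip and of Effective(), which resolves the MostRecent/LoadBalanced conflict (values are illustrative; assumes pkg/tcpip/ports is imported):

    // Sketch: Flags <-> BitFlags round trip, plus the effective-behaviour
    // rule: when LoadBalanced is set, MostRecent is ignored.
    func flagsExample() {
        f := ports.Flags{MostRecent: true, LoadBalanced: true, TupleOnly: true}
        bits := f.Bits()       // MostRecentFlag | LoadBalancedFlag | TupleOnlyFlag
        back := bits.ToFlags() // identical to f
        eff := f.Effective()   // MostRecent cleared because LoadBalanced is set
        _, _, _ = bits, back, eff
    }
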
+func (c FlagCounter) TotalRefs() int { var total int - for _, r := range p.refs { + for _, r := range c.refs { total += r } return total } -// flagRefs returns the number of references with all specified flags. -func (p portNode) flagRefs(flags reuseFlag) int { +// FlagRefs returns the number of references with all specified flags. +func (c FlagCounter) FlagRefs(flags BitFlags) int { var total int - for i, r := range p.refs { - if reuseFlag(i)&flags == flags { + for i, r := range c.refs { + if BitFlags(i)&flags == flags { total += r } } return total } -// allRefsHave returns if all references have all specified flags. -func (p portNode) allRefsHave(flags reuseFlag) bool { - for i, r := range p.refs { - if reuseFlag(i)&flags == flags && r > 0 { +// AllRefsHave returns if all references have all specified flags. +func (c FlagCounter) AllRefsHave(flags BitFlags) bool { + for i, r := range c.refs { + if BitFlags(i)&flags != flags && r > 0 { return false } } return true } -// intersectionRefs returns the set of flags shared by all references. -func (p portNode) intersectionRefs() reuseFlag { - intersection := flagMask - for i, r := range p.refs { +// IntersectionRefs returns the set of flags shared by all references. +func (c FlagCounter) IntersectionRefs() BitFlags { + intersection := FlagMask + for i, r := range c.refs { if r > 0 { - intersection &= reuseFlag(i) + intersection &= BitFlags(i) } } return intersection } +type destination struct { + addr tcpip.Address + port uint16 +} + +func makeDestination(a tcpip.FullAddress) destination { + return destination{ + a.Addr, + a.Port, + } +} + +// portNode is never empty. When it has no elements, it is removed from the +// map that references it. +type portNode map[destination]FlagCounter + +// intersectionRefs calculates the intersection of flag bit values which affect +// the specified destination. +// +// If no destinations are present, all flag values are returned as there are no +// entries to limit possible flag values of a new entry. +// +// In addition to the intersection, the number of intersecting refs is +// returned. +func (p portNode) intersectionRefs(dst destination) (BitFlags, int) { + intersection := FlagMask + var count int + + for d, f := range p { + if d == dst { + intersection &= f.IntersectionRefs() + count++ + continue + } + // Wildcard destinations affect all destinations for TupleOnly. + if d.addr == anyIPAddress || dst.addr == anyIPAddress { + // Only bitwise and the TupleOnlyFlag. + intersection &= ((^TupleOnlyFlag) | f.IntersectionRefs()) + count++ + } + } + + return intersection, count +} + // deviceNode is never empty. When it has no elements, it is removed from the // map that references it. type deviceNode map[tcpip.NICID]portNode // isAvailable checks whether binding is possible by device. If not binding to a -// device, check against all portNodes. If binding to a specific device, check +// device, check against all FlagCounters. If binding to a specific device, check // against the unspecified device and the provided device. // // If either of the port reuse flags is enabled on any of the nodes, all nodes // sharing a port must share at least one reuse flag. This matches Linux's // behavior. -func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool { - flagBits := flags.bits() +func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID, dst destination) bool { + flagBits := flags.Bits() if bindToDevice == 0 { - // Trying to binding all devices. 
- if flagBits == 0 { - // Can't bind because the (addr,port) is already bound. - return false - } - intersection := flagMask + intersection := FlagMask for _, p := range d { - i := p.intersectionRefs() + i, c := p.intersectionRefs(dst) + if c == 0 { + continue + } intersection &= i if intersection&flagBits == 0 { // Can't bind because the (addr,port) was @@ -165,19 +264,20 @@ func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID) bool { return true } - intersection := flagMask + intersection := FlagMask if p, ok := d[0]; ok { - intersection = p.intersectionRefs() - if intersection&flagBits == 0 { + var c int + intersection, c = p.intersectionRefs(dst) + if c > 0 && intersection&flagBits == 0 { return false } } if p, ok := d[bindToDevice]; ok { - i := p.intersectionRefs() + i, c := p.intersectionRefs(dst) intersection &= i - if intersection&flagBits == 0 { + if c > 0 && intersection&flagBits == 0 { return false } } @@ -191,12 +291,12 @@ type bindAddresses map[tcpip.Address]deviceNode // isAvailable checks whether an IP address is available to bind to. If the // address is the "any" address, check all other addresses. Otherwise, just // check against the "any" address and the provided address. -func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID) bool { +func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID, dst destination) bool { if addr == anyIPAddress { // If binding to the "any" address then check that there are no conflicts // with all addresses. for _, d := range b { - if !d.isAvailable(flags, bindToDevice) { + if !d.isAvailable(flags, bindToDevice, dst) { return false } } @@ -205,14 +305,14 @@ func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice // Check that there is no conflict with the "any" address. if d, ok := b[anyIPAddress]; ok { - if !d.isAvailable(flags, bindToDevice) { + if !d.isAvailable(flags, bindToDevice, dst) { return false } } // Check that this is no conflict with the provided address. if d, ok := b[addr]; ok { - if !d.isAvailable(flags, bindToDevice) { + if !d.isAvailable(flags, bindToDevice, dst) { return false } } @@ -278,17 +378,17 @@ func (s *PortManager) pickEphemeralPort(offset, count uint32, testPort func(p ui } // IsPortAvailable tests if the given port is available on all given protocols. 
-func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool { +func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) bool { s.mu.Lock() defer s.mu.Unlock() - return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) + return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice, makeDestination(dest)) } -func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool { +func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dst destination) bool { for _, network := range networks { desc := portDescriptor{network, transport, port} if addrs, ok := s.allocatedPorts[desc]; ok { - if !addrs.isAvailable(addr, flags, bindToDevice) { + if !addrs.isAvailable(addr, flags, bindToDevice, dst) { return false } } @@ -300,14 +400,16 @@ func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumb // reserved by another endpoint. If port is zero, ReservePort will search for // an unreserved ephemeral port and reserve it, returning its value in the // "port" return value. -func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) (reservedPort uint16, err *tcpip.Error) { +func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) (reservedPort uint16, err *tcpip.Error) { s.mu.Lock() defer s.mu.Unlock() + dst := makeDestination(dest) + // If a port is specified, just try to reserve it for all network // protocols. if port != 0 { - if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice) { + if !s.reserveSpecificPort(networks, transport, addr, port, flags, bindToDevice, dst) { return 0, tcpip.ErrPortInUse } return port, nil @@ -315,16 +417,17 @@ func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transp // A port wasn't specified, so try to find one. return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { - return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice), nil + return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst), nil }) } // reserveSpecificPort tries to reserve the given port on all given protocols. 
-func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) bool { - if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice) { +func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dst destination) bool { + if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice, dst) { return false } - flagBits := flags.bits() + + flagBits := flags.Bits() // Reserve port on all network protocols. for _, network := range networks { @@ -339,9 +442,65 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber d = make(deviceNode) m[addr] = d } - n := d[bindToDevice] - n.refs[flagBits]++ - d[bindToDevice] = n + p := d[bindToDevice] + if p == nil { + p = make(portNode) + } + n := p[dst] + n.AddRef(flagBits) + p[dst] = n + d[bindToDevice] = p + } + + return true +} + +// ReserveTuple adds a port reservation for the tuple on all given protocol. +func (s *PortManager) ReserveTuple(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) bool { + flagBits := flags.Bits() + dst := makeDestination(dest) + + s.mu.Lock() + defer s.mu.Unlock() + + // It is easier to undo the entire reservation, so if we find that the + // tuple can't be fully added, finish and undo the whole thing. + undo := false + + // Reserve port on all network protocols. + for _, network := range networks { + desc := portDescriptor{network, transport, port} + m, ok := s.allocatedPorts[desc] + if !ok { + m = make(bindAddresses) + s.allocatedPorts[desc] = m + } + d, ok := m[addr] + if !ok { + d = make(deviceNode) + m[addr] = d + } + p := d[bindToDevice] + if p == nil { + p = make(portNode) + } + + n := p[dst] + if n.TotalRefs() != 0 && n.IntersectionRefs()&flagBits == 0 { + // Tuple already exists. + undo = true + } + n.AddRef(flagBits) + p[dst] = n + d[bindToDevice] = p + } + + if undo { + // releasePortLocked decrements the counts (rather than setting + // them to zero), so it will undo the incorrect incrementing + // above. + s.releasePortLocked(networks, transport, addr, port, flagBits, bindToDevice, dst) + return false } return true @@ -349,12 +508,14 @@ func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber // ReleasePort releases the reservation on a port/IP combination so that it can // be reserved by other endpoints. 
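Every reservation entry point above now carries the remote destination, so TupleOnly (SO_REUSEADDR) reservations are keyed per connection tuple. A hedged usage sketch; NewPortManager is assumed to be the package's existing constructor (it is not part of this diff), and the addresses are illustrative:

    package main

    import (
        "fmt"

        "gvisor.dev/gvisor/pkg/tcpip"
        "gvisor.dev/gvisor/pkg/tcpip/header"
        "gvisor.dev/gvisor/pkg/tcpip/ports"
    )

    func main() {
        pm := ports.NewPortManager() // assumed constructor; the diff only shows the methods

        nets := []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber}
        local := tcpip.Address("\x0a\x00\x00\x01")
        remote := tcpip.FullAddress{Addr: "\x0a\x00\x00\x02", Port: 80}

        // With TupleOnly set, the reservation is keyed on (local, port, remote),
        // so another socket may later reserve the same local port for a
        // different destination.
        port, err := pm.ReservePort(nets, header.TCPProtocolNumber, local, 0, ports.Flags{TupleOnly: true}, 0, remote)
        if err != nil {
            fmt.Println("reserve failed:", err)
            return
        }
        fmt.Println("reserved local port", port)

        // Releasing must name the same destination the reservation was made for.
        pm.ReleasePort(nets, header.TCPProtocolNumber, local, port, ports.Flags{TupleOnly: true}, 0, remote)
    }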
-func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID) { +func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) { s.mu.Lock() defer s.mu.Unlock() - flagBits := flags.bits() + s.releasePortLocked(networks, transport, addr, port, flags.Bits(), bindToDevice, makeDestination(dest)) +} +func (s *PortManager) releasePortLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags BitFlags, bindToDevice tcpip.NICID, dst destination) { for _, network := range networks { desc := portDescriptor{network, transport, port} if m, ok := s.allocatedPorts[desc]; ok { @@ -362,21 +523,32 @@ func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transp if !ok { continue } - n, ok := d[bindToDevice] + p, ok := d[bindToDevice] if !ok { continue } - n.refs[flagBits]-- - d[bindToDevice] = n - if n.refs == [nextFlag]int{} { - delete(d, bindToDevice) + n, ok := p[dst] + if !ok { + continue } - if len(d) == 0 { - delete(m, addr) + n.DropRef(flags) + if n.TotalRefs() > 0 { + p[dst] = n + continue } - if len(m) == 0 { - delete(s.allocatedPorts, desc) + delete(p, dst) + if len(p) > 0 { + continue + } + delete(d, bindToDevice) + if len(d) > 0 { + continue + } + delete(m, addr) + if len(m) > 0 { + continue } + delete(s.allocatedPorts, desc) } } } diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go index d6969d050..58db5868c 100644 --- a/pkg/tcpip/ports/ports_test.go +++ b/pkg/tcpip/ports/ports_test.go @@ -36,6 +36,7 @@ type portReserveTestAction struct { flags Flags release bool device tcpip.NICID + dest tcpip.FullAddress } func TestPortReservation(t *testing.T) { @@ -272,6 +273,54 @@ func TestPortReservation(t *testing.T) { {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil}, {port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse}, }, + }, { + tname: "bind tuple with reuseaddr, and then wildcard with reuseaddr", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{}, want: nil}, + }, + }, { + tname: "bind tuple with reuseaddr, and then wildcard", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil}, + {port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse}, + }, + }, { + tname: "bind wildcard with reuseaddr, and then tuple with reuseaddr", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil}, + }, + }, { + tname: "bind tuple with reuseaddr, and then wildcard", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: tcpip.ErrPortInUse}, + }, + }, { + tname: "bind two tuples with reuseaddr", + 
actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 25}, want: nil}, + }, + }, { + tname: "bind two tuples", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil}, + {port: 24, ip: fakeIPAddress, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 25}, want: nil}, + }, + }, { + tname: "bind wildcard, and then tuple with reuseaddr", + actions: []portReserveTestAction{ + {port: 24, ip: fakeIPAddress, dest: tcpip.FullAddress{}, want: nil}, + {port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: tcpip.ErrPortInUse}, + }, + }, { + tname: "bind wildcard twice with reuseaddr", + actions: []portReserveTestAction{ + {port: 24, ip: anyIPAddress, flags: Flags{TupleOnly: true}, want: nil}, + {port: 24, ip: anyIPAddress, flags: Flags{TupleOnly: true}, want: nil}, + }, }, } { t.Run(test.tname, func(t *testing.T) { @@ -280,19 +329,18 @@ func TestPortReservation(t *testing.T) { for _, test := range test.actions { if test.release { - pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device) + pm.ReleasePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest) continue } - gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device) + gotPort, err := pm.ReservePort(net, fakeTransNumber, test.ip, test.port, test.flags, test.device, test.dest) if err != test.want { - t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d) = %v, want %v", test.ip, test.port, test.flags, test.device, err, test.want) + t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d, %v) = %v, want %v", test.ip, test.port, test.flags, test.device, test.dest, err, test.want) } if test.port == 0 && (gotPort == 0 || gotPort < FirstEphemeral) { - t.Fatalf("ReservePort(.., .., .., 0) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral) + t.Fatalf("ReservePort(.., .., .., 0, ..) 
= %d, want port number >= %d to be picked", gotPort, FirstEphemeral) } } }) - } } diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD index f71073207..900938dd1 100644 --- a/pkg/tcpip/stack/BUILD +++ b/pkg/tcpip/stack/BUILD @@ -16,6 +16,18 @@ go_template_instance( ) go_template_instance( + name = "neighbor_entry_list", + out = "neighbor_entry_list.go", + package = "stack", + prefix = "neighborEntry", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*neighborEntry", + "Linker": "*neighborEntry", + }, +) + +go_template_instance( name = "packet_buffer_list", out = "packet_buffer_list.go", package = "stack", @@ -27,20 +39,39 @@ go_template_instance( }, ) +go_template_instance( + name = "tuple_list", + out = "tuple_list.go", + package = "stack", + prefix = "tuple", + template = "//pkg/ilist:generic_list", + types = { + "Element": "*tuple", + "Linker": "*tuple", + }, +) + go_library( name = "stack", srcs = [ "conntrack.go", "dhcpv6configurationfromndpra_string.go", "forwarder.go", + "headertype_string.go", "icmp_rate_limit.go", "iptables.go", + "iptables_state.go", "iptables_targets.go", "iptables_types.go", "linkaddrcache.go", "linkaddrentry_list.go", "ndp.go", + "neighbor_cache.go", + "neighbor_entry.go", + "neighbor_entry_list.go", + "neighborstate_string.go", "nic.go", + "nud.go", "packet_buffer.go", "packet_buffer_list.go", "rand.go", @@ -48,7 +79,9 @@ go_library( "route.go", "stack.go", "stack_global_state.go", + "stack_options.go", "transport_demuxer.go", + "tuple_list.go", ], visibility = ["//visibility:public"], deps = [ @@ -74,10 +107,12 @@ go_test( size = "medium", srcs = [ "ndp_test.go", + "nud_test.go", "stack_test.go", "transport_demuxer_test.go", "transport_test.go", ], + shard_count = 20, deps = [ ":stack", "//pkg/rand", @@ -87,12 +122,15 @@ go_test( "//pkg/tcpip/header", "//pkg/tcpip/link/channel", "//pkg/tcpip/link/loopback", + "//pkg/tcpip/network/arp", "//pkg/tcpip/network/ipv4", "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/ports", "//pkg/tcpip/transport/icmp", "//pkg/tcpip/transport/udp", "//pkg/waiter", - "@com_github_google_go-cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], ) @@ -100,9 +138,13 @@ go_test( name = "stack_test", size = "small", srcs = [ + "fake_time_test.go", "forwarder_test.go", "linkaddrcache_test.go", + "neighbor_cache_test.go", + "neighbor_entry_test.go", "nic_test.go", + "packet_buffer_test.go", ], library = ":stack", deps = [ @@ -110,5 +152,9 @@ go_test( "//pkg/sync", "//pkg/tcpip", "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "@com_github_dpjacques_clockwork//:go_default_library", + "@com_github_google_go_cmp//cmp:go_default_library", + "@com_github_google_go_cmp//cmp/cmpopts:go_default_library", ], ) diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go index 7d1ede1f2..7dd344b4f 100644 --- a/pkg/tcpip/stack/conntrack.go +++ b/pkg/tcpip/stack/conntrack.go @@ -20,376 +20,383 @@ import ( "time" "gvisor.dev/gvisor/pkg/tcpip" - "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack" ) // Connection tracking is used to track and manipulate packets for NAT rules. -// The connection is created for a packet if it does not exist. Every connection -// contains two tuples (original and reply). The tuples are manipulated if there -// is a matching NAT rule. 
The packet is modified by looking at the tuples in the -// Prerouting and Output hooks. +// The connection is created for a packet if it does not exist. Every +// connection contains two tuples (original and reply). The tuples are +// manipulated if there is a matching NAT rule. The packet is modified by +// looking at the tuples in the Prerouting and Output hooks. +// +// Currently, only TCP tracking is supported. + +// Our hash table has 16K buckets. +// TODO(gvisor.dev/issue/170): These should be tunable. +const numBuckets = 1 << 14 // Direction of the tuple. -type ctDirection int +type direction int const ( - dirOriginal ctDirection = iota + dirOriginal direction = iota dirReply ) -// Status of connection. -// TODO(gvisor.dev/issue/170): Add other states of connection. -type connStatus int - -const ( - connNew connStatus = iota - connEstablished -) - // Manipulation type for the connection. type manipType int const ( - manipDstPrerouting manipType = iota + manipNone manipType = iota + manipDstPrerouting manipDstOutput ) -// connTrackMutable is the manipulatable part of the tuple. -type connTrackMutable struct { - // addr is source address of the tuple. - addr tcpip.Address - - // port is source port of the tuple. - port uint16 - - // protocol is network layer protocol. - protocol tcpip.NetworkProtocolNumber -} - -// connTrackImmutable is the non-manipulatable part of the tuple. -type connTrackImmutable struct { - // addr is destination address of the tuple. - addr tcpip.Address +// tuple holds a connection's identifying and manipulating data in one +// direction. It is immutable. +// +// +stateify savable +type tuple struct { + // tupleEntry is used to build an intrusive list of tuples. + tupleEntry - // direction is direction (original or reply) of the tuple. - direction ctDirection + tupleID - // port is destination port of the tuple. - port uint16 + // conn is the connection tracking entry this tuple belongs to. + conn *conn - // protocol is transport layer protocol. - protocol tcpip.TransportProtocolNumber + // direction is the direction of the tuple. + direction direction } -// connTrackTuple represents the tuple which is created from the -// packet. -type connTrackTuple struct { - // dst is non-manipulatable part of the tuple. - dst connTrackImmutable - - // src is manipulatable part of the tuple. - src connTrackMutable +// tupleID uniquely identifies a connection in one direction. It currently +// contains enough information to distinguish between any TCP or UDP +// connection, and will need to be extended to support other protocols. +// +// +stateify savable +type tupleID struct { + srcAddr tcpip.Address + srcPort uint16 + dstAddr tcpip.Address + dstPort uint16 + transProto tcpip.TransportProtocolNumber + netProto tcpip.NetworkProtocolNumber } -// connTrackTupleHolder is the container of tuple and connection. -type ConnTrackTupleHolder struct { - // conn is pointer to the connection tracking entry. - conn *connTrack - - // tuple is original or reply tuple. - tuple connTrackTuple +// reply creates the reply tupleID. +func (ti tupleID) reply() tupleID { + return tupleID{ + srcAddr: ti.dstAddr, + srcPort: ti.dstPort, + dstAddr: ti.srcAddr, + dstPort: ti.srcPort, + transProto: ti.transProto, + netProto: ti.netProto, + } } -// connTrack is the connection. -type connTrack struct { - // originalTupleHolder contains tuple in original direction. - originalTupleHolder ConnTrackTupleHolder - - // replyTupleHolder contains tuple in reply direction. 
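Because tupleID is a plain comparable struct, the reply helper above is its own inverse. A quick sketch, written in package stack since tupleID is unexported; the function name is illustrative:

    package stack

    import "gvisor.dev/gvisor/pkg/tcpip/header"

    // replySwapsEndpoints illustrates tupleID.reply: it only swaps the source
    // and destination endpoints, so applying it twice returns the original
    // tuple.
    func replySwapsEndpoints() bool {
        tid := tupleID{
            srcAddr:    "\x0a\x00\x00\x01",
            srcPort:    49152,
            dstAddr:    "\x0a\x00\x00\x02",
            dstPort:    80,
            transProto: header.TCPProtocolNumber,
            netProto:   header.IPv4ProtocolNumber,
        }
        return tid.reply().reply() == tid
    }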
- replyTupleHolder ConnTrackTupleHolder - - // status indicates connection is new or established. - status connStatus +// conn is a tracked connection. +// +// +stateify savable +type conn struct { + // original is the tuple in original direction. It is immutable. + original tuple - // timeout indicates the time connection should be active. - timeout time.Duration + // reply is the tuple in reply direction. It is immutable. + reply tuple - // manip indicates if the packet should be manipulated. + // manip indicates if the packet should be manipulated. It is immutable. manip manipType - // tcb is TCB control block. It is used to keep track of states - // of tcp connection. - tcb tcpconntrack.TCB - // tcbHook indicates if the packet is inbound or outbound to - // update the state of tcb. + // update the state of tcb. It is immutable. tcbHook Hook -} -// ConnTrackTable contains a map of all existing connections created for -// NAT rules. -type ConnTrackTable struct { - // connMu protects connTrackTable. - connMu sync.RWMutex + // mu protects all mutable state. + mu sync.Mutex `state:"nosave"` + // tcb is TCB control block. It is used to keep track of states + // of tcp connection and is protected by mu. + tcb tcpconntrack.TCB + // lastUsed is the last time the connection saw a relevant packet, and + // is updated by each packet on the connection. It is protected by mu. + lastUsed time.Time `state:".(unixTime)"` +} - // connTrackTable maintains a map of tuples needed for connection tracking - // for iptables NAT rules. The key for the map is an integer calculated - // using seed, source address, destination address, source port and - // destination port. - CtMap map[uint32]ConnTrackTupleHolder +// timedOut returns whether the connection timed out based on its state. +func (cn *conn) timedOut(now time.Time) bool { + const establishedTimeout = 5 * 24 * time.Hour + const defaultTimeout = 120 * time.Second + cn.mu.Lock() + defer cn.mu.Unlock() + if cn.tcb.State() == tcpconntrack.ResultAlive { + // Use the same default as Linux, which doesn't delete + // established connections for 5(!) days. + return now.Sub(cn.lastUsed) > establishedTimeout + } + // Use the same default as Linux, which lets connections in most states + // other than established remain for <= 120 seconds. + return now.Sub(cn.lastUsed) > defaultTimeout +} - // seed is a one-time random value initialized at stack startup - // and is used in calculation of hash key for connection tracking - // table. - Seed uint32 +// update the connection tracking state. +// +// Precondition: ct.mu must be held. +func (ct *conn) updateLocked(tcpHeader header.TCP, hook Hook) { + // Update the state of tcb. tcb assumes it's always initialized on the + // client. However, we only need to know whether the connection is + // established or not, so the client/server distinction isn't important. + // TODO(gvisor.dev/issue/170): Add support in tcpconntrack to handle + // other tcp states. + if ct.tcb.IsEmpty() { + ct.tcb.Init(tcpHeader) + } else if hook == ct.tcbHook { + ct.tcb.UpdateStateOutbound(tcpHeader) + } else { + ct.tcb.UpdateStateInbound(tcpHeader) + } } -// parseHeaders sets headers in the packet. -func parseHeaders(pkt *PacketBuffer) { - newPkt := pkt.Clone() +// ConnTrack tracks all connections created for NAT rules. Most users are +// expected to only call handlePacket, insertRedirectConn, and maybeInsertNoop. +// +// ConnTrack keeps all connections in a slice of buckets, each of which holds a +// linked list of tuples. 
This gives us some desirable properties: +// - Each bucket has its own lock, lessening lock contention. +// - The slice is large enough that lists stay short (<10 elements on average). +// Thus traversal is fast. +// - During linked list traversal we reap expired connections. This amortizes +// the cost of reaping them and makes reapUnused faster. +// +// Locks are ordered by their location in the buckets slice. That is, a +// goroutine that locks buckets[i] can only lock buckets[j] s.t. i < j. +// +// +stateify savable +type ConnTrack struct { + // seed is a one-time random value initialized at stack startup + // and is used in the calculation of hash keys for the list of buckets. + // It is immutable. + seed uint32 - // Set network header. - hdr, ok := newPkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { - return - } - netHeader := header.IPv4(hdr) - newPkt.NetworkHeader = hdr - length := int(netHeader.HeaderLength()) + // mu protects the buckets slice, but not buckets' contents. Only take + // the write lock if you are modifying the slice or saving for S/R. + mu sync.RWMutex `state:"nosave"` - // TODO(gvisor.dev/issue/170): Need to support for other - // protocols as well. - // Set transport header. - switch protocol := netHeader.TransportProtocol(); protocol { - case header.UDPProtocolNumber: - if newPkt.TransportHeader == nil { - h, ok := newPkt.Data.PullUp(length + header.UDPMinimumSize) - if !ok { - return - } - newPkt.TransportHeader = buffer.View(header.UDP(h[length:])) - } - case header.TCPProtocolNumber: - if newPkt.TransportHeader == nil { - h, ok := newPkt.Data.PullUp(length + header.TCPMinimumSize) - if !ok { - return - } - newPkt.TransportHeader = buffer.View(header.TCP(h[length:])) - } - } - pkt.NetworkHeader = newPkt.NetworkHeader - pkt.TransportHeader = newPkt.TransportHeader + // buckets is protected by mu. + buckets []bucket } -// packetToTuple converts packet to a tuple in original direction. -func packetToTuple(pkt PacketBuffer, hook Hook) (connTrackTuple, *tcpip.Error) { - var tuple connTrackTuple +// +stateify savable +type bucket struct { + // mu protects tuples. + mu sync.Mutex `state:"nosave"` + tuples tupleList +} - netHeader := header.IPv4(pkt.NetworkHeader) +// packetToTupleID converts packet to a tuple ID. It fails when pkt lacks a valid +// TCP header. +func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) { // TODO(gvisor.dev/issue/170): Need to support for other // protocols as well. - if netHeader == nil || netHeader.TransportProtocol() != header.TCPProtocolNumber { - return tuple, tcpip.ErrUnknownProtocol + netHeader := header.IPv4(pkt.NetworkHeader().View()) + if len(netHeader) < header.IPv4MinimumSize || netHeader.TransportProtocol() != header.TCPProtocolNumber { + return tupleID{}, tcpip.ErrUnknownProtocol } - tcpHeader := header.TCP(pkt.TransportHeader) - if tcpHeader == nil { - return tuple, tcpip.ErrUnknownProtocol + tcpHeader := header.TCP(pkt.TransportHeader().View()) + if len(tcpHeader) < header.TCPMinimumSize { + return tupleID{}, tcpip.ErrUnknownProtocol } - tuple.src.addr = netHeader.SourceAddress() - tuple.src.port = tcpHeader.SourcePort() - tuple.src.protocol = header.IPv4ProtocolNumber - - tuple.dst.addr = netHeader.DestinationAddress() - tuple.dst.port = tcpHeader.DestinationPort() - tuple.dst.protocol = netHeader.TransportProtocol() - - return tuple, nil -} - -// getReplyTuple creates reply tuple for the given tuple. 
-func getReplyTuple(tuple connTrackTuple) connTrackTuple { - var replyTuple connTrackTuple - replyTuple.src.addr = tuple.dst.addr - replyTuple.src.port = tuple.dst.port - replyTuple.src.protocol = tuple.src.protocol - replyTuple.dst.addr = tuple.src.addr - replyTuple.dst.port = tuple.src.port - replyTuple.dst.protocol = tuple.dst.protocol - replyTuple.dst.direction = dirReply - - return replyTuple + return tupleID{ + srcAddr: netHeader.SourceAddress(), + srcPort: tcpHeader.SourcePort(), + dstAddr: netHeader.DestinationAddress(), + dstPort: tcpHeader.DestinationPort(), + transProto: netHeader.TransportProtocol(), + netProto: header.IPv4ProtocolNumber, + }, nil } -// makeNewConn creates new connection. -func makeNewConn(tuple, replyTuple connTrackTuple) connTrack { - var conn connTrack - conn.status = connNew - conn.originalTupleHolder.tuple = tuple - conn.originalTupleHolder.conn = &conn - conn.replyTupleHolder.tuple = replyTuple - conn.replyTupleHolder.conn = &conn - - return conn +// newConn creates new connection. +func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn { + conn := conn{ + manip: manip, + tcbHook: hook, + lastUsed: time.Now(), + } + conn.original = tuple{conn: &conn, tupleID: orig} + conn.reply = tuple{conn: &conn, tupleID: reply, direction: dirReply} + return &conn } -// getTupleHash returns hash of the tuple. The fields used for -// generating hash are seed (generated once for stack), source address, -// destination address, source port and destination ports. -func (ct *ConnTrackTable) getTupleHash(tuple connTrackTuple) uint32 { - h := jenkins.Sum32(ct.Seed) - h.Write([]byte(tuple.src.addr)) - h.Write([]byte(tuple.dst.addr)) - portBuf := make([]byte, 2) - binary.LittleEndian.PutUint16(portBuf, tuple.src.port) - h.Write([]byte(portBuf)) - binary.LittleEndian.PutUint16(portBuf, tuple.dst.port) - h.Write([]byte(portBuf)) - - return h.Sum32() +// connFor gets the conn for pkt if it exists, or returns nil +// if it does not. It returns an error when pkt does not contain a valid TCP +// header. +// TODO(gvisor.dev/issue/170): Only TCP packets are supported. Need to support +// other transport protocols. +func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) { + tid, err := packetToTupleID(pkt) + if err != nil { + return nil, dirOriginal + } + return ct.connForTID(tid) } -// connTrackForPacket returns connTrack for packet. -// TODO(gvisor.dev/issue/170): Only TCP packets are supported. Need to support other -// transport protocols. -func (ct *ConnTrackTable) connTrackForPacket(pkt *PacketBuffer, hook Hook, createConn bool) (*connTrack, ctDirection) { - if hook == Prerouting { - // Headers will not be set in Prerouting. - // TODO(gvisor.dev/issue/170): Change this after parsing headers - // code is added. - parseHeaders(pkt) +func (ct *ConnTrack) connForTID(tid tupleID) (*conn, direction) { + bucket := ct.bucket(tid) + now := time.Now() + + ct.mu.RLock() + defer ct.mu.RUnlock() + ct.buckets[bucket].mu.Lock() + defer ct.buckets[bucket].mu.Unlock() + + // Iterate over the tuples in a bucket, cleaning up any unused + // connections we find. + for other := ct.buckets[bucket].tuples.Front(); other != nil; other = other.Next() { + // Clean up any timed-out connections we happen to find. + if ct.reapTupleLocked(other, bucket, now) { + // The tuple expired. 
+ continue + } + if tid == other.tupleID { + return other.conn, other.direction + } } - var dir ctDirection - tuple, err := packetToTuple(*pkt, hook) - if err != nil { - return nil, dir - } - - ct.connMu.Lock() - defer ct.connMu.Unlock() - - connTrackTable := ct.CtMap - hash := ct.getTupleHash(tuple) - - var conn *connTrack - switch createConn { - case true: - // If connection does not exist for the hash, create a new - // connection. - replyTuple := getReplyTuple(tuple) - replyHash := ct.getTupleHash(replyTuple) - newConn := makeNewConn(tuple, replyTuple) - conn = &newConn - - // Add tupleHolders to the map. - // TODO(gvisor.dev/issue/170): Need to support collisions using linked list. - ct.CtMap[hash] = conn.originalTupleHolder - ct.CtMap[replyHash] = conn.replyTupleHolder - default: - tupleHolder, ok := connTrackTable[hash] - if !ok { - return nil, dir - } + return nil, dirOriginal +} - // If this is the reply of new connection, set the connection - // status as ESTABLISHED. - conn = tupleHolder.conn - if conn.status == connNew && tupleHolder.tuple.dst.direction == dirReply { - conn.status = connEstablished - } - if tupleHolder.conn == nil { - panic("tupleHolder has null connection tracking entry") - } +func (ct *ConnTrack) insertRedirectConn(pkt *PacketBuffer, hook Hook, rt RedirectTarget) *conn { + tid, err := packetToTupleID(pkt) + if err != nil { + return nil + } + if hook != Prerouting && hook != Output { + return nil + } - dir = tupleHolder.tuple.dst.direction + // Create a new connection and change the port as per the iptables + // rule. This tuple will be used to manipulate the packet in + // handlePacket. + replyTID := tid.reply() + replyTID.srcAddr = rt.MinIP + replyTID.srcPort = rt.MinPort + var manip manipType + switch hook { + case Prerouting: + manip = manipDstPrerouting + case Output: + manip = manipDstOutput } - return conn, dir + conn := newConn(tid, replyTID, manip, hook) + ct.insertConn(conn) + return conn } -// SetNatInfo will manipulate the tuples according to iptables NAT rules. -func (ct *ConnTrackTable) SetNatInfo(pkt *PacketBuffer, rt RedirectTarget, hook Hook) { - // Get the connection. Connection is always created before this - // function is called. - conn, _ := ct.connTrackForPacket(pkt, hook, false) - if conn == nil { - panic("connection should be created to manipulate tuples.") +// insertConn inserts conn into the appropriate table bucket. +func (ct *ConnTrack) insertConn(conn *conn) { + // Lock the buckets in the correct order. + tupleBucket := ct.bucket(conn.original.tupleID) + replyBucket := ct.bucket(conn.reply.tupleID) + ct.mu.RLock() + defer ct.mu.RUnlock() + if tupleBucket < replyBucket { + ct.buckets[tupleBucket].mu.Lock() + ct.buckets[replyBucket].mu.Lock() + } else if tupleBucket > replyBucket { + ct.buckets[replyBucket].mu.Lock() + ct.buckets[tupleBucket].mu.Lock() + } else { + // Both tuples are in the same bucket. + ct.buckets[tupleBucket].mu.Lock() } - replyTuple := conn.replyTupleHolder.tuple - replyHash := ct.getTupleHash(replyTuple) - // TODO(gvisor.dev/issue/170): Support only redirect of ports. Need to - // support changing of address for Prerouting. - - // Change the port as per the iptables rule. This tuple will be used - // to manipulate the packet in HandlePacket. - conn.replyTupleHolder.tuple.src.addr = rt.MinIP - conn.replyTupleHolder.tuple.src.port = rt.MinPort - newHash := ct.getTupleHash(conn.replyTupleHolder.tuple) + // Now that we hold the locks, ensure the tuple hasn't been inserted by + // another thread. 
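The ascending-bucket lock order in insertConn is what prevents two goroutines, each inserting the original and reply tuples of different connections, from deadlocking. A standalone sketch of the same rule with plain mutexes standing in for conntrack buckets; all names here are illustrative:

    package main

    import "sync"

    var buckets [1 << 14]struct{ mu sync.Mutex }

    // lockPair takes two bucket locks in ascending index order, mirroring the
    // rule that a goroutine holding buckets[i] may only lock buckets[j] with
    // i < j. It returns a function that releases whatever was locked.
    func lockPair(i, j int) (unlock func()) {
        switch {
        case i < j:
            buckets[i].mu.Lock()
            buckets[j].mu.Lock()
        case j < i:
            buckets[j].mu.Lock()
            buckets[i].mu.Lock()
        default:
            // The original and reply tuples hashed to the same bucket.
            buckets[i].mu.Lock()
        }
        return func() {
            buckets[i].mu.Unlock()
            if i != j {
                buckets[j].mu.Unlock()
            }
        }
    }

    func main() {
        unlock := lockPair(3, 11)
        // ... insert the original tuple into buckets[3] and the reply into
        // buckets[11] while both locks are held ...
        unlock()
    }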
+ alreadyInserted := false + for other := ct.buckets[tupleBucket].tuples.Front(); other != nil; other = other.Next() { + if other.tupleID == conn.original.tupleID { + alreadyInserted = true + break + } + } - // Add the changed tuple to the map. - ct.connMu.Lock() - defer ct.connMu.Unlock() - ct.CtMap[newHash] = conn.replyTupleHolder - if hook == Output { - conn.replyTupleHolder.conn.manip = manipDstOutput + if !alreadyInserted { + // Add the tuple to the map. + ct.buckets[tupleBucket].tuples.PushFront(&conn.original) + ct.buckets[replyBucket].tuples.PushFront(&conn.reply) } - // Delete the old tuple. - delete(ct.CtMap, replyHash) + // Unlocking can happen in any order. + ct.buckets[tupleBucket].mu.Unlock() + if tupleBucket != replyBucket { + ct.buckets[replyBucket].mu.Unlock() + } } // handlePacketPrerouting manipulates ports for packets in Prerouting hook. -// TODO(gvisor.dev/issue/170): Change address for Prerouting hook.. -func handlePacketPrerouting(pkt *PacketBuffer, conn *connTrack, dir ctDirection) { - netHeader := header.IPv4(pkt.NetworkHeader) - tcpHeader := header.TCP(pkt.TransportHeader) +// TODO(gvisor.dev/issue/170): Change address for Prerouting hook. +func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) { + // If this is a noop entry, don't do anything. + if conn.manip == manipNone { + return + } + + netHeader := header.IPv4(pkt.NetworkHeader().View()) + tcpHeader := header.TCP(pkt.TransportHeader().View()) // For prerouting redirection, packets going in the original direction // have their destinations modified and replies have their sources // modified. switch dir { case dirOriginal: - port := conn.replyTupleHolder.tuple.src.port + port := conn.reply.srcPort tcpHeader.SetDestinationPort(port) - netHeader.SetDestinationAddress(conn.replyTupleHolder.tuple.src.addr) + netHeader.SetDestinationAddress(conn.reply.srcAddr) case dirReply: - port := conn.originalTupleHolder.tuple.dst.port + port := conn.original.dstPort tcpHeader.SetSourcePort(port) - netHeader.SetSourceAddress(conn.originalTupleHolder.tuple.dst.addr) + netHeader.SetSourceAddress(conn.original.dstAddr) } + // TODO(gvisor.dev/issue/170): TCP checksums aren't usually validated + // on inbound packets, so we don't recalculate them. However, we should + // support cases when they are validated, e.g. when we can't offload + // receive checksumming. + netHeader.SetChecksum(0) netHeader.SetChecksum(^netHeader.CalculateChecksum()) } // handlePacketOutput manipulates ports for packets in Output hook. -func handlePacketOutput(pkt *PacketBuffer, conn *connTrack, gso *GSO, r *Route, dir ctDirection) { - netHeader := header.IPv4(pkt.NetworkHeader) - tcpHeader := header.TCP(pkt.TransportHeader) +func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir direction) { + // If this is a noop entry, don't do anything. + if conn.manip == manipNone { + return + } + + netHeader := header.IPv4(pkt.NetworkHeader().View()) + tcpHeader := header.TCP(pkt.TransportHeader().View()) // For output redirection, packets going in the original direction // have their destinations modified and replies have their sources // modified. For prerouting redirection, we only reach this point // when replying, so packet sources are modified. 
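The zero-then-recompute checksum pattern in handlePacketPrerouting applies to any rewrite of the IPv4 header. A small sketch against the header package; the package and function names are illustrative:

    package natsketch

    import (
        "gvisor.dev/gvisor/pkg/tcpip"
        "gvisor.dev/gvisor/pkg/tcpip/header"
    )

    // rewriteDst mirrors handlePacketPrerouting: change the IPv4 destination,
    // then zero the checksum field and store the complement of the freshly
    // calculated header checksum. ip is assumed to cover a full IPv4 header.
    func rewriteDst(ip header.IPv4, newDst tcpip.Address) {
        ip.SetDestinationAddress(newDst)
        ip.SetChecksum(0)
        ip.SetChecksum(^ip.CalculateChecksum())
    }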
if conn.manip == manipDstOutput && dir == dirOriginal { - port := conn.replyTupleHolder.tuple.src.port + port := conn.reply.srcPort tcpHeader.SetDestinationPort(port) - netHeader.SetDestinationAddress(conn.replyTupleHolder.tuple.src.addr) + netHeader.SetDestinationAddress(conn.reply.srcAddr) } else { - port := conn.originalTupleHolder.tuple.dst.port + port := conn.original.dstPort tcpHeader.SetSourcePort(port) - netHeader.SetSourceAddress(conn.originalTupleHolder.tuple.dst.addr) + netHeader.SetSourceAddress(conn.original.dstAddr) } // Calculate the TCP checksum and set it. tcpHeader.SetChecksum(0) - hdr := &pkt.Header - length := uint16(pkt.Data.Size()+hdr.UsedLength()) - uint16(netHeader.HeaderLength()) + length := uint16(pkt.Size()) - uint16(netHeader.HeaderLength()) xsum := r.PseudoHeaderChecksum(header.TCPProtocolNumber, length) if gso != nil && gso.NeedsCsum { tcpHeader.SetChecksum(xsum) @@ -402,33 +409,32 @@ func handlePacketOutput(pkt *PacketBuffer, conn *connTrack, gso *GSO, r *Route, netHeader.SetChecksum(^netHeader.CalculateChecksum()) } -// HandlePacket will manipulate the port and address of the packet if the -// connection exists. -func (ct *ConnTrackTable) HandlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Route) { +// handlePacket will manipulate the port and address of the packet if the +// connection exists. Returns whether, after the packet traverses the tables, +// it should create a new entry in the table. +func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Route) bool { if pkt.NatDone { - return + return false } if hook != Prerouting && hook != Output { - return + return false } - conn, dir := ct.connTrackForPacket(pkt, hook, false) - // Connection or Rule not found for the packet. - if conn == nil { - return + // TODO(gvisor.dev/issue/170): Support other transport protocols. + if nh := pkt.NetworkHeader().View(); nh.IsEmpty() || header.IPv4(nh).TransportProtocol() != header.TCPProtocolNumber { + return false } - netHeader := header.IPv4(pkt.NetworkHeader) - // TODO(gvisor.dev/issue/170): Need to support for other transport - // protocols as well. - if netHeader == nil || netHeader.TransportProtocol() != header.TCPProtocolNumber { - return + conn, dir := ct.connFor(pkt) + // Connection or Rule not found for the packet. + if conn == nil { + return true } - tcpHeader := header.TCP(pkt.TransportHeader) - if tcpHeader == nil { - return + tcpHeader := header.TCP(pkt.TransportHeader().View()) + if len(tcpHeader) < header.TCPMinimumSize { + return false } switch hook { @@ -442,39 +448,184 @@ func (ct *ConnTrackTable) HandlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r // Update the state of tcb. // TODO(gvisor.dev/issue/170): Add support in tcpcontrack to handle // other tcp states. - var st tcpconntrack.Result - if conn.tcb.IsEmpty() { - conn.tcb.Init(tcpHeader) - conn.tcbHook = hook - } else { - switch hook { - case conn.tcbHook: - st = conn.tcb.UpdateStateOutbound(tcpHeader) - default: - st = conn.tcb.UpdateStateInbound(tcpHeader) - } + conn.mu.Lock() + defer conn.mu.Unlock() + + // Mark the connection as having been used recently so it isn't reaped. + conn.lastUsed = time.Now() + // Update connection state. + conn.updateLocked(header.TCP(pkt.TransportHeader().View()), hook) + + return false +} + +// maybeInsertNoop tries to insert a no-op connection entry to keep connections +// from getting clobbered when replies arrive. It only inserts if there isn't +// already a connection for pkt. 
+// +// This should be called after traversing iptables rules only, to ensure that +// pkt.NatDone is set correctly. +func (ct *ConnTrack) maybeInsertNoop(pkt *PacketBuffer, hook Hook) { + // If there were a rule applying to this packet, it would be marked + // with NatDone. + if pkt.NatDone { + return } - // Delete conntrack if tcp connection is closed. - if st == tcpconntrack.ResultClosedByPeer || st == tcpconntrack.ResultClosedBySelf || st == tcpconntrack.ResultReset { - ct.deleteConnTrack(conn) + // We only track TCP connections. + if nh := pkt.NetworkHeader().View(); nh.IsEmpty() || header.IPv4(nh).TransportProtocol() != header.TCPProtocolNumber { + return } -} -// deleteConnTrack deletes the connection. -func (ct *ConnTrackTable) deleteConnTrack(conn *connTrack) { - if conn == nil { + // This is the first packet we're seeing for the TCP connection. Insert + // the noop entry (an identity mapping) so that the response doesn't + // get NATed, breaking the connection. + tid, err := packetToTupleID(pkt) + if err != nil { return } + conn := newConn(tid, tid.reply(), manipNone, hook) + conn.updateLocked(header.TCP(pkt.TransportHeader().View()), hook) + ct.insertConn(conn) +} - tuple := conn.originalTupleHolder.tuple - hash := ct.getTupleHash(tuple) - replyTuple := conn.replyTupleHolder.tuple - replyHash := ct.getTupleHash(replyTuple) +// bucket gets the conntrack bucket for a tupleID. +func (ct *ConnTrack) bucket(id tupleID) int { + h := jenkins.Sum32(ct.seed) + h.Write([]byte(id.srcAddr)) + h.Write([]byte(id.dstAddr)) + shortBuf := make([]byte, 2) + binary.LittleEndian.PutUint16(shortBuf, id.srcPort) + h.Write([]byte(shortBuf)) + binary.LittleEndian.PutUint16(shortBuf, id.dstPort) + h.Write([]byte(shortBuf)) + binary.LittleEndian.PutUint16(shortBuf, uint16(id.transProto)) + h.Write([]byte(shortBuf)) + binary.LittleEndian.PutUint16(shortBuf, uint16(id.netProto)) + h.Write([]byte(shortBuf)) + ct.mu.RLock() + defer ct.mu.RUnlock() + return int(h.Sum32()) % len(ct.buckets) +} - ct.connMu.Lock() - defer ct.connMu.Unlock() +// reapUnused deletes timed out entries from the conntrack map. The rules for +// reaping are: +// - Most reaping occurs in connFor, which is called on each packet. connFor +// cleans up the bucket the packet's connection maps to. Thus calls to +// reapUnused should be fast. +// - Each call to reapUnused traverses a fraction of the conntrack table. +// Specifically, it traverses len(ct.buckets)/fractionPerReaping. +// - After reaping, reapUnused decides when it should next run based on the +// ratio of expired connections to examined connections. If the ratio is +// greater than maxExpiredPct, it schedules the next run quickly. Otherwise it +// slightly increases the interval between runs. +// - maxFullTraversal caps the time it takes to traverse the entire table. +// +// reapUnused returns the next bucket that should be checked and the time after +// which it should be called again. +func (ct *ConnTrack) reapUnused(start int, prevInterval time.Duration) (int, time.Duration) { + // TODO(gvisor.dev/issue/170): This can be more finely controlled, as + // it is in Linux via sysctl. 
+ const fractionPerReaping = 128 + const maxExpiredPct = 50 + const maxFullTraversal = 60 * time.Second + const minInterval = 10 * time.Millisecond + const maxInterval = maxFullTraversal / fractionPerReaping + + now := time.Now() + checked := 0 + expired := 0 + var idx int + ct.mu.RLock() + defer ct.mu.RUnlock() + for i := 0; i < len(ct.buckets)/fractionPerReaping; i++ { + idx = (i + start) % len(ct.buckets) + ct.buckets[idx].mu.Lock() + for tuple := ct.buckets[idx].tuples.Front(); tuple != nil; tuple = tuple.Next() { + checked++ + if ct.reapTupleLocked(tuple, idx, now) { + expired++ + } + } + ct.buckets[idx].mu.Unlock() + } + // We already checked buckets[idx]. + idx++ + + // If half or more of the connections are expired, the table has gotten + // stale. Reschedule quickly. + expiredPct := 0 + if checked != 0 { + expiredPct = expired * 100 / checked + } + if expiredPct > maxExpiredPct { + return idx, minInterval + } + if interval := prevInterval + minInterval; interval <= maxInterval { + // Increment the interval between runs. + return idx, interval + } + // We've hit the maximum interval. + return idx, maxInterval +} + +// reapTupleLocked tries to remove tuple and its reply from the table. It +// returns whether the tuple's connection has timed out. +// +// Preconditions: ct.mu is locked for reading and bucket is locked. +func (ct *ConnTrack) reapTupleLocked(tuple *tuple, bucket int, now time.Time) bool { + if !tuple.conn.timedOut(now) { + return false + } + + // To maintain lock order, we can only reap these tuples if the reply + // appears later in the table. + replyBucket := ct.bucket(tuple.reply()) + if bucket > replyBucket { + return true + } + + // Don't re-lock if both tuples are in the same bucket. + differentBuckets := bucket != replyBucket + if differentBuckets { + ct.buckets[replyBucket].mu.Lock() + } + + // We have the buckets locked and can remove both tuples. + if tuple.direction == dirOriginal { + ct.buckets[replyBucket].tuples.Remove(&tuple.conn.reply) + } else { + ct.buckets[replyBucket].tuples.Remove(&tuple.conn.original) + } + ct.buckets[bucket].tuples.Remove(tuple) + + // Don't re-unlock if both tuples are in the same bucket. + if differentBuckets { + ct.buckets[replyBucket].mu.Unlock() + } + + return true +} + +func (ct *ConnTrack) originalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { + // Lookup the connection. The reply's original destination + // describes the original address. + tid := tupleID{ + srcAddr: epID.LocalAddress, + srcPort: epID.LocalPort, + dstAddr: epID.RemoteAddress, + dstPort: epID.RemotePort, + transProto: header.TCPProtocolNumber, + netProto: header.IPv4ProtocolNumber, + } + conn, _ := ct.connForTID(tid) + if conn == nil { + // Not a tracked connection. + return "", 0, tcpip.ErrNotConnected + } else if conn.manip == manipNone { + // Unmanipulated connection. + return "", 0, tcpip.ErrInvalidOptionValue + } - delete(ct.CtMap, hash) - delete(ct.CtMap, replyHash) + return conn.original.dstAddr, conn.original.dstPort, nil } diff --git a/pkg/tcpip/stack/fake_time_test.go b/pkg/tcpip/stack/fake_time_test.go new file mode 100644 index 000000000..92c8cb534 --- /dev/null +++ b/pkg/tcpip/stack/fake_time_test.go @@ -0,0 +1,209 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "container/heap" + "sync" + "time" + + "github.com/dpjacques/clockwork" + "gvisor.dev/gvisor/pkg/tcpip" +) + +type fakeClock struct { + clock clockwork.FakeClock + + // mu protects the fields below. + mu sync.RWMutex + + // times is min-heap of times. A heap is used for quick retrieval of the next + // upcoming time of scheduled work. + times *timeHeap + + // waitGroups stores one WaitGroup for all work scheduled to execute at the + // same time via AfterFunc. This allows parallel execution of all functions + // passed to AfterFunc scheduled for the same time. + waitGroups map[time.Time]*sync.WaitGroup +} + +func newFakeClock() *fakeClock { + return &fakeClock{ + clock: clockwork.NewFakeClock(), + times: &timeHeap{}, + waitGroups: make(map[time.Time]*sync.WaitGroup), + } +} + +var _ tcpip.Clock = (*fakeClock)(nil) + +// NowNanoseconds implements tcpip.Clock.NowNanoseconds. +func (fc *fakeClock) NowNanoseconds() int64 { + return fc.clock.Now().UnixNano() +} + +// NowMonotonic implements tcpip.Clock.NowMonotonic. +func (fc *fakeClock) NowMonotonic() int64 { + return fc.NowNanoseconds() +} + +// AfterFunc implements tcpip.Clock.AfterFunc. +func (fc *fakeClock) AfterFunc(d time.Duration, f func()) tcpip.Timer { + until := fc.clock.Now().Add(d) + wg := fc.addWait(until) + return &fakeTimer{ + clock: fc, + until: until, + timer: fc.clock.AfterFunc(d, func() { + defer wg.Done() + f() + }), + } +} + +// addWait adds an additional wait to the WaitGroup for parallel execution of +// all work scheduled for t. Returns a reference to the WaitGroup modified. +func (fc *fakeClock) addWait(t time.Time) *sync.WaitGroup { + fc.mu.RLock() + wg, ok := fc.waitGroups[t] + fc.mu.RUnlock() + + if ok { + wg.Add(1) + return wg + } + + fc.mu.Lock() + heap.Push(fc.times, t) + fc.mu.Unlock() + + wg = &sync.WaitGroup{} + wg.Add(1) + + fc.mu.Lock() + fc.waitGroups[t] = wg + fc.mu.Unlock() + + return wg +} + +// removeWait removes a wait from the WaitGroup for parallel execution of all +// work scheduled for t. +func (fc *fakeClock) removeWait(t time.Time) { + fc.mu.RLock() + defer fc.mu.RUnlock() + + wg := fc.waitGroups[t] + wg.Done() +} + +// advance executes all work that have been scheduled to execute within d from +// the current fake time. Blocks until all work has completed execution. +func (fc *fakeClock) advance(d time.Duration) { + // Block until all the work is done + until := fc.clock.Now().Add(d) + for { + fc.mu.Lock() + if fc.times.Len() == 0 { + fc.mu.Unlock() + return + } + + t := heap.Pop(fc.times).(time.Time) + if t.After(until) { + // No work to do + heap.Push(fc.times, t) + fc.mu.Unlock() + return + } + fc.mu.Unlock() + + diff := t.Sub(fc.clock.Now()) + fc.clock.Advance(diff) + + fc.mu.RLock() + wg := fc.waitGroups[t] + fc.mu.RUnlock() + + wg.Wait() + + fc.mu.Lock() + delete(fc.waitGroups, t) + fc.mu.Unlock() + } +} + +type fakeTimer struct { + clock *fakeClock + timer clockwork.Timer + + mu sync.RWMutex + until time.Time +} + +var _ tcpip.Timer = (*fakeTimer)(nil) + +// Reset implements tcpip.Timer.Reset. 
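A sketch of how a test in package stack could drive this fake clock, using only the helpers defined above (newFakeClock, AfterFunc, advance); the test name is illustrative:

    package stack

    import (
        "testing"
        "time"
    )

    func TestFakeClockSketch(t *testing.T) {
        fc := newFakeClock()

        fired := make(chan struct{})
        fc.AfterFunc(2*time.Second, func() { close(fired) })

        // advance blocks until every callback scheduled within the window has
        // finished, so the timer is guaranteed to have fired by the time it
        // returns.
        fc.advance(3 * time.Second)

        select {
        case <-fired:
        default:
            t.Fatal("timer did not fire after advancing the fake clock")
        }
    }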
+func (ft *fakeTimer) Reset(d time.Duration) { + if !ft.timer.Reset(d) { + return + } + + ft.mu.Lock() + defer ft.mu.Unlock() + + ft.clock.removeWait(ft.until) + ft.until = ft.clock.clock.Now().Add(d) + ft.clock.addWait(ft.until) +} + +// Stop implements tcpip.Timer.Stop. +func (ft *fakeTimer) Stop() bool { + if !ft.timer.Stop() { + return false + } + + ft.mu.RLock() + defer ft.mu.RUnlock() + + ft.clock.removeWait(ft.until) + return true +} + +type timeHeap []time.Time + +var _ heap.Interface = (*timeHeap)(nil) + +func (h timeHeap) Len() int { + return len(h) +} + +func (h timeHeap) Less(i, j int) bool { + return h[i].Before(h[j]) +} + +func (h timeHeap) Swap(i, j int) { + h[i], h[j] = h[j], h[i] +} + +func (h *timeHeap) Push(x interface{}) { + *h = append(*h, x.(time.Time)) +} + +func (h *timeHeap) Pop() interface{} { + last := (*h)[len(*h)-1] + *h = (*h)[:len(*h)-1] + return last +} diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/forwarder.go index 6b64cd37f..3eff141e6 100644 --- a/pkg/tcpip/stack/forwarder.go +++ b/pkg/tcpip/stack/forwarder.go @@ -32,7 +32,7 @@ type pendingPacket struct { nic *NIC route *Route proto tcpip.NetworkProtocolNumber - pkt PacketBuffer + pkt *PacketBuffer } type forwardQueue struct { @@ -50,7 +50,7 @@ func newForwardQueue() *forwardQueue { return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)} } -func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) { +func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { shouldWait := false f.Lock() diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go index 8084d50bc..944f622fd 100644 --- a/pkg/tcpip/stack/forwarder_test.go +++ b/pkg/tcpip/stack/forwarder_test.go @@ -22,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" ) const ( @@ -33,6 +34,10 @@ const ( // except where another value is explicitly used. It is chosen to match // the MTU of loopback interfaces on linux systems. fwdTestNetDefaultMTU = 65536 + + dstAddrOffset = 0 + srcAddrOffset = 1 + protocolNumberOffset = 2 ) // fwdTestNetworkEndpoint is a network-layer protocol endpoint. @@ -68,16 +73,9 @@ func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID { return &f.id } -func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt PacketBuffer) { - // Consume the network header. - b, ok := pkt.Data.PullUp(fwdTestNetHeaderLen) - if !ok { - return - } - pkt.Data.TrimFront(fwdTestNetHeaderLen) - +func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) { // Dispatch the packet to the transport protocol. - f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt) + f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader().View()[protocolNumberOffset]), pkt) } func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 { @@ -96,13 +94,13 @@ func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNu return f.proto.Number() } -func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error { +func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error { // Add the protocol's header to the packet and send it to the link // endpoint. 
- b := pkt.Header.Prepend(fwdTestNetHeaderLen) - b[0] = r.RemoteAddress[0] - b[1] = f.id.LocalAddress[0] - b[2] = byte(params.Protocol) + b := pkt.NetworkHeader().Push(fwdTestNetHeaderLen) + b[dstAddrOffset] = r.RemoteAddress[0] + b[srcAddrOffset] = f.id.LocalAddress[0] + b[protocolNumberOffset] = byte(params.Protocol) return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt) } @@ -112,7 +110,7 @@ func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBuf panic("not implemented") } -func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error { +func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported } @@ -123,10 +121,12 @@ func (*fwdTestNetworkEndpoint) Close() {} type fwdTestNetworkProtocol struct { addrCache *linkAddrCache addrResolveDelay time.Duration - onLinkAddressResolved func(cache *linkAddrCache, addr tcpip.Address) + onLinkAddressResolved func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool) } +var _ LinkAddressResolver = (*fwdTestNetworkProtocol)(nil) + func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber { return fwdTestNetNumber } @@ -140,7 +140,15 @@ func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int { } func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { - return tcpip.Address(v[1:2]), tcpip.Address(v[0:1]) + return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1]) +} + +func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) { + netHeader, ok := pkt.NetworkHeader().Consume(fwdTestNetHeaderLen) + if !ok { + return 0, false, false + } + return tcpip.TransportProtocolNumber(netHeader[protocolNumberOffset]), true, true } func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) { @@ -166,10 +174,10 @@ func (f *fwdTestNetworkProtocol) Close() {} func (f *fwdTestNetworkProtocol) Wait() {} -func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error { +func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { if f.addrCache != nil && f.onLinkAddressResolved != nil { time.AfterFunc(f.addrResolveDelay, func() { - f.onLinkAddressResolved(f.addrCache, addr) + f.onLinkAddressResolved(f.addrCache, addr, remoteLinkAddr) }) } return nil @@ -190,7 +198,7 @@ func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumb type fwdTestPacketInfo struct { RemoteLinkAddress tcpip.LinkAddress LocalLinkAddress tcpip.LinkAddress - Pkt PacketBuffer + Pkt *PacketBuffer } type fwdTestLinkEndpoint struct { @@ -203,13 +211,13 @@ type fwdTestLinkEndpoint struct { } // InjectInbound injects an inbound packet. -func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) { +func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { e.InjectLinkAddr(protocol, "", pkt) } // InjectLinkAddr injects an inbound packet with a remote link address. 
-func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt PacketBuffer) { - e.dispatcher.DeliverNetworkPacket(e, remote, "" /* local */, protocol, pkt) +func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) { + e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt) } // Attach saves the stack network-layer dispatcher for use later when packets @@ -251,7 +259,7 @@ func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress { return e.linkAddr } -func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error { +func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error { p := fwdTestPacketInfo{ RemoteLinkAddress: r.RemoteLinkAddress, LocalLinkAddress: r.LocalLinkAddress, @@ -270,7 +278,7 @@ func (e fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.Netw func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { n := 0 for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() { - e.WritePacket(r, gso, protocol, *pkt) + e.WritePacket(r, gso, protocol, pkt) n++ } @@ -280,7 +288,7 @@ func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBuffer // WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error { p := fwdTestPacketInfo{ - Pkt: PacketBuffer{Data: vv}, + Pkt: NewPacketBuffer(PacketBufferOptions{Data: vv}), } select { @@ -294,6 +302,16 @@ func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Er // Wait implements stack.LinkEndpoint.Wait. func (*fwdTestLinkEndpoint) Wait() {} +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*fwdTestLinkEndpoint) ARPHardwareType() header.ARPHardwareType { + panic("not implemented") +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *fwdTestLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + panic("not implemented") +} + func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) { // Create a stack with the network protocol and two NICs. s := New(Options{ @@ -361,10 +379,10 @@ func TestForwardingWithStaticResolver(t *testing.T) { // Inject an inbound packet to address 3 on NIC 1, and see if it is // forwarded to NIC 2. buf := buffer.NewView(30) - buf[0] = 3 - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) var p fwdTestPacketInfo @@ -387,7 +405,7 @@ func TestForwardingWithFakeResolver(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any address will be resolved to the link address "c". 
cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, @@ -398,10 +416,10 @@ func TestForwardingWithFakeResolver(t *testing.T) { // Inject an inbound packet to address 3 on NIC 1, and see if it is // forwarded to NIC 2. buf := buffer.NewView(30) - buf[0] = 3 - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) var p fwdTestPacketInfo @@ -429,10 +447,10 @@ func TestForwardingWithNoResolver(t *testing.T) { // inject an inbound packet to address 3 on NIC 1, and see if it is // forwarded to NIC 2. buf := buffer.NewView(30) - buf[0] = 3 - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) select { case <-ep2.C: @@ -445,7 +463,7 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Only packets to address 3 will be resolved to the // link address "c". if addr == "\x03" { @@ -459,18 +477,18 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) { // Inject an inbound packet to address 4 on NIC 1. This packet should // not be forwarded. buf := buffer.NewView(30) - buf[0] = 4 - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + buf[dstAddrOffset] = 4 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) // Inject an inbound packet to address 3 on NIC 1, and see if it is // forwarded to NIC 2. buf = buffer.NewView(30) - buf[0] = 3 - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) var p fwdTestPacketInfo @@ -480,9 +498,8 @@ func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) { t.Fatal("packet not forwarded") } - b := p.Pkt.Data.ToView() - if b[0] != 3 { - t.Fatalf("got b[0] = %d, want = 3", b[0]) + if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 { + t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset]) } // Test that the address resolution happened correctly. @@ -498,7 +515,7 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any packets will be resolved to the link address "c". cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, @@ -509,10 +526,10 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) { // Inject two inbound packets to address 3 on NIC 1. 
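The forwarding tests all use the same receive-or-timeout idiom on the outgoing endpoint's packet channel. A minimal standalone sketch of that idiom with a generic channel and an arbitrary timeout:

package main

import (
	"fmt"
	"time"
)

// waitForPacket returns the next value from c, or false if nothing arrives
// within timeout. This is the select idiom the tests use on ep2.C.
func waitForPacket(c <-chan string, timeout time.Duration) (string, bool) {
	select {
	case p := <-c:
		return p, true
	case <-time.After(timeout):
		return "", false
	}
}

func main() {
	c := make(chan string, 1)
	c <- "forwarded packet"
	if p, ok := waitForPacket(c, time.Second); ok {
		fmt.Println(p)
	}
	if _, ok := waitForPacket(c, 10*time.Millisecond); !ok {
		fmt.Println("packet not forwarded")
	}
}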
for i := 0; i < 2; i++ { buf := buffer.NewView(30) - buf[0] = 3 - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + buf[dstAddrOffset] = 3 + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) } for i := 0; i < 2; i++ { @@ -524,9 +541,8 @@ func TestForwardingWithFakeResolverTwoPackets(t *testing.T) { t.Fatal("packet not forwarded") } - b := p.Pkt.Data.ToView() - if b[0] != 3 { - t.Fatalf("got b[0] = %d, want = 3", b[0]) + if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] != 3 { + t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", nh[dstAddrOffset]) } // Test that the address resolution happened correctly. @@ -543,7 +559,7 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any packets will be resolved to the link address "c". cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, @@ -554,12 +570,12 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) { for i := 0; i < maxPendingPacketsPerResolution+5; i++ { // Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1. buf := buffer.NewView(30) - buf[0] = 3 + buf[dstAddrOffset] = 3 // Set the packet sequence number. binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i)) - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) } for i := 0; i < maxPendingPacketsPerResolution; i++ { @@ -571,14 +587,19 @@ func TestForwardingWithFakeResolverManyPackets(t *testing.T) { t.Fatal("packet not forwarded") } - b := p.Pkt.Data.ToView() - if b[0] != 3 { - t.Fatalf("got b[0] = %d, want = 3", b[0]) + b := PayloadSince(p.Pkt.NetworkHeader()) + if b[dstAddrOffset] != 3 { + t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset]) } - // The first 5 packets should not be forwarded so the the - // sequemnce number should start with 5. + if len(b) < fwdTestNetHeaderLen+2 { + t.Fatalf("packet is too short to hold a sequence number: len(b) = %d", b) + } + seqNumBuf := b[fwdTestNetHeaderLen:] + + // The first 5 packets should not be forwarded so the sequence number should + // start with 5. want := uint16(i + 5) - if n := binary.BigEndian.Uint16(b[fwdTestNetHeaderLen:]); n != want { + if n := binary.BigEndian.Uint16(seqNumBuf); n != want { t.Fatalf("got the packet #%d, want = #%d", n, want) } @@ -596,7 +617,7 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) { // Create a network protocol with a fake resolver. proto := &fwdTestNetworkProtocol{ addrResolveDelay: 500 * time.Millisecond, - onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) { + onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address, _ tcpip.LinkAddress) { // Any packets will be resolved to the link address "c". cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c") }, @@ -609,10 +630,10 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) { // Each packet has a different destination address (3 to // maxPendingResolutions + 7). 
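The many-packets test tags each payload with a 16-bit big-endian sequence number placed immediately after the network header, and now checks the packet is long enough before reading it back. A small standalone sketch of that encode/decode:

package main

import (
	"encoding/binary"
	"fmt"
)

const fwdTestNetHeaderLen = 3

// setSeqNum writes a big-endian sequence number right after the header.
func setSeqNum(buf []byte, n uint16) {
	binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], n)
}

// seqNum reads it back, guarding against short packets first.
func seqNum(buf []byte) (uint16, bool) {
	if len(buf) < fwdTestNetHeaderLen+2 {
		return 0, false
	}
	return binary.BigEndian.Uint16(buf[fwdTestNetHeaderLen:]), true
}

func main() {
	buf := make([]byte, 30)
	setSeqNum(buf, 5)
	n, ok := seqNum(buf)
	fmt.Println(n, ok) // 5 true
}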
buf := buffer.NewView(30) - buf[0] = byte(3 + i) - ep1.InjectInbound(fwdTestNetNumber, PacketBuffer{ + buf[dstAddrOffset] = byte(3 + i) + ep1.InjectInbound(fwdTestNetNumber, NewPacketBuffer(PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) } for i := 0; i < maxPendingResolutions; i++ { @@ -626,9 +647,8 @@ func TestForwardingWithFakeResolverManyResolutions(t *testing.T) { // The first 5 packets (address 3 to 7) should not be forwarded // because their address resolutions are interrupted. - b := p.Pkt.Data.ToView() - if b[0] < 8 { - t.Fatalf("got b[0] = %d, want b[0] >= 8", b[0]) + if nh := PayloadSince(p.Pkt.NetworkHeader()); nh[dstAddrOffset] < 8 { + t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", nh[dstAddrOffset]) } // Test that the address resolution happened correctly. diff --git a/pkg/tcpip/stack/headertype_string.go b/pkg/tcpip/stack/headertype_string.go new file mode 100644 index 000000000..5efddfaaf --- /dev/null +++ b/pkg/tcpip/stack/headertype_string.go @@ -0,0 +1,39 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at // +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by "stringer -type headerType ."; DO NOT EDIT. + +package stack + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[linkHeader-0] + _ = x[networkHeader-1] + _ = x[transportHeader-2] + _ = x[numHeaderType-3] +} + +const _headerType_name = "linkHeadernetworkHeadertransportHeadernumHeaderType" + +var _headerType_index = [...]uint8{0, 10, 23, 38, 51} + +func (i headerType) String() string { + if i < 0 || i >= headerType(len(_headerType_index)-1) { + return "headerType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _headerType_name[_headerType_index[i]:_headerType_index[i+1]] +} diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go index 443423b3c..c37da814f 100644 --- a/pkg/tcpip/stack/iptables.go +++ b/pkg/tcpip/stack/iptables.go @@ -16,40 +16,49 @@ package stack import ( "fmt" - "strings" + "time" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/header" ) -// Table names. +// tableID is an index into IPTables.tables. +type tableID int + const ( - TablenameNat = "nat" - TablenameMangle = "mangle" - TablenameFilter = "filter" + natID tableID = iota + mangleID + filterID + numTables ) -// Chain names as defined by net/ipv4/netfilter/ip_tables.c. +// Table names. const ( - ChainNamePrerouting = "PREROUTING" - ChainNameInput = "INPUT" - ChainNameForward = "FORWARD" - ChainNameOutput = "OUTPUT" - ChainNamePostrouting = "POSTROUTING" + NATTable = "nat" + MangleTable = "mangle" + FilterTable = "filter" ) +// nameToID is immutable. +var nameToID = map[string]tableID{ + NATTable: natID, + MangleTable: mangleID, + FilterTable: filterID, +} + // HookUnset indicates that there is no hook set for an entrypoint or // underflow. 
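The string-keyed table map is being replaced by a fixed array indexed by tableID, with nameToID translating the user-visible table names into slots. A standalone sketch of that lookup pattern with a stand-in Table type (only the pattern is shown, not the real stack types):

package main

import "fmt"

type tableID int

const (
	natID tableID = iota
	mangleID
	filterID
	numTables
)

const (
	NATTable    = "nat"
	MangleTable = "mangle"
	FilterTable = "filter"
)

// nameToID is immutable, so it needs no locking.
var nameToID = map[string]tableID{
	NATTable:    natID,
	MangleTable: mangleID,
	FilterTable: filterID,
}

// table stands in for stack.Table in this sketch.
type table struct{ rules int }

type ipTables struct {
	tables [numTables]table
}

// getTable resolves a user-visible name to a slot in the fixed array.
func (it *ipTables) getTable(name string) (table, bool) {
	id, ok := nameToID[name]
	if !ok {
		return table{}, false
	}
	return it.tables[id], true
}

func main() {
	it := &ipTables{}
	it.tables[filterID] = table{rules: 4}
	t, ok := it.getTable(FilterTable)
	fmt.Println(t.rules, ok) // 4 true
	_, ok = it.getTable("security")
	fmt.Println(ok) // false: only builtin tables exist
}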
const HookUnset = -1 +// reaperDelay is how long to wait before starting to reap connections. +const reaperDelay = 5 * time.Second + // DefaultTables returns a default set of tables. Each chain is set to accept // all packets. -func DefaultTables() IPTables { - // TODO(gvisor.dev/issue/170): We may be able to swap out some strings for - // iotas. - return IPTables{ - Tables: map[string]Table{ - TablenameNat: Table{ +func DefaultTables() *IPTables { + return &IPTables{ + tables: [numTables]Table{ + natID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, @@ -57,65 +66,71 @@ func DefaultTables() IPTables { Rule{Target: AcceptTarget{}}, Rule{Target: ErrorTarget{}}, }, - BuiltinChains: map[Hook]int{ + BuiltinChains: [NumHooks]int{ Prerouting: 0, Input: 1, + Forward: HookUnset, Output: 2, Postrouting: 3, }, - Underflows: map[Hook]int{ + Underflows: [NumHooks]int{ Prerouting: 0, Input: 1, + Forward: HookUnset, Output: 2, Postrouting: 3, }, - UserChains: map[string]int{}, }, - TablenameMangle: Table{ + mangleID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, Rule{Target: ErrorTarget{}}, }, - BuiltinChains: map[Hook]int{ + BuiltinChains: [NumHooks]int{ Prerouting: 0, Output: 1, }, - Underflows: map[Hook]int{ - Prerouting: 0, - Output: 1, + Underflows: [NumHooks]int{ + Prerouting: 0, + Input: HookUnset, + Forward: HookUnset, + Output: 1, + Postrouting: HookUnset, }, - UserChains: map[string]int{}, }, - TablenameFilter: Table{ + filterID: Table{ Rules: []Rule{ Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, Rule{Target: AcceptTarget{}}, Rule{Target: ErrorTarget{}}, }, - BuiltinChains: map[Hook]int{ - Input: 0, - Forward: 1, - Output: 2, + BuiltinChains: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, }, - Underflows: map[Hook]int{ - Input: 0, - Forward: 1, - Output: 2, + Underflows: [NumHooks]int{ + Prerouting: HookUnset, + Input: 0, + Forward: 1, + Output: 2, + Postrouting: HookUnset, }, - UserChains: map[string]int{}, }, }, - Priorities: map[Hook][]string{ - Input: []string{TablenameNat, TablenameFilter}, - Prerouting: []string{TablenameMangle, TablenameNat}, - Output: []string{TablenameMangle, TablenameNat, TablenameFilter}, + priorities: [NumHooks][]tableID{ + Prerouting: []tableID{mangleID, natID}, + Input: []tableID{natID, filterID}, + Output: []tableID{mangleID, natID, filterID}, }, - connections: ConnTrackTable{ - CtMap: make(map[uint32]ConnTrackTupleHolder), - Seed: generateRandUint32(), + connections: ConnTrack{ + seed: generateRandUint32(), }, + reaperDone: make(chan struct{}, 1), } } @@ -124,41 +139,61 @@ func DefaultTables() IPTables { func EmptyFilterTable() Table { return Table{ Rules: []Rule{}, - BuiltinChains: map[Hook]int{ - Input: HookUnset, - Forward: HookUnset, - Output: HookUnset, + BuiltinChains: [NumHooks]int{ + Prerouting: HookUnset, + Postrouting: HookUnset, }, - Underflows: map[Hook]int{ - Input: HookUnset, - Forward: HookUnset, - Output: HookUnset, + Underflows: [NumHooks]int{ + Prerouting: HookUnset, + Postrouting: HookUnset, }, - UserChains: map[string]int{}, } } -// EmptyNatTable returns a Table with no rules and the filter table chains +// EmptyNATTable returns a Table with no rules and the filter table chains // mapped to HookUnset. 
-func EmptyNatTable() Table { +func EmptyNATTable() Table { return Table{ Rules: []Rule{}, - BuiltinChains: map[Hook]int{ - Prerouting: HookUnset, - Input: HookUnset, - Output: HookUnset, - Postrouting: HookUnset, + BuiltinChains: [NumHooks]int{ + Forward: HookUnset, }, - Underflows: map[Hook]int{ - Prerouting: HookUnset, - Input: HookUnset, - Output: HookUnset, - Postrouting: HookUnset, + Underflows: [NumHooks]int{ + Forward: HookUnset, }, - UserChains: map[string]int{}, } } +// GetTable returns a table by name. +func (it *IPTables) GetTable(name string) (Table, bool) { + id, ok := nameToID[name] + if !ok { + return Table{}, false + } + it.mu.RLock() + defer it.mu.RUnlock() + return it.tables[id], true +} + +// ReplaceTable replaces or inserts table by name. +func (it *IPTables) ReplaceTable(name string, table Table) *tcpip.Error { + id, ok := nameToID[name] + if !ok { + return tcpip.ErrInvalidOptionValue + } + it.mu.Lock() + defer it.mu.Unlock() + // If iptables is being enabled, initialize the conntrack table and + // reaper. + if !it.modified { + it.connections.buckets = make([]bucket, numBuckets) + it.startReaper(reaperDelay) + } + it.modified = true + it.tables[id] = table + return nil +} + // A chainVerdict is what a table decides should be done with a packet. type chainVerdict int @@ -180,13 +215,27 @@ const ( // // Precondition: pkt.NetworkHeader is set. func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool { + // Many users never configure iptables. Spare them the cost of rule + // traversal if rules have never been set. + it.mu.RLock() + defer it.mu.RUnlock() + if !it.modified { + return true + } + // Packets are manipulated only if connection and matching // NAT rule exists. - it.connections.HandlePacket(pkt, hook, gso, r) + shouldTrack := it.connections.handlePacket(pkt, hook, gso, r) // Go through each table containing the hook. - for _, tablename := range it.Priorities[hook] { - table := it.Tables[tablename] + priorities := it.priorities[hook] + for _, tableID := range priorities { + // If handlePacket already NATed the packet, we don't need to + // check the NAT table. + if tableID == natID && pkt.NatDone { + continue + } + table := it.tables[tableID] ruleIdx := table.BuiltinChains[hook] switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict { // If the table returns Accept, move on to the next table. @@ -215,17 +264,59 @@ func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, addr } } + // If this connection should be tracked, try to add an entry for it. If + // traversing the nat table didn't end in adding an entry, + // maybeInsertNoop will add a no-op entry for the connection. This is + // needeed when establishing connections so that the SYN/ACK reply to an + // outgoing SYN is delivered to the correct endpoint rather than being + // redirected by a prerouting rule. + // + // From the iptables documentation: "If there is no rule, a `null' + // binding is created: this usually does not map the packet, but exists + // to ensure we don't map another stream over an existing one." + if shouldTrack { + it.connections.maybeInsertNoop(pkt, hook) + } + // Every table returned Accept. return true } +// beforeSave is invoked by stateify. +func (it *IPTables) beforeSave() { + // Ensure the reaper exits cleanly. + it.reaperDone <- struct{}{} + // Prevent others from modifying the connection table. 
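GetTable and ReplaceTable above are the round trip a caller uses to read and install a table. A hypothetical usage sketch that installs a filter table dropping inbound TCP while accepting everything else, laid out the same way as DefaultTables; the s.IPTables() accessor on Stack is assumed (it is not shown in this diff), and the chain layout is illustrative, not a recommended ruleset:

package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func installDropInboundTCP(s *stack.Stack) *tcpip.Error {
	filter := stack.Table{
		Rules: []stack.Rule{
			// Input chain: drop TCP, otherwise fall through to the accept rule.
			{Filter: stack.IPHeaderFilter{Protocol: header.TCPProtocolNumber}, Target: stack.DropTarget{}},
			{Target: stack.AcceptTarget{}},
			// Forward and Output chains: accept everything.
			{Target: stack.AcceptTarget{}},
			{Target: stack.AcceptTarget{}},
			// Catch-all error rule, mirroring DefaultTables.
			{Target: stack.ErrorTarget{}},
		},
		BuiltinChains: [stack.NumHooks]int{
			stack.Prerouting:  stack.HookUnset,
			stack.Input:       0,
			stack.Forward:     2,
			stack.Output:      3,
			stack.Postrouting: stack.HookUnset,
		},
		Underflows: [stack.NumHooks]int{
			stack.Prerouting:  stack.HookUnset,
			stack.Input:       1,
			stack.Forward:     2,
			stack.Output:      3,
			stack.Postrouting: stack.HookUnset,
		},
	}

	it := s.IPTables() // accessor name assumed for this sketch
	// ReplaceTable marks iptables as modified, so Check stops taking the
	// fast path and starts traversing rules (and the conntrack reaper runs).
	return it.ReplaceTable(stack.FilterTable, filter)
}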
+ it.connections.mu.Lock() +} + +// afterLoad is invoked by stateify. +func (it *IPTables) afterLoad() { + it.startReaper(reaperDelay) +} + +// startReaper starts a goroutine that wakes up periodically to reap timed out +// connections. +func (it *IPTables) startReaper(interval time.Duration) { + go func() { // S/R-SAFE: reaperDone is signalled when iptables is saved. + bucket := 0 + for { + select { + case <-it.reaperDone: + return + case <-time.After(interval): + bucket, interval = it.connections.reapUnused(bucket, interval) + } + } + }() +} + // CheckPackets runs pkts through the rules for hook and returns a map of packets that // should not go forward. // -// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize. -// -// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a -// precondition. +// Preconditions: +// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. +// - pkt.NetworkHeader is not nil. // // NOTE: unlike the Check API the returned map contains packets that should be // dropped. @@ -249,9 +340,9 @@ func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r * return drop, natPkts } -// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize. -// TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a -// precondition. +// Preconditions: +// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. +// - pkt.NetworkHeader is not nil. func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict { // Start from ruleIdx and walk the list of rules until a rule gives us // a verdict. @@ -296,25 +387,14 @@ func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleId return chainDrop } -// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize. -// TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a -// precondition. +// Preconditions: +// - pkt is a IPv4 packet of at least length header.IPv4MinimumSize. +// - pkt.NetworkHeader is not nil. func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) { rule := table.Rules[ruleIdx] - // If pkt.NetworkHeader hasn't been set yet, it will be contained in - // pkt.Data. - if pkt.NetworkHeader == nil { - var ok bool - pkt.NetworkHeader, ok = pkt.Data.PullUp(header.IPv4MinimumSize) - if !ok { - // Precondition has been violated. - panic(fmt.Sprintf("iptables checks require IPv4 headers of at least %d bytes", header.IPv4MinimumSize)) - } - } - // Check whether the packet matches the IP header filter. - if !filterMatch(rule.Filter, header.IPv4(pkt.NetworkHeader), hook, nicName) { + if !rule.Filter.match(header.IPv4(pkt.NetworkHeader().View()), hook, nicName) { // Continue on to the next rule. return RuleJump, ruleIdx + 1 } @@ -322,7 +402,7 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx // Go through each rule matcher. If they all match, run // the rule target. 
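startReaper above is a stoppable periodic goroutine: it wakes on an interval, reaps one slice of the conntrack table, and lets the work function retune the interval; a send on the done channel stops it. A standalone sketch of that shape:

package main

import (
	"fmt"
	"time"
)

// startReaper mirrors the conntrack reaper's shape: wake every interval, do
// one bucket of work, and let the work pick the next bucket and interval.
// Sending on the returned channel stops the goroutine.
func startReaper(interval time.Duration, work func(bucket int, d time.Duration) (int, time.Duration)) chan<- struct{} {
	done := make(chan struct{}, 1)
	go func() {
		bucket := 0
		for {
			select {
			case <-done:
				return
			case <-time.After(interval):
				bucket, interval = work(bucket, interval)
			}
		}
	}()
	return done
}

func main() {
	done := startReaper(10*time.Millisecond, func(bucket int, d time.Duration) (int, time.Duration) {
		fmt.Println("reaping bucket", bucket)
		return bucket + 1, d
	})
	time.Sleep(35 * time.Millisecond)
	done <- struct{}{} // stop the reaper, as beforeSave does
}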
for _, matcher := range rule.Matchers { - matches, hotdrop := matcher.Match(hook, *pkt, "") + matches, hotdrop := matcher.Match(hook, pkt, "") if hotdrop { return RuleDrop, 0 } @@ -336,46 +416,8 @@ func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx return rule.Target.Action(pkt, &it.connections, hook, gso, r, address) } -func filterMatch(filter IPHeaderFilter, hdr header.IPv4, hook Hook, nicName string) bool { - // TODO(gvisor.dev/issue/170): Support other fields of the filter. - // Check the transport protocol. - if filter.Protocol != 0 && filter.Protocol != hdr.TransportProtocol() { - return false - } - - // Check the destination IP. - dest := hdr.DestinationAddress() - matches := true - for i := range filter.Dst { - if dest[i]&filter.DstMask[i] != filter.Dst[i] { - matches = false - break - } - } - if matches == filter.DstInvert { - return false - } - - // Check the output interface. - // TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING - // hooks after supported. - if hook == Output { - n := len(filter.OutputInterface) - if n == 0 { - return true - } - - // If the interface name ends with '+', any interface which begins - // with the name should be matched. - ifName := filter.OutputInterface - matches = true - if strings.HasSuffix(ifName, "+") { - matches = strings.HasPrefix(nicName, ifName[:n-1]) - } else { - matches = nicName == ifName - } - return filter.OutputInterfaceInvert != matches - } - - return true +// OriginalDst returns the original destination of redirected connections. It +// returns an error if the connection doesn't exist or isn't redirected. +func (it *IPTables) OriginalDst(epID TransportEndpointID) (tcpip.Address, uint16, *tcpip.Error) { + return it.connections.originalDst(epID) } diff --git a/pkg/tcpip/stack/iptables_state.go b/pkg/tcpip/stack/iptables_state.go new file mode 100644 index 000000000..529e02a07 --- /dev/null +++ b/pkg/tcpip/stack/iptables_state.go @@ -0,0 +1,40 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "time" +) + +// +stateify savable +type unixTime struct { + second int64 + nano int64 +} + +// saveLastUsed is invoked by stateify. +func (cn *conn) saveLastUsed() unixTime { + return unixTime{cn.lastUsed.Unix(), cn.lastUsed.UnixNano()} +} + +// loadLastUsed is invoked by stateify. +func (cn *conn) loadLastUsed(unix unixTime) { + cn.lastUsed = time.Unix(unix.second, unix.nano) +} + +// beforeSave is invoked by stateify. +func (ct *ConnTrack) beforeSave() { + ct.mu.Lock() +} diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go index 36cc6275d..5f1b2af64 100644 --- a/pkg/tcpip/stack/iptables_targets.go +++ b/pkg/tcpip/stack/iptables_targets.go @@ -24,7 +24,7 @@ import ( type AcceptTarget struct{} // Action implements Target.Action. 
-func (AcceptTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (AcceptTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { return RuleAccept, 0 } @@ -32,7 +32,7 @@ func (AcceptTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, t type DropTarget struct{} // Action implements Target.Action. -func (DropTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (DropTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { return RuleDrop, 0 } @@ -41,7 +41,7 @@ func (DropTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcp type ErrorTarget struct{} // Action implements Target.Action. -func (ErrorTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (ErrorTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { log.Debugf("ErrorTarget triggered.") return RuleDrop, 0 } @@ -52,7 +52,7 @@ type UserChainTarget struct { } // Action implements Target.Action. -func (UserChainTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (UserChainTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { panic("UserChainTarget should never be called.") } @@ -61,7 +61,7 @@ func (UserChainTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route type ReturnTarget struct{} // Action implements Target.Action. -func (ReturnTarget) Action(*PacketBuffer, *ConnTrackTable, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { +func (ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) { return RuleReturn, 0 } @@ -92,19 +92,14 @@ type RedirectTarget struct { // TODO(gvisor.dev/issue/170): Parse headers without copying. The current // implementation only works for PREROUTING and calls pkt.Clone(), neither // of which should be the case. -func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrackTable, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) { +func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) { // Packet is already manipulated. if pkt.NatDone { return RuleAccept, 0 } - // Set network header. - if hook == Prerouting { - parseHeaders(pkt) - } - // Drop the packet if network and transport header are not set. - if pkt.NetworkHeader == nil || pkt.TransportHeader == nil { + if pkt.NetworkHeader().View().IsEmpty() || pkt.TransportHeader().View().IsEmpty() { return RuleDrop, 0 } @@ -123,17 +118,16 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrackTable, hook Hook // TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if // we need to change dest address (for OUTPUT chain) or ports. - netHeader := header.IPv4(pkt.NetworkHeader) + netHeader := header.IPv4(pkt.NetworkHeader().View()) switch protocol := netHeader.TransportProtocol(); protocol { case header.UDPProtocolNumber: - udpHeader := header.UDP(pkt.TransportHeader) + udpHeader := header.UDP(pkt.TransportHeader().View()) udpHeader.SetDestinationPort(rt.MinPort) // Calculate UDP checksum and set it. 
if hook == Output { udpHeader.SetChecksum(0) - hdr := &pkt.Header - length := uint16(pkt.Data.Size()+hdr.UsedLength()) - uint16(netHeader.HeaderLength()) + length := uint16(pkt.Size()) - uint16(netHeader.HeaderLength()) // Only calculate the checksum if offloading isn't supported. if r.Capabilities()&CapabilityTXChecksumOffload == 0 { @@ -155,12 +149,11 @@ func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrackTable, hook Hook return RuleAccept, 0 } - // Set up conection for matching NAT rule. - // Only the first packet of the connection comes here. - // Other packets will be manipulated in connection tracking. - if conn, _ := ct.connTrackForPacket(pkt, hook, true); conn != nil { - ct.SetNatInfo(pkt, rt, hook) - ct.HandlePacket(pkt, hook, gso, r) + // Set up conection for matching NAT rule. Only the first + // packet of the connection comes here. Other packets will be + // manipulated in connection tracking. + if conn := ct.insertRedirectConn(pkt, hook, rt); conn != nil { + ct.handlePacket(pkt, hook, gso, r) } default: return RuleDrop, 0 diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go index fe06007ae..73274ada9 100644 --- a/pkg/tcpip/stack/iptables_types.go +++ b/pkg/tcpip/stack/iptables_types.go @@ -15,7 +15,11 @@ package stack import ( + "strings" + "sync" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/header" ) // A Hook specifies one of the hooks built into the network stack. @@ -74,63 +78,65 @@ const ( ) // IPTables holds all the tables for a netstack. +// +// +stateify savable type IPTables struct { - // Tables maps table names to tables. User tables have arbitrary names. - Tables map[string]Table + // mu protects tables, priorities, and modified. + mu sync.RWMutex - // Priorities maps each hook to a list of table names. The order of the + // tables maps tableIDs to tables. Holds builtin tables only, not user + // tables. mu must be locked for accessing. + tables [numTables]Table + + // priorities maps each hook to a list of table names. The order of the // list is the order in which each table should be visited for that - // hook. - Priorities map[Hook][]string + // hook. mu needs to be locked for accessing. + priorities [NumHooks][]tableID + + // modified is whether tables have been modified at least once. It is + // used to elide the iptables performance overhead for workloads that + // don't utilize iptables. + modified bool - connections ConnTrackTable + connections ConnTrack + + // reaperDone can be signalled to stop the reaper goroutine. + reaperDone chan struct{} } // A Table defines a set of chains and hooks into the network stack. It is -// really just a list of rules with some metadata for entrypoints and such. +// really just a list of rules. +// +// +stateify savable type Table struct { // Rules holds the rules that make up the table. Rules []Rule // BuiltinChains maps builtin chains to their entrypoint rule in Rules. - BuiltinChains map[Hook]int + BuiltinChains [NumHooks]int // Underflows maps builtin chains to their underflow rule in Rules // (i.e. the rule to execute if the chain returns without a verdict). - Underflows map[Hook]int - - // UserChains holds user-defined chains for the keyed by name. Users - // can give their chains arbitrary names. - UserChains map[string]int - - // Metadata holds information about the Table that is useful to users - // of IPTables, but not to the netstack IPTables code itself. 
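After RedirectTarget rewrites the destination port it must recompute the UDP checksum over the IPv4 pseudo-header, UDP header, and payload. A from-scratch illustration of that RFC 768 computation on plain byte slices (netstack has its own helpers in pkg/tcpip/header; this is only a model of what they compute):

package main

import (
	"encoding/binary"
	"fmt"
)

// checksum is the ones'-complement sum used by UDP, folded to 16 bits.
func checksum(b []byte, initial uint32) uint32 {
	sum := initial
	for len(b) >= 2 {
		sum += uint32(binary.BigEndian.Uint16(b))
		b = b[2:]
	}
	if len(b) == 1 {
		sum += uint32(b[0]) << 8 // odd trailing byte is padded with zero
	}
	for sum>>16 != 0 {
		sum = (sum >> 16) + (sum & 0xffff)
	}
	return sum
}

// udpChecksum covers the IPv4 pseudo-header plus the UDP header and payload.
// The checksum field in hdr must be zero when this is called.
func udpChecksum(srcIP, dstIP [4]byte, hdr, payload []byte) uint16 {
	pseudo := make([]byte, 12)
	copy(pseudo[0:4], srcIP[:])
	copy(pseudo[4:8], dstIP[:])
	pseudo[9] = 17 // protocol: UDP
	binary.BigEndian.PutUint16(pseudo[10:], uint16(len(hdr)+len(payload)))

	sum := checksum(pseudo, 0)
	sum = checksum(hdr, sum)
	sum = checksum(payload, sum)
	return ^uint16(sum)
}

func main() {
	src := [4]byte{10, 0, 0, 1}
	dst := [4]byte{10, 0, 0, 2}
	payload := []byte("hello")

	hdr := make([]byte, 8)
	binary.BigEndian.PutUint16(hdr[0:], 4321)                   // source port
	binary.BigEndian.PutUint16(hdr[2:], 8080)                   // destination port (post-redirect)
	binary.BigEndian.PutUint16(hdr[4:], uint16(8+len(payload))) // UDP length
	// Checksum field stays zero while computing, then is written back.
	binary.BigEndian.PutUint16(hdr[6:], udpChecksum(src, dst, hdr, payload))

	fmt.Printf("checksum: %#04x\n", binary.BigEndian.Uint16(hdr[6:]))
}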
- metadata interface{} + Underflows [NumHooks]int } // ValidHooks returns a bitmap of the builtin hooks for the given table. func (table *Table) ValidHooks() uint32 { hooks := uint32(0) - for hook := range table.BuiltinChains { - hooks |= 1 << hook + for hook, ruleIdx := range table.BuiltinChains { + if ruleIdx != HookUnset { + hooks |= 1 << hook + } } return hooks } -// Metadata returns the metadata object stored in table. -func (table *Table) Metadata() interface{} { - return table.metadata -} - -// SetMetadata sets the metadata object stored in table. -func (table *Table) SetMetadata(metadata interface{}) { - table.metadata = metadata -} - // A Rule is a packet processing rule. It consists of two pieces. First it // contains zero or more matchers, each of which is a specification of which // packets this rule applies to. If there are no matchers in the rule, it // applies to any packet. +// +// +stateify savable type Rule struct { // Filter holds basic IP filtering fields common to every rule. Filter IPHeaderFilter @@ -143,6 +149,8 @@ type Rule struct { } // IPHeaderFilter holds basic IP filtering data common to every rule. +// +// +stateify savable type IPHeaderFilter struct { // Protocol matches the transport protocol. Protocol tcpip.TransportProtocolNumber @@ -159,6 +167,16 @@ type IPHeaderFilter struct { // comparison. DstInvert bool + // Src matches the source IP address. + Src tcpip.Address + + // SrcMask masks bits of the source IP address when comparing with Src. + SrcMask tcpip.Address + + // SrcInvert inverts the meaning of the source IP check, i.e. when true the + // filter will match packets that fail the source comparison. + SrcInvert bool + // OutputInterface matches the name of the outgoing interface for the // packet. OutputInterface string @@ -173,6 +191,55 @@ type IPHeaderFilter struct { OutputInterfaceInvert bool } +// match returns whether hdr matches the filter. +func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool { + // TODO(gvisor.dev/issue/170): Support other fields of the filter. + // Check the transport protocol. + if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() { + return false + } + + // Check the source and destination IPs. + if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) { + return false + } + + // Check the output interface. + // TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING + // hooks after supported. + if hook == Output { + n := len(fl.OutputInterface) + if n == 0 { + return true + } + + // If the interface name ends with '+', any interface which begins + // with the name should be matched. + ifName := fl.OutputInterface + matches := true + if strings.HasSuffix(ifName, "+") { + matches = strings.HasPrefix(nicName, ifName[:n-1]) + } else { + matches = nicName == ifName + } + return fl.OutputInterfaceInvert != matches + } + + return true +} + +// filterAddress returns whether addr matches the filter. +func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool { + matches := true + for i := range filterAddr { + if addr[i]&mask[i] != filterAddr[i] { + matches = false + break + } + } + return matches != invert +} + // A Matcher is the interface for matching packets. type Matcher interface { // Name returns the name of the Matcher. @@ -183,7 +250,7 @@ type Matcher interface { // used for suspicious packets. // // Precondition: packet.NetworkHeader is set. 
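filterAddress above is a byte-wise masked comparison with an optional inversion; since tcpip.Address is a string of raw address bytes, the same check can be shown with plain strings. A standalone sketch:

package main

import "fmt"

// filterAddress mirrors the masked comparison above: addr matches when
// (addr & mask) == filterAddr, and invert flips the result.
func filterAddress(addr, mask, filterAddr string, invert bool) bool {
	matches := true
	for i := range filterAddr {
		if addr[i]&mask[i] != filterAddr[i] {
			matches = false
			break
		}
	}
	return matches != invert
}

func main() {
	// 10.0.1.2 checked against 10.0.0.0/16 (mask 255.255.0.0).
	addr := "\x0a\x00\x01\x02"
	mask := "\xff\xff\x00\x00"
	want := "\x0a\x00\x00\x00"

	fmt.Println(filterAddress(addr, mask, want, false)) // true: inside 10.0.0.0/16
	fmt.Println(filterAddress(addr, mask, want, true))  // false: inverted check
}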
- Match(hook Hook, packet PacketBuffer, interfaceName string) (matches bool, hotdrop bool) + Match(hook Hook, packet *PacketBuffer, interfaceName string) (matches bool, hotdrop bool) } // A Target is the interface for taking an action for a packet. @@ -191,5 +258,5 @@ type Target interface { // Action takes an action on the packet and returns a verdict on how // traversal should (or should not) continue. If the return value is // Jump, it also returns the index of the rule to jump to. - Action(packet *PacketBuffer, connections *ConnTrackTable, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) + Action(packet *PacketBuffer, connections *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) } diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go index 403557fd7..6f73a0ce4 100644 --- a/pkg/tcpip/stack/linkaddrcache.go +++ b/pkg/tcpip/stack/linkaddrcache.go @@ -244,7 +244,7 @@ func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes Link for i := 0; ; i++ { // Send link request, then wait for the timeout limit and check // whether the request succeeded. - linkRes.LinkAddressRequest(k.Addr, localAddr, linkEP) + linkRes.LinkAddressRequest(k.Addr, localAddr, "" /* linkAddr */, linkEP) select { case now := <-time.After(c.resolutionTimeout): diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go index 1baa498d0..b15b8d1cb 100644 --- a/pkg/tcpip/stack/linkaddrcache_test.go +++ b/pkg/tcpip/stack/linkaddrcache_test.go @@ -48,7 +48,7 @@ type testLinkAddressResolver struct { onLinkAddressRequest func() } -func (r *testLinkAddressResolver) LinkAddressRequest(addr, _ tcpip.Address, _ LinkEndpoint) *tcpip.Error { +func (r *testLinkAddressResolver) LinkAddressRequest(addr, _ tcpip.Address, _ tcpip.LinkAddress, _ LinkEndpoint) *tcpip.Error { time.AfterFunc(r.delay, func() { r.fakeRequest(addr) }) if f := r.onLinkAddressRequest; f != nil { f() diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go index 526c7d6ff..93567806b 100644 --- a/pkg/tcpip/stack/ndp.go +++ b/pkg/tcpip/stack/ndp.go @@ -33,12 +33,6 @@ const ( // Default = 1 (from RFC 4862 section 5.1) defaultDupAddrDetectTransmits = 1 - // defaultRetransmitTimer is the default amount of time to wait between - // sending NDP Neighbor solicitation messages. - // - // Default = 1s (from RFC 4861 section 10). - defaultRetransmitTimer = time.Second - // defaultMaxRtrSolicitations is the default number of Router // Solicitation messages to send when a NIC becomes enabled. // @@ -79,16 +73,6 @@ const ( // Default = true. defaultAutoGenGlobalAddresses = true - // minimumRetransmitTimer is the minimum amount of time to wait between - // sending NDP Neighbor solicitation messages. Note, RFC 4861 does - // not impose a minimum Retransmit Timer, but we do here to make sure - // the messages are not sent all at once. We also come to this value - // because in the RetransmitTimer field of a Router Advertisement, a - // value of 0 means unspecified, so the smallest valid value is 1. - // Note, the unit of the RetransmitTimer field in the Router - // Advertisement is milliseconds. - minimumRetransmitTimer = time.Millisecond - // minimumRtrSolicitationInterval is the minimum amount of time to wait // between sending Router Solicitation messages. 
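The Matcher and Target interfaces above are the extension points rules use. A hypothetical sketch of a user-supplied matcher and target against those signatures; the type names are illustrative and not part of netstack, and the packet is only passed through:

package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// nicMatcher matches packets leaving through a specific interface.
type nicMatcher struct {
	nicName string
}

// Name implements stack.Matcher.Name.
func (m *nicMatcher) Name() string { return "nicMatcher" }

// Match implements stack.Matcher.Match. It never hotdrops.
func (m *nicMatcher) Match(hook stack.Hook, pkt *stack.PacketBuffer, interfaceName string) (bool, bool) {
	return interfaceName == m.nicName, false
}

// countingAccept accepts every packet and counts how many it has seen.
type countingAccept struct {
	packets uint64
}

// Action implements stack.Target.Action.
func (t *countingAccept) Action(*stack.PacketBuffer, *stack.ConnTrack, stack.Hook, *stack.GSO, *stack.Route, tcpip.Address) (stack.RuleVerdict, int) {
	t.packets++
	return stack.RuleAccept, 0
}

var (
	_ stack.Matcher = (*nicMatcher)(nil)
	_ stack.Target  = (*countingAccept)(nil)
)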
This limit is imposed // to make sure that Router Solicitation messages are not sent all at @@ -467,8 +451,17 @@ type ndpState struct { // The default routers discovered through Router Advertisements. defaultRouters map[tcpip.Address]defaultRouterState - // The timer used to send the next router solicitation message. - rtrSolicitTimer *time.Timer + rtrSolicit struct { + // The timer used to send the next router solicitation message. + timer tcpip.Timer + + // Used to let the Router Solicitation timer know that it has been stopped. + // + // Must only be read from or written to while protected by the lock of + // the NIC this ndpState is associated with. MUST be set when the timer is + // set. + done *bool + } // The on-link prefixes discovered through Router Advertisements' Prefix // Information option. @@ -494,7 +487,7 @@ type ndpState struct { // to the DAD goroutine that DAD should stop. type dadState struct { // The DAD timer to send the next NS message, or resolve the address. - timer *time.Timer + timer tcpip.Timer // Used to let the DAD timer know that it has been stopped. // @@ -506,38 +499,38 @@ type dadState struct { // defaultRouterState holds data associated with a default router discovered by // a Router Advertisement (RA). type defaultRouterState struct { - // Timer to invalidate the default router. + // Job to invalidate the default router. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job } // onLinkPrefixState holds data associated with an on-link prefix discovered by // a Router Advertisement's Prefix Information option (PI) when the NDP // configurations was configured to do so. type onLinkPrefixState struct { - // Timer to invalidate the on-link prefix. + // Job to invalidate the on-link prefix. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job } // tempSLAACAddrState holds state associated with a temporary SLAAC address. type tempSLAACAddrState struct { - // Timer to deprecate the temporary SLAAC address. + // Job to deprecate the temporary SLAAC address. // // Must not be nil. - deprecationTimer *tcpip.CancellableTimer + deprecationJob *tcpip.Job - // Timer to invalidate the temporary SLAAC address. + // Job to invalidate the temporary SLAAC address. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job - // Timer to regenerate the temporary SLAAC address. + // Job to regenerate the temporary SLAAC address. // // Must not be nil. - regenTimer *tcpip.CancellableTimer + regenJob *tcpip.Job createdAt time.Time @@ -552,15 +545,15 @@ type tempSLAACAddrState struct { // slaacPrefixState holds state associated with a SLAAC prefix. type slaacPrefixState struct { - // Timer to deprecate the prefix. + // Job to deprecate the prefix. // // Must not be nil. - deprecationTimer *tcpip.CancellableTimer + deprecationJob *tcpip.Job - // Timer to invalidate the prefix. + // Job to invalidate the prefix. // // Must not be nil. - invalidationTimer *tcpip.CancellableTimer + invalidationJob *tcpip.Job // Nonzero only when the address is not valid forever. validUntil time.Time @@ -642,19 +635,20 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref } var done bool - var timer *time.Timer + var timer tcpip.Timer // We initially start a timer to fire immediately because some of the DAD work // cannot be done while holding the NIC's lock. 
This is effectively the same // as starting a goroutine but we use a timer that fires immediately so we can // reset it for the next DAD iteration. - timer = time.AfterFunc(0, func() { - ndp.nic.mu.RLock() + timer = ndp.nic.stack.Clock().AfterFunc(0, func() { + ndp.nic.mu.Lock() + defer ndp.nic.mu.Unlock() + if done { // If we reach this point, it means that the DAD timer fired after // another goroutine already obtained the NIC lock and stopped DAD // before this function obtained the NIC lock. Simply return here and do // nothing further. - ndp.nic.mu.RUnlock() return } @@ -665,15 +659,23 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref } dadDone := remaining == 0 - ndp.nic.mu.RUnlock() var err *tcpip.Error if !dadDone { - err = ndp.sendDADPacket(addr) + // Use the unspecified address as the source address when performing DAD. + ref := ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint) + + // Do not hold the lock when sending packets which may be a long running + // task or may block link address resolution. We know this is safe + // because immediately after obtaining the lock again, we check if DAD + // has been stopped before doing any work with the NIC. Note, DAD would be + // stopped if the NIC was disabled or removed, or if the address was + // removed. + ndp.nic.mu.Unlock() + err = ndp.sendDADPacket(addr, ref) + ndp.nic.mu.Lock() } - ndp.nic.mu.Lock() - defer ndp.nic.mu.Unlock() if done { // If we reach this point, it means that DAD was stopped after we released // the NIC's read lock and before we obtained the write lock. @@ -721,28 +723,39 @@ func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *ref // addr. // // addr must be a tentative IPv6 address on ndp's NIC. -func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error { +// +// The NIC ndp belongs to MUST NOT be locked. +func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error { snmc := header.SolicitedNodeAddr(addr) - // Use the unspecified address as the source address when performing DAD. - ref := ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing) - r := makeRoute(header.IPv6ProtocolNumber, header.IPv6Any, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false) + r := makeRoute(header.IPv6ProtocolNumber, ref.ep.ID().LocalAddress, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false) defer r.Release() // Route should resolve immediately since snmc is a multicast address so a // remote link address can be calculated without a resolution process. if c, err := r.Resolve(nil); err != nil { + // Do not consider the NIC being unknown or disabled as a fatal error. + // Since this method is required to be called when the NIC is not locked, + // the NIC could have been disabled or removed by another goroutine. 
+ if err == tcpip.ErrUnknownNICID || err != tcpip.ErrInvalidEndpointState { + return err + } + panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err)) } else if c != nil { panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID())) } - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize)) - pkt.SetType(header.ICMPv6NeighborSolicit) - ns := header.NDPNeighborSolicit(pkt.NDPPayload()) + icmpData := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize)) + icmpData.SetType(header.ICMPv6NeighborSolicit) + ns := header.NDPNeighborSolicit(icmpData.NDPPayload()) ns.SetTargetAddress(addr) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + + pkt := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: int(r.MaxHeaderLength()), + Data: buffer.View(icmpData).ToVectorisedView(), + }) sent := r.Stats().ICMP.V6PacketsSent if err := r.WritePacket(nil, @@ -750,7 +763,7 @@ func (ndp *ndpState) sendDADPacket(addr tcpip.Address) *tcpip.Error { Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: DefaultTOS, - }, PacketBuffer{Header: hdr}, + }, pkt, ); err != nil { sent.Dropped.Increment() return err @@ -846,9 +859,9 @@ func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) { case ok && rl != 0: // This is an already discovered default router. Update - // the invalidation timer. - rtr.invalidationTimer.StopLocked() - rtr.invalidationTimer.Reset(rl) + // the invalidation job. + rtr.invalidationJob.Cancel() + rtr.invalidationJob.Schedule(rl) ndp.defaultRouters[ip] = rtr case ok && rl == 0: @@ -925,7 +938,7 @@ func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) { return } - rtr.invalidationTimer.StopLocked() + rtr.invalidationJob.Cancel() delete(ndp.defaultRouters, ip) // Let the integrator know a discovered default router is invalidated. @@ -954,12 +967,12 @@ func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) { } state := defaultRouterState{ - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { ndp.invalidateDefaultRouter(ip) }), } - state.invalidationTimer.Reset(rl) + state.invalidationJob.Schedule(rl) ndp.defaultRouters[ip] = state } @@ -984,13 +997,13 @@ func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) } state := onLinkPrefixState{ - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { ndp.invalidateOnLinkPrefix(prefix) }), } if l < header.NDPInfiniteLifetime { - state.invalidationTimer.Reset(l) + state.invalidationJob.Schedule(l) } ndp.onLinkPrefixes[prefix] = state @@ -1008,7 +1021,7 @@ func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) { return } - s.invalidationTimer.StopLocked() + s.invalidationJob.Cancel() delete(ndp.onLinkPrefixes, prefix) // Let the integrator know a discovered on-link prefix is invalidated. 
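The new packet-construction pattern in sendDADPacket builds the ICMPv6 payload as a plain view and wraps it in a PacketBuffer that reserves room for the headers the lower layers will push, instead of prepending into a separate Header field. A condensed sketch of that construction, assuming r is an already-resolved *stack.Route:

package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func sendNeighborSolicit(r *stack.Route, target tcpip.Address) *tcpip.Error {
	// Serialize the ICMPv6 Neighbor Solicitation into a plain view first.
	icmpData := header.ICMPv6(buffer.NewView(header.ICMPv6NeighborSolicitMinimumSize))
	icmpData.SetType(header.ICMPv6NeighborSolicit)
	ns := header.NDPNeighborSolicit(icmpData.NDPPayload())
	ns.SetTargetAddress(target)
	icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))

	// Reserve header space up front; WritePacket pushes the IPv6 and link
	// headers into that reserved space.
	pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
		ReserveHeaderBytes: int(r.MaxHeaderLength()),
		Data:               buffer.View(icmpData).ToVectorisedView(),
	})

	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
		Protocol: header.ICMPv6ProtocolNumber,
		TTL:      header.NDPHopLimit,
		TOS:      stack.DefaultTOS,
	}, pkt)
}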
@@ -1057,14 +1070,14 @@ func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformatio // This is an already discovered on-link prefix with a // new non-zero valid lifetime. // - // Update the invalidation timer. + // Update the invalidation job. - prefixState.invalidationTimer.StopLocked() + prefixState.invalidationJob.Cancel() if vl < header.NDPInfiniteLifetime { - // Prefix is valid for a finite lifetime, reset the timer to expire after + // Prefix is valid for a finite lifetime, schedule the job to execute after // the new valid lifetime. - prefixState.invalidationTimer.Reset(vl) + prefixState.invalidationJob.Schedule(vl) } ndp.onLinkPrefixes[prefix] = prefixState @@ -1129,7 +1142,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { } state := slaacPrefixState{ - deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix)) @@ -1137,7 +1150,7 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { ndp.deprecateSLAACAddress(state.stableAddr.ref) }), - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { state, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix)) @@ -1159,19 +1172,19 @@ func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) { if !ndp.generateSLAACAddr(prefix, &state) { // We were unable to generate an address for the prefix, we do not nothing - // further as there is no reason to maintain state or timers for a prefix we + // further as there is no reason to maintain state or jobs for a prefix we // do not have an address for. return } - // Setup the initial timers to deprecate and invalidate prefix. + // Setup the initial jobs to deprecate and invalidate prefix. 
if pl < header.NDPInfiniteLifetime && pl != 0 { - state.deprecationTimer.Reset(pl) + state.deprecationJob.Schedule(pl) } if vl < header.NDPInfiniteLifetime { - state.invalidationTimer.Reset(vl) + state.invalidationJob.Schedule(vl) state.validUntil = now.Add(vl) } @@ -1403,7 +1416,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla } state := tempSLAACAddrState{ - deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + deprecationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr)) @@ -1416,7 +1429,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ndp.deprecateSLAACAddress(tempAddrState.ref) }), - invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + invalidationJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr)) @@ -1429,7 +1442,7 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState) }), - regenTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() { + regenJob: ndp.nic.stack.newJob(&ndp.nic.mu, func() { prefixState, ok := ndp.slaacPrefixes[prefix] if !ok { panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr)) @@ -1456,9 +1469,9 @@ func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *sla ref: ref, } - state.deprecationTimer.Reset(pl) - state.invalidationTimer.Reset(vl) - state.regenTimer.Reset(pl - ndp.configs.RegenAdvanceDuration) + state.deprecationJob.Schedule(pl) + state.invalidationJob.Schedule(vl) + state.regenJob.Schedule(pl - ndp.configs.RegenAdvanceDuration) prefixState.generationAttempts++ prefixState.tempAddrs[generatedAddr.Address] = state @@ -1493,16 +1506,16 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat prefixState.stableAddr.ref.deprecated = false } - // If prefix was preferred for some finite lifetime before, stop the - // deprecation timer so it can be reset. - prefixState.deprecationTimer.StopLocked() + // If prefix was preferred for some finite lifetime before, cancel the + // deprecation job so it can be reset. + prefixState.deprecationJob.Cancel() now := time.Now() - // Reset the deprecation timer if prefix has a finite preferred lifetime. + // Schedule the deprecation job if prefix has a finite preferred lifetime. if pl < header.NDPInfiniteLifetime { if !deprecated { - prefixState.deprecationTimer.Reset(pl) + prefixState.deprecationJob.Schedule(pl) } prefixState.preferredUntil = now.Add(pl) } else { @@ -1521,9 +1534,9 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // 3) Otherwise, reset the valid lifetime of the prefix to 2 hours. if vl >= header.NDPInfiniteLifetime { - // Handle the infinite valid lifetime separately as we do not keep a timer - // in this case. - prefixState.invalidationTimer.StopLocked() + // Handle the infinite valid lifetime separately as we do not schedule a + // job in this case. 
+ prefixState.invalidationJob.Cancel() prefixState.validUntil = time.Time{} } else { var effectiveVl time.Duration @@ -1544,8 +1557,8 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } if effectiveVl != 0 { - prefixState.invalidationTimer.StopLocked() - prefixState.invalidationTimer.Reset(effectiveVl) + prefixState.invalidationJob.Cancel() + prefixState.invalidationJob.Schedule(effectiveVl) prefixState.validUntil = now.Add(effectiveVl) } } @@ -1557,7 +1570,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // Note, we do not need to update the entries in the temporary address map - // after updating the timers because the timers are held as pointers. + // after updating the jobs because the jobs are held as pointers. var regenForAddr tcpip.Address allAddressesRegenerated := true for tempAddr, tempAddrState := range prefixState.tempAddrs { @@ -1571,14 +1584,14 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // If the address is no longer valid, invalidate it immediately. Otherwise, - // reset the invalidation timer. + // reset the invalidation job. newValidLifetime := validUntil.Sub(now) if newValidLifetime <= 0 { ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, tempAddr, tempAddrState) continue } - tempAddrState.invalidationTimer.StopLocked() - tempAddrState.invalidationTimer.Reset(newValidLifetime) + tempAddrState.invalidationJob.Cancel() + tempAddrState.invalidationJob.Schedule(newValidLifetime) // As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary // address is the lower of the preferred lifetime of the stable address or @@ -1591,17 +1604,17 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat } // If the address is no longer preferred, deprecate it immediately. - // Otherwise, reset the deprecation timer. + // Otherwise, schedule the deprecation job again. newPreferredLifetime := preferredUntil.Sub(now) - tempAddrState.deprecationTimer.StopLocked() + tempAddrState.deprecationJob.Cancel() if newPreferredLifetime <= 0 { ndp.deprecateSLAACAddress(tempAddrState.ref) } else { tempAddrState.ref.deprecated = false - tempAddrState.deprecationTimer.Reset(newPreferredLifetime) + tempAddrState.deprecationJob.Schedule(newPreferredLifetime) } - tempAddrState.regenTimer.StopLocked() + tempAddrState.regenJob.Cancel() if tempAddrState.regenerated { } else { allAddressesRegenerated = false @@ -1612,7 +1625,7 @@ func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixStat // immediately after we finish iterating over the temporary addresses. regenForAddr = tempAddr } else { - tempAddrState.regenTimer.Reset(newPreferredLifetime - ndp.configs.RegenAdvanceDuration) + tempAddrState.regenJob.Schedule(newPreferredLifetime - ndp.configs.RegenAdvanceDuration) } } } @@ -1692,7 +1705,7 @@ func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPr ndp.cleanupSLAACPrefixResources(prefix, state) } -// cleanupSLAACPrefixResources cleansup a SLAAC prefix's timers and entry. +// cleanupSLAACPrefixResources cleans up a SLAAC prefix's jobs and entry. // // Panics if the SLAAC prefix is not known. 
// @@ -1704,8 +1717,8 @@ func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaa } state.stableAddr.ref = nil - state.deprecationTimer.StopLocked() - state.invalidationTimer.StopLocked() + state.deprecationJob.Cancel() + state.invalidationJob.Cancel() delete(ndp.slaacPrefixes, prefix) } @@ -1750,13 +1763,13 @@ func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWi } // cleanupTempSLAACAddrResourcesAndNotify cleans up a temporary SLAAC address's -// timers and entry. +// jobs and entry. // // The NIC that ndp belongs to MUST be locked. func (ndp *ndpState) cleanupTempSLAACAddrResources(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) { - tempAddrState.deprecationTimer.StopLocked() - tempAddrState.invalidationTimer.StopLocked() - tempAddrState.regenTimer.StopLocked() + tempAddrState.deprecationJob.Cancel() + tempAddrState.invalidationJob.Cancel() + tempAddrState.regenJob.Cancel() delete(tempAddrs, tempAddr) } @@ -1816,7 +1829,7 @@ func (ndp *ndpState) cleanupState(hostOnly bool) { // // The NIC ndp belongs to MUST be locked. func (ndp *ndpState) startSolicitingRouters() { - if ndp.rtrSolicitTimer != nil { + if ndp.rtrSolicit.timer != nil { // We are already soliciting routers. return } @@ -1833,14 +1846,27 @@ func (ndp *ndpState) startSolicitingRouters() { delay = time.Duration(rand.Int63n(int64(ndp.configs.MaxRtrSolicitationDelay))) } - ndp.rtrSolicitTimer = time.AfterFunc(delay, func() { + var done bool + ndp.rtrSolicit.done = &done + ndp.rtrSolicit.timer = ndp.nic.stack.Clock().AfterFunc(delay, func() { + ndp.nic.mu.Lock() + if done { + // If we reach this point, it means that the RS timer fired after another + // goroutine already obtained the NIC lock and stopped solicitations. + // Simply return here and do nothing further. + ndp.nic.mu.Unlock() + return + } + // As per RFC 4861 section 4.1, the source of the RS is an address assigned // to the sending interface, or the unspecified address if no address is // assigned to the sending interface. - ref := ndp.nic.primaryIPv6Endpoint(header.IPv6AllRoutersMulticastAddress) + ref := ndp.nic.primaryIPv6EndpointRLocked(header.IPv6AllRoutersMulticastAddress) if ref == nil { - ref = ndp.nic.getRefOrCreateTemp(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint, forceSpoofing) + ref = ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint) } + ndp.nic.mu.Unlock() + localAddr := ref.ep.ID().LocalAddress r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false) defer r.Release() @@ -1849,6 +1875,13 @@ func (ndp *ndpState) startSolicitingRouters() { // header.IPv6AllRoutersMulticastAddress is a multicast address so a // remote link address can be calculated without a resolution process. if c, err := r.Resolve(nil); err != nil { + // Do not consider the NIC being unknown or disabled as a fatal error. + // Since this method is required to be called when the NIC is not locked, + // the NIC could have been disabled or removed by another goroutine. 
+ if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState { + return + } + panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err)) } else if c != nil { panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", header.IPv6Any, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID())) @@ -1868,12 +1901,16 @@ func (ndp *ndpState) startSolicitingRouters() { } } payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + int(optsSerializer.Length()) - hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + payloadSize) - pkt := header.ICMPv6(hdr.Prepend(payloadSize)) - pkt.SetType(header.ICMPv6RouterSolicit) - rs := header.NDPRouterSolicit(pkt.NDPPayload()) + icmpData := header.ICMPv6(buffer.NewView(payloadSize)) + icmpData.SetType(header.ICMPv6RouterSolicit) + rs := header.NDPRouterSolicit(icmpData.NDPPayload()) rs.Options().Serialize(optsSerializer) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + icmpData.SetChecksum(header.ICMPv6Checksum(icmpData, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{})) + + pkt := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: int(r.MaxHeaderLength()), + Data: buffer.View(icmpData).ToVectorisedView(), + }) sent := r.Stats().ICMP.V6PacketsSent if err := r.WritePacket(nil, @@ -1881,7 +1918,7 @@ func (ndp *ndpState) startSolicitingRouters() { Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: DefaultTOS, - }, PacketBuffer{Header: hdr}, + }, pkt, ); err != nil { sent.Dropped.Increment() log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err) @@ -1893,17 +1930,18 @@ func (ndp *ndpState) startSolicitingRouters() { } ndp.nic.mu.Lock() - defer ndp.nic.mu.Unlock() - if remaining == 0 { - ndp.rtrSolicitTimer = nil - } else if ndp.rtrSolicitTimer != nil { + if done || remaining == 0 { + ndp.rtrSolicit.timer = nil + ndp.rtrSolicit.done = nil + } else if ndp.rtrSolicit.timer != nil { // Note, we need to explicitly check to make sure that // the timer field is not nil because if it was nil but // we still reached this point, then we know the NIC // was requested to stop soliciting routers so we don't // need to send the next Router Solicitation message. - ndp.rtrSolicitTimer.Reset(ndp.configs.RtrSolicitationInterval) + ndp.rtrSolicit.timer.Reset(ndp.configs.RtrSolicitationInterval) } + ndp.nic.mu.Unlock() }) } @@ -1913,13 +1951,15 @@ func (ndp *ndpState) startSolicitingRouters() { // // The NIC ndp belongs to MUST be locked. func (ndp *ndpState) stopSolicitingRouters() { - if ndp.rtrSolicitTimer == nil { + if ndp.rtrSolicit.timer == nil { // Nothing to do. 
return } - ndp.rtrSolicitTimer.Stop() - ndp.rtrSolicitTimer = nil + *ndp.rtrSolicit.done = true + ndp.rtrSolicit.timer.Stop() + ndp.rtrSolicit.timer = nil + ndp.rtrSolicit.done = nil } // initializeTempAddrState initializes state related to temporary SLAAC diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go index b3d174cdd..21bf53010 100644 --- a/pkg/tcpip/stack/ndp_test.go +++ b/pkg/tcpip/stack/ndp_test.go @@ -36,15 +36,24 @@ import ( ) const ( - addr1 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") - addr2 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") - addr3 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03") - linkAddr1 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") - linkAddr2 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07") - linkAddr3 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08") - linkAddr4 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09") - defaultTimeout = 100 * time.Millisecond - defaultAsyncEventTimeout = time.Second + addr1 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") + addr2 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") + addr3 = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03") + linkAddr1 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") + linkAddr2 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07") + linkAddr3 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08") + linkAddr4 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09") + + // Extra time to use when waiting for an async event to occur. + defaultAsyncPositiveEventTimeout = 10 * time.Second + + // Extra time to use when waiting for an async event to not occur. + // + // Since a negative check is used to make sure an event did not happen, it is + // okay to use a smaller timeout compared to the positive case since execution + // stall in regards to the monotonic clock will not affect the expected + // outcome. + defaultAsyncNegativeEventTimeout = time.Second ) var ( @@ -421,45 +430,90 @@ func TestDADResolve(t *testing.T) { t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) } + // We add a default route so the call to FindRoute below will succeed + // once we have an assigned address. + s.SetRouteTable([]tcpip.Route{{ + Destination: header.IPv6EmptySubnet, + Gateway: addr3, + NIC: nicID, + }}) + if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil { t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err) } // Address should not be considered bound to the NIC yet (DAD ongoing). - addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) - } - if want := (tcpip.AddressWithPrefix{}); addr != want { + if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + } else if want := (tcpip.AddressWithPrefix{}); addr != want { t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) } // Make sure the address does not resolve before the resolution time has // passed. 
- time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - defaultAsyncEventTimeout) - addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - defaultAsyncNegativeEventTimeout) + if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + } else if want := (tcpip.AddressWithPrefix{}); addr != want { + t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + } + // Should not get a route even if we specify the local address as the + // tentative address. + { + r, err := s.FindRoute(nicID, "", addr2, header.IPv6ProtocolNumber, false) + if err != tcpip.ErrNoRoute { + t.Errorf("got FindRoute(%d, '', %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute) + } + r.Release() } - if want := (tcpip.AddressWithPrefix{}); addr != want { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want) + { + r, err := s.FindRoute(nicID, addr1, addr2, header.IPv6ProtocolNumber, false) + if err != tcpip.ErrNoRoute { + t.Errorf("got FindRoute(%d, %s, %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr1, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute) + } + r.Release() + } + + if t.Failed() { + t.FailNow() } // Wait for DAD to resolve. select { - case <-time.After(2 * defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD resolution") case e := <-ndpDisp.dadC: if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } } - addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber) - if err != nil { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil { + t.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err) + } else if addr.Address != addr1 { + t.Errorf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1) } - if addr.Address != addr1 { - t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1) + // Should get a route using the address now that it is resolved. 
+ { + r, err := s.FindRoute(nicID, "", addr2, header.IPv6ProtocolNumber, false) + if err != nil { + t.Errorf("got FindRoute(%d, '', %s, %d, false): %s", nicID, addr2, header.IPv6ProtocolNumber, err) + } else if r.LocalAddress != addr1 { + t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, addr1) + } + r.Release() + } + { + r, err := s.FindRoute(nicID, addr1, addr2, header.IPv6ProtocolNumber, false) + if err != nil { + t.Errorf("got FindRoute(%d, %s, %s, %d, false): %s", nicID, addr1, addr2, header.IPv6ProtocolNumber, err) + } else if r.LocalAddress != addr1 { + t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, addr1) + } + r.Release() + } + + if t.Failed() { + t.FailNow() } // Should not have sent any more NS messages. @@ -487,7 +541,7 @@ func TestDADResolve(t *testing.T) { // As per RFC 4861 section 4.3, a possible option is the Source Link // Layer option, but this option MUST NOT be included when the source // address of the packet is the unspecified address. - checker.IPv6(t, p.Pkt.Header.View(), + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(header.IPv6Any), checker.DstAddr(snmc), checker.TTL(header.NDPHopLimit), @@ -496,8 +550,8 @@ func TestDADResolve(t *testing.T) { checker.NDPNSOptions(nil), )) - if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want { - t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want) + if l, want := p.Pkt.AvailableHeaderBytes(), int(test.linkHeaderLen); l != want { + t.Errorf("got p.Pkt.AvailableHeaderBytes() = %d; want = %d", l, want) } } }) @@ -613,9 +667,10 @@ func TestDADFail(t *testing.T) { // Receive a packet to simulate multiple nodes owning or // attempting to own the same address. hdr := test.makeBuf(addr1) - e.InjectInbound(header.IPv6ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: hdr.View().ToVectorisedView(), }) + e.InjectInbound(header.IPv6ProtocolNumber, pkt) stat := test.getStat(s.Stats().ICMP.V6PacketsReceived) if got := stat.Value(); got != 1 { @@ -935,7 +990,7 @@ func TestSetNDPConfigurations(t *testing.T) { // raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options // and DHCPv6 configurations specified. -func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) stack.PacketBuffer { +func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) *stack.PacketBuffer { icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length()) hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize) pkt := header.ICMPv6(hdr.Prepend(icmpSize)) @@ -970,14 +1025,16 @@ func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherCo DstAddr: header.IPv6AllNodesMulticastAddress, }) - return stack.PacketBuffer{Data: hdr.View().ToVectorisedView()} + return stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + }) } // raBufWithOpts returns a valid NDP Router Advertisement with options. // // Note, raBufWithOpts does not populate any of the RA fields other than the // Router Lifetime. 
-func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) stack.PacketBuffer { +func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) *stack.PacketBuffer { return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer) } @@ -986,7 +1043,7 @@ func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializ // // Note, raBufWithDHCPv6 does not populate any of the RA fields other than the // DHCPv6 related ones. -func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) stack.PacketBuffer { +func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bool) *stack.PacketBuffer { return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfiguratiosns, header.NDPOptionsSerializer{}) } @@ -994,7 +1051,7 @@ func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfiguratiosns bo // // Note, raBuf does not populate any of the RA fields other than the // Router Lifetime. -func raBuf(ip tcpip.Address, rl uint16) stack.PacketBuffer { +func raBuf(ip tcpip.Address, rl uint16) *stack.PacketBuffer { return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{}) } @@ -1003,7 +1060,7 @@ func raBuf(ip tcpip.Address, rl uint16) stack.PacketBuffer { // // Note, raBufWithPI does not populate any of the RA fields other than the // Router Lifetime. -func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) stack.PacketBuffer { +func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) *stack.PacketBuffer { flags := uint8(0) if onLink { // The OnLink flag is the 7th bit in the flags byte. @@ -1124,7 +1181,7 @@ func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) { select { case <-ndpDisp.routerC: t.Fatal("should not have received any router events") - case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): + case <-time.After(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout): } } @@ -1200,14 +1257,14 @@ func TestRouterDiscovery(t *testing.T) { default: } - // Wait for lladdr2's router invalidation timer to fire. The lifetime + // Wait for lladdr2's router invalidation job to execute. The lifetime // of the router should have been updated to the most recent (smaller) // lifetime. // // Wait for the normal lifetime plus an extra bit for the // router to get invalidated. If we don't get an invalidation // event after this time, then something is wrong. - expectAsyncRouterInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second+defaultAsyncEventTimeout) + expectAsyncRouterInvalidationEvent(llAddr2, l2LifetimeSeconds*time.Second+defaultAsyncPositiveEventTimeout) // Rx an RA from lladdr2 with huge lifetime. e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000)) @@ -1217,14 +1274,14 @@ func TestRouterDiscovery(t *testing.T) { e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0)) expectRouterEvent(llAddr2, false) - // Wait for lladdr3's router invalidation timer to fire. The lifetime + // Wait for lladdr3's router invalidation job to execute. The lifetime // of the router should have been updated to the most recent (smaller) // lifetime. // // Wait for the normal lifetime plus an extra bit for the // router to get invalidated. If we don't get an invalidation // event after this time, then something is wrong. 
- expectAsyncRouterInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second+defaultAsyncEventTimeout) + expectAsyncRouterInvalidationEvent(llAddr3, l3LifetimeSeconds*time.Second+defaultAsyncPositiveEventTimeout) } // TestRouterDiscoveryMaxRouters tests that only @@ -1373,7 +1430,7 @@ func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) { select { case <-ndpDisp.prefixC: t.Fatal("should not have received any prefix events") - case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): + case <-time.After(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout): } } @@ -1448,14 +1505,14 @@ func TestPrefixDiscovery(t *testing.T) { default: } - // Wait for prefix2's most recent invalidation timer plus some buffer to + // Wait for prefix2's most recent invalidation job plus some buffer to // expire. select { case e := <-ndpDisp.prefixC: if diff := checkPrefixEvent(e, subnet2, false); diff != "" { t.Errorf("prefix event mismatch (-want +got):\n%s", diff) } - case <-time.After(time.Duration(lifetime)*time.Second + defaultAsyncEventTimeout): + case <-time.After(time.Duration(lifetime)*time.Second + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for prefix discovery event") } @@ -1520,7 +1577,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") - case <-time.After(testInfiniteLifetime + defaultTimeout): + case <-time.After(testInfiniteLifetime + defaultAsyncNegativeEventTimeout): } // Receive an RA with finite lifetime. @@ -1545,7 +1602,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") - case <-time.After(testInfiniteLifetime + defaultTimeout): + case <-time.After(testInfiniteLifetime + defaultAsyncNegativeEventTimeout): } // Receive an RA with a prefix with a lifetime value greater than the @@ -1554,7 +1611,7 @@ func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) { select { case <-ndpDisp.prefixC: t.Fatal("unexpectedly invalidated a prefix with infinite lifetime") - case <-time.After((testInfiniteLifetimeSeconds+1)*time.Second + defaultTimeout): + case <-time.After((testInfiniteLifetimeSeconds+1)*time.Second + defaultAsyncNegativeEventTimeout): } // Receive an RA with 0 lifetime. @@ -1635,13 +1692,7 @@ func containsV6Addr(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) AddressWithPrefix: item, } - for _, i := range list { - if i == protocolAddress { - return true - } - } - - return false + return containsAddr(list, protocolAddress) } // TestNoAutoGenAddr tests that SLAAC is not performed when configured not to. 
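The ndp.go hunks above swap the per-address tcpip.CancellableTimer calls (Reset/StopLocked) for jobs created through the stack (Schedule/Cancel), so SLAAC deprecation, invalidation and regeneration callbacks all run under the NIC lock. The pattern can be sketched roughly as follows, assuming a job is nothing more than a delayed callback guarded by its owner's lock; the real newJob helper in the stack package may differ in detail:

package main

import (
	"fmt"
	"sync"
	"time"
)

// job runs fn after a delay while holding mu, unless it was cancelled first.
// Cancel is called with mu held, mirroring how the NDP state cancels its
// SLAAC jobs while holding the NIC lock.
//
// Note: a production version also needs to tell runs of an old schedule apart
// from the current one (the stack's job type does); this sketch keeps only
// the essence.
type job struct {
	mu        *sync.Mutex
	fn        func()
	timer     *time.Timer
	cancelled bool
}

func newJob(mu *sync.Mutex, fn func()) *job {
	return &job{mu: mu, fn: fn}
}

// Schedule arms the job to run fn after d. The caller holds mu.
func (j *job) Schedule(d time.Duration) {
	j.cancelled = false
	j.timer = time.AfterFunc(d, func() {
		j.mu.Lock()
		defer j.mu.Unlock()
		if j.cancelled {
			// Cancel won the race: the timer fired, but another goroutine
			// already took the lock and cancelled the job.
			return
		}
		j.fn()
	})
}

// Cancel prevents a scheduled run from executing. The caller holds mu.
func (j *job) Cancel() {
	j.cancelled = true
	if j.timer != nil {
		j.timer.Stop()
	}
}

func main() {
	var mu sync.Mutex
	deprecate := newJob(&mu, func() { fmt.Println("address deprecated") })

	mu.Lock()
	deprecate.Schedule(10 * time.Millisecond)
	mu.Unlock()

	// Refreshing a lifetime cancels the pending run and schedules a new one,
	// as refreshSLAACPrefixLifetimes does in the hunks above.
	mu.Lock()
	deprecate.Cancel()
	deprecate.Schedule(20 * time.Millisecond)
	mu.Unlock()

	time.Sleep(50 * time.Millisecond)
}

Checking the cancelled flag under the shared lock is what lets a lifetime refresh cancel and reschedule a pending job without racing its callback.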
@@ -1790,7 +1841,7 @@ func TestAutoGenAddr(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(newMinVLDuration + defaultAsyncEventTimeout): + case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } if containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) { @@ -1917,7 +1968,7 @@ func TestAutoGenTempAddr(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } } @@ -1930,7 +1981,7 @@ func TestAutoGenTempAddr(t *testing.T) { if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } - case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD event") } } @@ -2036,10 +2087,10 @@ func TestAutoGenTempAddr(t *testing.T) { if diff := checkAutoGenAddrEvent(e, nextAddr, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } - case <-time.After(newMinVLDuration + defaultTimeout): + case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" { @@ -2135,7 +2186,7 @@ func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) { if diff := checkDADEvent(e, nicID, llAddr1, true, nil); diff != "" { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } - case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD event") } @@ -2143,7 +2194,7 @@ func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) { select { case e := <-ndpDisp.autoGenAddrC: t.Errorf("got unxpected auto gen addr event = %+v", e) - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncNegativeEventTimeout): } }) } @@ -2220,7 +2271,7 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { if diff := checkDADEvent(e, nicID, addr.Address, true, nil); diff != "" { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } - case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD event") } select { @@ -2228,7 +2279,7 @@ func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) { if diff := checkAutoGenAddrEvent(e, tempAddr, newAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } } @@ 
-2318,13 +2369,13 @@ func TestAutoGenTempAddrRegen(t *testing.T) { } // Wait for regeneration - expectAutoGenAddrEventAsync(tempAddr2, newAddr, regenAfter+defaultAsyncEventTimeout) + expectAutoGenAddrEventAsync(tempAddr2, newAddr, regenAfter+defaultAsyncPositiveEventTimeout) if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2}, nil); mismatch != "" { t.Fatal(mismatch) } // Wait for regeneration - expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncEventTimeout) + expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout) if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2, tempAddr3}, nil); mismatch != "" { t.Fatal(mismatch) } @@ -2341,7 +2392,7 @@ func TestAutoGenTempAddrRegen(t *testing.T) { for _, addr := range tempAddrs { // Wait for a deprecation then invalidation event, or just an invalidation // event. We need to cover both cases but cannot deterministically hit both - // cases because the deprecation and invalidation timers could fire in any + // cases because the deprecation and invalidation jobs could execute in any // order. select { case e := <-ndpDisp.autoGenAddrC: @@ -2353,7 +2404,7 @@ func TestAutoGenTempAddrRegen(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } } else if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff == "" { @@ -2362,12 +2413,12 @@ func TestAutoGenTempAddrRegen(t *testing.T) { select { case e := <-ndpDisp.autoGenAddrC: t.Fatalf("unexpectedly got an auto-generated event = %+v", e) - case <-time.After(defaultTimeout): + case <-time.After(defaultAsyncNegativeEventTimeout): } } else { t.Fatalf("got unexpected auto-generated event = %+v", e) } - case <-time.After(invalidateAfter + defaultAsyncEventTimeout): + case <-time.After(invalidateAfter + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } @@ -2378,9 +2429,9 @@ func TestAutoGenTempAddrRegen(t *testing.T) { } } -// TestAutoGenTempAddrRegenTimerUpdates tests that a temporary address's -// regeneration timer gets updated when refreshing the address's lifetimes. -func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { +// TestAutoGenTempAddrRegenJobUpdates tests that a temporary address's +// regeneration job gets updated when refreshing the address's lifetimes. +func TestAutoGenTempAddrRegenJobUpdates(t *testing.T) { const ( nicID = 1 regenAfter = 2 * time.Second @@ -2472,14 +2523,14 @@ func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { select { case e := <-ndpDisp.autoGenAddrC: t.Fatalf("unexpected auto gen addr event = %+v", e) - case <-time.After(regenAfter + defaultAsyncEventTimeout): + case <-time.After(regenAfter + defaultAsyncNegativeEventTimeout): } // Prefer the prefix again. // // A new temporary address should immediately be generated since the // regeneration time has already passed since the last address was generated - // - this regeneration does not depend on a timer. + // - this regeneration does not depend on a job. 
e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) expectAutoGenAddrEvent(tempAddr2, newAddr) @@ -2501,24 +2552,24 @@ func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) { select { case e := <-ndpDisp.autoGenAddrC: t.Fatalf("unexpected auto gen addr event = %+v", e) - case <-time.After(regenAfter + defaultAsyncEventTimeout): + case <-time.After(regenAfter + defaultAsyncNegativeEventTimeout): } // Set the maximum lifetimes for temporary addresses such that on the next - // RA, the regeneration timer gets reset. + // RA, the regeneration job gets scheduled again. // // The maximum lifetime is the sum of the minimum lifetimes for temporary // addresses + the time that has already passed since the last address was - // generated so that the regeneration timer is needed to generate the next + // generated so that the regeneration job is needed to generate the next // address. - newLifetimes := newMinVLDuration + regenAfter + defaultAsyncEventTimeout + newLifetimes := newMinVLDuration + regenAfter + defaultAsyncNegativeEventTimeout ndpConfigs.MaxTempAddrValidLifetime = newLifetimes ndpConfigs.MaxTempAddrPreferredLifetime = newLifetimes if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil { t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err) } e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100)) - expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncEventTimeout) + expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout) } // TestMixedSLAACAddrConflictRegen tests SLAAC address regeneration in response @@ -2666,7 +2717,7 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } } @@ -2679,7 +2730,7 @@ func TestMixedSLAACAddrConflictRegen(t *testing.T) { if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } - case <-time.After(dupAddrTransmits*retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(dupAddrTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD event") } } @@ -2939,9 +2990,9 @@ func TestAutoGenAddrDeprecateFromPI(t *testing.T) { expectPrimaryAddr(addr2) } -// TestAutoGenAddrTimerDeprecation tests that an address is properly deprecated +// TestAutoGenAddrJobDeprecation tests that an address is properly deprecated // when its preferred lifetime expires. -func TestAutoGenAddrTimerDeprecation(t *testing.T) { +func TestAutoGenAddrJobDeprecation(t *testing.T) { const nicID = 1 const newMinVL = 2 newMinVLDuration := newMinVL * time.Second @@ -3025,7 +3076,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { expectPrimaryAddr(addr1) // Wait for addr of prefix1 to be deprecated. 
- expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout) + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } @@ -3065,7 +3116,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { expectPrimaryAddr(addr1) // Wait for addr of prefix1 to be deprecated. - expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncEventTimeout) + expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout) if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } @@ -3079,7 +3130,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { } // Wait for addr of prefix1 to be invalidated. - expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncEventTimeout) + expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout) if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { t.Fatalf("should not have %s in the list of addresses", addr1) } @@ -3111,7 +3162,7 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } } else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" { @@ -3120,12 +3171,12 @@ func TestAutoGenAddrTimerDeprecation(t *testing.T) { select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly got an auto-generated event") - case <-time.After(defaultTimeout): + case <-time.After(defaultAsyncNegativeEventTimeout): } } else { t.Fatalf("got unexpected auto-generated event") } - case <-time.After(newMinVLDuration + defaultAsyncEventTimeout): + case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { @@ -3250,7 +3301,7 @@ func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(minVLSeconds*time.Second + defaultAsyncEventTimeout): + case <-time.After(minVLSeconds*time.Second + defaultAsyncPositiveEventTimeout): t.Fatal("timeout waiting for addr auto gen event") } }) @@ -3394,7 +3445,7 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) { select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly received an auto gen addr event") - case <-time.After(time.Duration(test.evl)*time.Second - defaultAsyncEventTimeout): + case <-time.After(time.Duration(test.evl)*time.Second - defaultAsyncNegativeEventTimeout): } // Wait for the invalidation event. 
@@ -3403,7 +3454,7 @@ func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(2 * defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timeout waiting for addr auto gen event") } }) @@ -3459,12 +3510,12 @@ func TestAutoGenAddrRemoval(t *testing.T) { } expectAutoGenAddrEvent(addr, invalidatedAddr) - // Wait for the original valid lifetime to make sure the original timer - // got stopped/cleaned up. + // Wait for the original valid lifetime to make sure the original job got + // cancelled/cleaned up. select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly received an auto gen addr event") - case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): + case <-time.After(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout): } } @@ -3627,7 +3678,7 @@ func TestAutoGenAddrStaticConflict(t *testing.T) { select { case <-ndpDisp.autoGenAddrC: t.Fatal("unexpectedly received an auto gen addr event") - case <-time.After(lifetimeSeconds*time.Second + defaultTimeout): + case <-time.After(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout): } if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) { t.Fatalf("Should have %s in the list of addresses", addr1) @@ -3725,7 +3776,7 @@ func TestAutoGenAddrWithOpaqueIID(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultAsyncEventTimeout): + case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { @@ -3792,7 +3843,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for addr auto gen event") } } @@ -3818,7 +3869,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { if diff := checkDADEvent(e, nicID, addr, resolved, nil); diff != "" { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } - case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD event") } } @@ -3985,7 +4036,7 @@ func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) { select { case e := <-ndpDisp.autoGenAddrC: t.Fatalf("unexpectedly got an auto-generated address event = %+v", e) - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncNegativeEventTimeout): } }) } @@ -4104,7 +4155,7 @@ func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) { select { case e := <-ndpDisp.autoGenAddrC: t.Fatalf("unexpectedly got an auto-generated address event = %+v", e) - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncNegativeEventTimeout): } }) } @@ -4206,7 +4257,7 @@ func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) { if diff := checkDADEvent(e, nicID, addr.Address, true, nil); diff != "" { t.Errorf("dad event mismatch (-want +got):\n%s", diff) } - case 
<-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD event") } @@ -4232,7 +4283,7 @@ func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) { if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } - case <-time.After(defaultAsyncEventTimeout): + case <-time.After(defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for invalidated auto gen addr event after deprecation") } } else { @@ -4240,7 +4291,7 @@ func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) { t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff) } } - case <-time.After(lifetimeSeconds*time.Second - failureTimer - dadTransmits*retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(lifetimeSeconds*time.Second - failureTimer - dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for auto gen addr event") } } @@ -4824,7 +4875,7 @@ func TestCleanupNDPState(t *testing.T) { // Should not get any more events (invalidation timers should have been // cancelled when the NDP state was cleaned up). - time.Sleep(lifetimeSeconds*time.Second + defaultTimeout) + time.Sleep(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout) select { case <-ndpDisp.routerC: t.Error("unexpected router event") @@ -5086,16 +5137,15 @@ func TestRouterSolicitation(t *testing.T) { t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want) } - checker.IPv6(t, - p.Pkt.Header.View(), + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(test.expectedSrcAddr), checker.DstAddr(header.IPv6AllRoutersMulticastAddress), checker.TTL(header.NDPHopLimit), checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)), ) - if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want { - t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want) + if l, want := p.Pkt.AvailableHeaderBytes(), int(test.linkHeaderLen); l != want { + t.Errorf("got p.Pkt.AvailableHeaderBytes() = %d; want = %d", l, want) } } waitForNothing := func(timeout time.Duration) { @@ -5127,24 +5177,24 @@ func TestRouterSolicitation(t *testing.T) { // Make sure each RS is sent at the right time. remaining := test.maxRtrSolicit if remaining > 0 { - waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout) + waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncPositiveEventTimeout) remaining-- } for ; remaining > 0; remaining-- { - if test.effectiveRtrSolicitInt > defaultAsyncEventTimeout { - waitForNothing(test.effectiveRtrSolicitInt - defaultAsyncEventTimeout) - waitForPkt(2 * defaultAsyncEventTimeout) + if test.effectiveRtrSolicitInt > defaultAsyncPositiveEventTimeout { + waitForNothing(test.effectiveRtrSolicitInt - defaultAsyncNegativeEventTimeout) + waitForPkt(defaultAsyncPositiveEventTimeout) } else { - waitForPkt(test.effectiveRtrSolicitInt * defaultAsyncEventTimeout) + waitForPkt(test.effectiveRtrSolicitInt + defaultAsyncPositiveEventTimeout) } } // Make sure no more RS. 
if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay { - waitForNothing(test.effectiveRtrSolicitInt + defaultAsyncEventTimeout) + waitForNothing(test.effectiveRtrSolicitInt + defaultAsyncNegativeEventTimeout) } else { - waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultAsyncEventTimeout) + waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultAsyncNegativeEventTimeout) } // Make sure the counter got properly @@ -5240,7 +5290,7 @@ func TestStopStartSolicitingRouters(t *testing.T) { if p.Proto != header.IPv6ProtocolNumber { t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber) } - checker.IPv6(t, p.Pkt.Header.View(), + checker.IPv6(t, stack.PayloadSince(p.Pkt.NetworkHeader()), checker.SrcAddr(header.IPv6Any), checker.DstAddr(header.IPv6AllRoutersMulticastAddress), checker.TTL(header.NDPHopLimit), @@ -5260,11 +5310,11 @@ func TestStopStartSolicitingRouters(t *testing.T) { // Stop soliciting routers. test.stopFn(t, s, true /* first */) - ctx, cancel := context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout) + ctx, cancel := context.WithTimeout(context.Background(), delay+defaultAsyncNegativeEventTimeout) defer cancel() if _, ok := e.ReadContext(ctx); ok { // A single RS may have been sent before solicitations were stopped. - ctx, cancel := context.WithTimeout(context.Background(), interval+defaultAsyncEventTimeout) + ctx, cancel := context.WithTimeout(context.Background(), interval+defaultAsyncNegativeEventTimeout) defer cancel() if _, ok = e.ReadContext(ctx); ok { t.Fatal("should not have sent more than one RS message") @@ -5274,7 +5324,7 @@ func TestStopStartSolicitingRouters(t *testing.T) { // Stopping router solicitations after it has already been stopped should // do nothing. test.stopFn(t, s, false /* first */) - ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout) + ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncNegativeEventTimeout) defer cancel() if _, ok := e.ReadContext(ctx); ok { t.Fatal("unexpectedly got a packet after router solicitation has been stopepd") @@ -5287,10 +5337,10 @@ func TestStopStartSolicitingRouters(t *testing.T) { // Start soliciting routers. test.startFn(t, s) - waitForPkt(delay + defaultAsyncEventTimeout) - waitForPkt(interval + defaultAsyncEventTimeout) - waitForPkt(interval + defaultAsyncEventTimeout) - ctx, cancel = context.WithTimeout(context.Background(), interval+defaultAsyncEventTimeout) + waitForPkt(delay + defaultAsyncPositiveEventTimeout) + waitForPkt(interval + defaultAsyncPositiveEventTimeout) + waitForPkt(interval + defaultAsyncPositiveEventTimeout) + ctx, cancel = context.WithTimeout(context.Background(), interval+defaultAsyncNegativeEventTimeout) defer cancel() if _, ok := e.ReadContext(ctx); ok { t.Fatal("unexpectedly got an extra packet after sending out the expected RSs") @@ -5299,7 +5349,7 @@ func TestStopStartSolicitingRouters(t *testing.T) { // Starting router solicitations after it has already completed should do // nothing. 
test.startFn(t, s) - ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncEventTimeout) + ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncNegativeEventTimeout) defer cancel() if _, ok := e.ReadContext(ctx); ok { t.Fatal("unexpectedly got a packet after finishing router solicitations") diff --git a/pkg/tcpip/stack/neighbor_cache.go b/pkg/tcpip/stack/neighbor_cache.go new file mode 100644 index 000000000..27e1feec0 --- /dev/null +++ b/pkg/tcpip/stack/neighbor_cache.go @@ -0,0 +1,333 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + "time" + + "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/sync" + "gvisor.dev/gvisor/pkg/tcpip" +) + +const neighborCacheSize = 512 // max entries per interface + +// neighborCache maps IP addresses to link addresses. It uses the Least +// Recently Used (LRU) eviction strategy to implement a bounded cache for +// dynmically acquired entries. It contains the state machine and configuration +// for running Neighbor Unreachability Detection (NUD). +// +// There are two types of entries in the neighbor cache: +// 1. Dynamic entries are discovered automatically by neighbor discovery +// protocols (e.g. ARP, NDP). These protocols will attempt to reconfirm +// reachability with the device once the entry's state becomes Stale. +// 2. Static entries are explicitly added by a user and have no expiration. +// Their state is always Static. The amount of static entries stored in the +// cache is unbounded. +// +// neighborCache implements NUDHandler. +type neighborCache struct { + nic *NIC + state *NUDState + + // mu protects the fields below. + mu sync.RWMutex + + cache map[tcpip.Address]*neighborEntry + dynamic struct { + lru neighborEntryList + + // count tracks the amount of dynamic entries in the cache. This is + // needed since static entries do not count towards the LRU cache + // eviction strategy. + count uint16 + } +} + +var _ NUDHandler = (*neighborCache)(nil) + +// getOrCreateEntry retrieves a cache entry associated with addr. The +// returned entry is always refreshed in the cache (it is reachable via the +// map, and its place is bumped in LRU). +// +// If a matching entry exists in the cache, it is returned. If no matching +// entry exists and the cache is full, an existing entry is evicted via LRU, +// reset to state incomplete, and returned. If no matching entry exists and the +// cache is not full, a new entry with state incomplete is allocated and +// returned. 
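The bounded LRU behaviour described in the comment above can be sketched with container/list, assuming dynamic entries are keyed by address and static entries are exempt from eviction; this is only an illustration, the actual code uses neighborEntryList and per-entry locks:

package main

import (
	"container/list"
	"fmt"
)

const cacheSize = 3 // stands in for neighborCacheSize (512 per interface)

// lruCache is a toy bounded cache: a hit moves the key to the front, and a
// miss on a full cache evicts the least recently used key, mirroring the
// dynamic entry handling described above.
type lruCache struct {
	entries map[string]*list.Element
	order   *list.List // front = most recently used
}

func newLRUCache() *lruCache {
	return &lruCache{entries: make(map[string]*list.Element), order: list.New()}
}

func (c *lruCache) getOrCreate(addr string) {
	if e, ok := c.entries[addr]; ok {
		c.order.MoveToFront(e) // refresh position on every lookup
		return
	}
	if len(c.entries) == cacheSize {
		back := c.order.Back()
		evicted := c.order.Remove(back).(string)
		delete(c.entries, evicted)
		fmt.Println("evicted", evicted)
	}
	c.entries[addr] = c.order.PushFront(addr)
}

func main() {
	c := newLRUCache()
	for _, a := range []string{"fe80::1", "fe80::2", "fe80::3", "fe80::1", "fe80::4"} {
		c.getOrCreate(a)
	}
	// "fe80::2" is evicted: it is the least recently used once "fe80::1" is
	// looked up again before "fe80::4" is inserted.
}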
+func (n *neighborCache) getOrCreateEntry(remoteAddr, localAddr tcpip.Address, linkRes LinkAddressResolver) *neighborEntry { + n.mu.Lock() + defer n.mu.Unlock() + + if entry, ok := n.cache[remoteAddr]; ok { + entry.mu.RLock() + if entry.neigh.State != Static { + n.dynamic.lru.Remove(entry) + n.dynamic.lru.PushFront(entry) + } + entry.mu.RUnlock() + return entry + } + + // The entry that needs to be created must be dynamic since all static + // entries are directly added to the cache via addStaticEntry. + entry := newNeighborEntry(n.nic, remoteAddr, localAddr, n.state, linkRes) + if n.dynamic.count == neighborCacheSize { + e := n.dynamic.lru.Back() + e.mu.Lock() + + delete(n.cache, e.neigh.Addr) + n.dynamic.lru.Remove(e) + n.dynamic.count-- + + e.dispatchRemoveEventLocked() + e.setStateLocked(Unknown) + e.notifyWakersLocked() + e.mu.Unlock() + } + n.cache[remoteAddr] = entry + n.dynamic.lru.PushFront(entry) + n.dynamic.count++ + return entry +} + +// entry looks up the neighbor cache for translating address to link address +// (e.g. IP -> MAC). If the LinkEndpoint requests address resolution and there +// is a LinkAddressResolver registered with the network protocol, the cache +// attempts to resolve the address and returns ErrWouldBlock. If a Waker is +// provided, it will be notified when address resolution is complete (success +// or not). +// +// If address resolution is required, ErrNoLinkAddress and a notification +// channel is returned for the top level caller to block. Channel is closed +// once address resolution is complete (success or not). +func (n *neighborCache) entry(remoteAddr, localAddr tcpip.Address, linkRes LinkAddressResolver, w *sleep.Waker) (NeighborEntry, <-chan struct{}, *tcpip.Error) { + if linkAddr, ok := linkRes.ResolveStaticAddress(remoteAddr); ok { + e := NeighborEntry{ + Addr: remoteAddr, + LocalAddr: localAddr, + LinkAddr: linkAddr, + State: Static, + UpdatedAt: time.Now(), + } + return e, nil, nil + } + + entry := n.getOrCreateEntry(remoteAddr, localAddr, linkRes) + entry.mu.Lock() + defer entry.mu.Unlock() + + switch s := entry.neigh.State; s { + case Reachable, Static: + return entry.neigh, nil, nil + + case Unknown, Incomplete, Stale, Delay, Probe: + entry.addWakerLocked(w) + + if entry.done == nil { + // Address resolution needs to be initiated. + if linkRes == nil { + return entry.neigh, nil, tcpip.ErrNoLinkAddress + } + entry.done = make(chan struct{}) + } + + entry.handlePacketQueuedLocked() + return entry.neigh, entry.done, tcpip.ErrWouldBlock + + case Failed: + return entry.neigh, nil, tcpip.ErrNoLinkAddress + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", s)) + } +} + +// removeWaker removes a waker that has been added when link resolution for +// addr was requested. +func (n *neighborCache) removeWaker(addr tcpip.Address, waker *sleep.Waker) { + n.mu.Lock() + if entry, ok := n.cache[addr]; ok { + delete(entry.wakers, waker) + } + n.mu.Unlock() +} + +// entries returns all entries in the neighbor cache. +func (n *neighborCache) entries() []NeighborEntry { + entries := make([]NeighborEntry, 0, len(n.cache)) + n.mu.RLock() + for _, entry := range n.cache { + entry.mu.RLock() + entries = append(entries, entry.neigh) + entry.mu.RUnlock() + } + n.mu.RUnlock() + return entries +} + +// addStaticEntry adds a static entry to the neighbor cache, mapping an IP +// address to a link address. If a dynamic entry exists in the neighbor cache +// with the same address, it will be replaced with this static entry. 
If a +// static entry exists with the same address but different link address, it +// will be updated with the new link address. If a static entry exists with the +// same address and link address, nothing will happen. +func (n *neighborCache) addStaticEntry(addr tcpip.Address, linkAddr tcpip.LinkAddress) { + n.mu.Lock() + defer n.mu.Unlock() + + if entry, ok := n.cache[addr]; ok { + entry.mu.Lock() + if entry.neigh.State != Static { + // Dynamic entry found with the same address. + n.dynamic.lru.Remove(entry) + n.dynamic.count-- + } else if entry.neigh.LinkAddr == linkAddr { + // Static entry found with the same address and link address. + entry.mu.Unlock() + return + } else { + // Static entry found with the same address but different link address. + entry.neigh.LinkAddr = linkAddr + entry.dispatchChangeEventLocked(entry.neigh.State) + entry.mu.Unlock() + return + } + + // Notify that resolution has been interrupted, just in case the entry was + // in the Incomplete or Probe state. + entry.dispatchRemoveEventLocked() + entry.setStateLocked(Unknown) + entry.notifyWakersLocked() + entry.mu.Unlock() + } + + entry := newStaticNeighborEntry(n.nic, addr, linkAddr, n.state) + n.cache[addr] = entry +} + +// removeEntryLocked removes the specified entry from the neighbor cache. +func (n *neighborCache) removeEntryLocked(entry *neighborEntry) { + if entry.neigh.State != Static { + n.dynamic.lru.Remove(entry) + n.dynamic.count-- + } + if entry.neigh.State != Failed { + entry.dispatchRemoveEventLocked() + } + entry.setStateLocked(Unknown) + entry.notifyWakersLocked() + + delete(n.cache, entry.neigh.Addr) +} + +// removeEntry removes a dynamic or static entry by address from the neighbor +// cache. Returns true if the entry was found and deleted. +func (n *neighborCache) removeEntry(addr tcpip.Address) bool { + n.mu.Lock() + defer n.mu.Unlock() + + entry, ok := n.cache[addr] + if !ok { + return false + } + + entry.mu.Lock() + defer entry.mu.Unlock() + + n.removeEntryLocked(entry) + return true +} + +// clear removes all dynamic and static entries from the neighbor cache. +func (n *neighborCache) clear() { + n.mu.Lock() + defer n.mu.Unlock() + + for _, entry := range n.cache { + entry.mu.Lock() + entry.dispatchRemoveEventLocked() + entry.setStateLocked(Unknown) + entry.notifyWakersLocked() + entry.mu.Unlock() + } + + n.dynamic.lru = neighborEntryList{} + n.cache = make(map[tcpip.Address]*neighborEntry) + n.dynamic.count = 0 +} + +// config returns the NUD configuration. +func (n *neighborCache) config() NUDConfigurations { + return n.state.Config() +} + +// setConfig changes the NUD configuration. +// +// If config contains invalid NUD configuration values, it will be fixed to +// use default values for the erroneous values. +func (n *neighborCache) setConfig(config NUDConfigurations) { + config.resetInvalidFields() + n.state.SetConfig(config) +} + +// HandleProbe implements NUDHandler.HandleProbe by following the logic defined +// in RFC 4861 section 7.2.3. Validation of the probe is expected to be handled +// by the caller. +func (n *neighborCache) HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver) { + entry := n.getOrCreateEntry(remoteAddr, localAddr, linkRes) + entry.mu.Lock() + entry.handleProbeLocked(remoteLinkAddr) + entry.mu.Unlock() +} + +// HandleConfirmation implements NUDHandler.HandleConfirmation by following the +// logic defined in RFC 4861 section 7.2.5. 
+// +// TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other +// attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol +// should be deployed where preventing access to the broadcast segment might +// not be possible. SEND uses RSA key pairs to produce cryptographically +// generated addresses, as defined in RFC 3972, Cryptographically Generated +// Addresses (CGA). This ensures that the claimed source of an NDP message is +// the owner of the claimed address. +func (n *neighborCache) HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) { + n.mu.RLock() + entry, ok := n.cache[addr] + n.mu.RUnlock() + if ok { + entry.mu.Lock() + entry.handleConfirmationLocked(linkAddr, flags) + entry.mu.Unlock() + } + // The confirmation SHOULD be silently discarded if the recipient did not + // initiate any communication with the target. This is indicated if there is + // no matching entry for the remote address. +} + +// HandleUpperLevelConfirmation implements +// NUDHandler.HandleUpperLevelConfirmation by following the logic defined in +// RFC 4861 section 7.3.1. +func (n *neighborCache) HandleUpperLevelConfirmation(addr tcpip.Address) { + n.mu.RLock() + entry, ok := n.cache[addr] + n.mu.RUnlock() + if ok { + entry.mu.Lock() + entry.handleUpperLevelConfirmationLocked() + entry.mu.Unlock() + } +} diff --git a/pkg/tcpip/stack/neighbor_cache_test.go b/pkg/tcpip/stack/neighbor_cache_test.go new file mode 100644 index 000000000..b4fa69e3e --- /dev/null +++ b/pkg/tcpip/stack/neighbor_cache_test.go @@ -0,0 +1,1726 @@ +// Copyright 2019 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "math/rand" + "strings" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/tcpip" +) + +const ( + // entryStoreSize is the default number of entries that will be generated and + // added to the entry store. This number needs to be larger than the size of + // the neighbor cache to give ample opportunity for verifying behavior during + // cache overflows. Four times the size of the neighbor cache allows for + // three complete cache overflows. + entryStoreSize = 4 * neighborCacheSize + + // typicalLatency is the typical latency for an ARP or NDP packet to travel + // to a router and back. + typicalLatency = time.Millisecond + + // testEntryBroadcastAddr is a special address that indicates a packet should + // be sent to all nodes. + testEntryBroadcastAddr = tcpip.Address("broadcast") + + // testEntryLocalAddr is the source address of neighbor probes. + testEntryLocalAddr = tcpip.Address("local_addr") + + // testEntryBroadcastLinkAddr is a special link address sent back to + // multicast neighbor probes. 
+ testEntryBroadcastLinkAddr = tcpip.LinkAddress("mac_broadcast") + + // infiniteDuration indicates that a task will not occur in our lifetime. + infiniteDuration = time.Duration(math.MaxInt64) +) + +// entryDiffOpts returns the options passed to cmp.Diff to compare neighbor +// entries. The UpdatedAt field is ignored due to a lack of a deterministic +// method to predict the time that an event will be dispatched. +func entryDiffOpts() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(NeighborEntry{}, "UpdatedAt"), + } +} + +// entryDiffOptsWithSort is like entryDiffOpts but also includes an option to +// sort slices of entries for cases where ordering must be ignored. +func entryDiffOptsWithSort() []cmp.Option { + return []cmp.Option{ + cmpopts.IgnoreFields(NeighborEntry{}, "UpdatedAt"), + cmpopts.SortSlices(func(a, b NeighborEntry) bool { + return strings.Compare(string(a.Addr), string(b.Addr)) < 0 + }), + } +} + +func newTestNeighborCache(nudDisp NUDDispatcher, config NUDConfigurations, clock tcpip.Clock) *neighborCache { + config.resetInvalidFields() + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + return &neighborCache{ + nic: &NIC{ + stack: &Stack{ + clock: clock, + nudDisp: nudDisp, + }, + id: 1, + }, + state: NewNUDState(config, rng), + cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), + } +} + +// testEntryStore contains a set of IP to NeighborEntry mappings. +type testEntryStore struct { + mu sync.RWMutex + entriesMap map[tcpip.Address]NeighborEntry +} + +func toAddress(i int) tcpip.Address { + buf := new(bytes.Buffer) + binary.Write(buf, binary.BigEndian, uint8(1)) + binary.Write(buf, binary.BigEndian, uint8(0)) + binary.Write(buf, binary.BigEndian, uint16(i)) + return tcpip.Address(buf.String()) +} + +func toLinkAddress(i int) tcpip.LinkAddress { + buf := new(bytes.Buffer) + binary.Write(buf, binary.BigEndian, uint8(1)) + binary.Write(buf, binary.BigEndian, uint8(0)) + binary.Write(buf, binary.BigEndian, uint32(i)) + return tcpip.LinkAddress(buf.String()) +} + +// newTestEntryStore returns a testEntryStore pre-populated with entries. +func newTestEntryStore() *testEntryStore { + store := &testEntryStore{ + entriesMap: make(map[tcpip.Address]NeighborEntry), + } + for i := 0; i < entryStoreSize; i++ { + addr := toAddress(i) + linkAddr := toLinkAddress(i) + + store.entriesMap[addr] = NeighborEntry{ + Addr: addr, + LocalAddr: testEntryLocalAddr, + LinkAddr: linkAddr, + } + } + return store +} + +// size returns the number of entries in the store. +func (s *testEntryStore) size() int { + s.mu.RLock() + defer s.mu.RUnlock() + return len(s.entriesMap) +} + +// entry returns the entry at index i. Returns an empty entry and false if i is +// out of bounds. +func (s *testEntryStore) entry(i int) (NeighborEntry, bool) { + return s.entryByAddr(toAddress(i)) +} + +// entryByAddr returns the entry matching addr for situations when the index is +// not available. Returns an empty entry and false if no entries match addr. +func (s *testEntryStore) entryByAddr(addr tcpip.Address) (NeighborEntry, bool) { + s.mu.RLock() + defer s.mu.RUnlock() + entry, ok := s.entriesMap[addr] + return entry, ok +} + +// entries returns all entries in the store. 
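The entryDiffOpts and entryDiffOptsWithSort helpers above exist for two reasons: a neighbor entry's UpdatedAt timestamp is taken from the wall clock and cannot be predicted by the test, and the cache reports entries in no particular order, so slices must be sorted before comparison. The same go-cmp pattern in isolation, as a standalone sketch (illustration only; the struct and the values below are hypothetical stand-ins for NeighborEntry):

    package main

    import (
        "fmt"
        "time"

        "github.com/google/go-cmp/cmp"
        "github.com/google/go-cmp/cmp/cmpopts"
    )

    // entry stands in for stack.NeighborEntry in this sketch.
    type entry struct {
        Addr      string
        LinkAddr  string
        UpdatedAt time.Time
    }

    func main() {
        got := []entry{
            {Addr: "b", LinkAddr: "2", UpdatedAt: time.Now()},
            {Addr: "a", LinkAddr: "1", UpdatedAt: time.Now()},
        }
        want := []entry{
            {Addr: "a", LinkAddr: "1"},
            {Addr: "b", LinkAddr: "2"},
        }
        opts := []cmp.Option{
            // UpdatedAt is nondeterministic, so exclude it from the comparison.
            cmpopts.IgnoreFields(entry{}, "UpdatedAt"),
            // Ordering is nondeterministic, so sort both slices before diffing.
            cmpopts.SortSlices(func(a, b entry) bool { return a.Addr < b.Addr }),
        }
        // Prints an empty diff: with these options the slices compare equal.
        fmt.Println(cmp.Diff(want, got, opts...))
    }
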
+func (s *testEntryStore) entries() []NeighborEntry { + entries := make([]NeighborEntry, 0, len(s.entriesMap)) + s.mu.RLock() + defer s.mu.RUnlock() + for i := 0; i < entryStoreSize; i++ { + addr := toAddress(i) + if entry, ok := s.entriesMap[addr]; ok { + entries = append(entries, entry) + } + } + return entries +} + +// set modifies the link addresses of an entry. +func (s *testEntryStore) set(i int, linkAddr tcpip.LinkAddress) { + addr := toAddress(i) + s.mu.Lock() + defer s.mu.Unlock() + if entry, ok := s.entriesMap[addr]; ok { + entry.LinkAddr = linkAddr + s.entriesMap[addr] = entry + } +} + +// testNeighborResolver implements LinkAddressResolver to emulate sending a +// neighbor probe. +type testNeighborResolver struct { + clock tcpip.Clock + neigh *neighborCache + entries *testEntryStore + delay time.Duration + onLinkAddressRequest func() +} + +var _ LinkAddressResolver = (*testNeighborResolver)(nil) + +func (r *testNeighborResolver) LinkAddressRequest(addr, localAddr tcpip.Address, linkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { + // Delay handling the request to emulate network latency. + r.clock.AfterFunc(r.delay, func() { + r.fakeRequest(addr) + }) + + // Execute post address resolution action, if available. + if f := r.onLinkAddressRequest; f != nil { + f() + } + return nil +} + +// fakeRequest emulates handling a response for a link address request. +func (r *testNeighborResolver) fakeRequest(addr tcpip.Address) { + if entry, ok := r.entries.entryByAddr(addr); ok { + r.neigh.HandleConfirmation(addr, entry.LinkAddr, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + } +} + +func (*testNeighborResolver) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + if addr == testEntryBroadcastAddr { + return testEntryBroadcastLinkAddr, true + } + return "", false +} + +func (*testNeighborResolver) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return 0 +} + +type entryEvent struct { + nicID tcpip.NICID + address tcpip.Address + linkAddr tcpip.LinkAddress + state NeighborState +} + +func TestNeighborCacheGetConfig(t *testing.T) { + nudDisp := testNUDDispatcher{} + c := DefaultNUDConfigurations() + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, c, clock) + + if got, want := neigh.config(), c; got != want { + t.Errorf("got neigh.config() = %+v, want = %+v", got, want) + } + + // No events should have been dispatched. + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheSetConfig(t *testing.T) { + nudDisp := testNUDDispatcher{} + c := DefaultNUDConfigurations() + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, c, clock) + + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + neigh.setConfig(c) + + if got, want := neigh.config(), c; got != want { + t.Errorf("got neigh.config() = %+v, want = %+v", got, want) + } + + // No events should have been dispatched. 
+ nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheEntry(t *testing.T) { + c := DefaultNUDConfigurations() + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, c, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + + clock.advance(typicalLatency) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != nil { + t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + + // No more events should have been dispatched. + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheRemoveEntry(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + + clock.advance(typicalLatency) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + neigh.removeEntry(entry.Addr) + + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) 
+ nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } +} + +type testContext struct { + clock *fakeClock + neigh *neighborCache + store *testEntryStore + linkRes *testNeighborResolver + nudDisp *testNUDDispatcher +} + +func newTestContext(c NUDConfigurations) testContext { + nudDisp := &testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(nudDisp, c, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + return testContext{ + clock: clock, + neigh: neigh, + store: store, + linkRes: linkRes, + nudDisp: nudDisp, + } +} + +type overflowOptions struct { + startAtEntryIndex int + wantStaticEntries []NeighborEntry +} + +func (c *testContext) overflowCache(opts overflowOptions) error { + // Fill the neighbor cache to capacity to verify the LRU eviction strategy is + // working properly after the entry removal. + for i := opts.startAtEntryIndex; i < c.store.size(); i++ { + // Add a new entry + entry, ok := c.store.entry(i) + if !ok { + return fmt.Errorf("c.store.entry(%d) not found", i) + } + if _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil); err != tcpip.ErrWouldBlock { + return fmt.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(c.neigh.config().RetransmitTimer) + + var wantEvents []testEntryEventInfo + + // When beyond the full capacity, the cache will evict an entry as per the + // LRU eviction strategy. Note that the number of static entries should not + // affect the total number of dynamic entries that can be added. + if i >= neighborCacheSize+opts.startAtEntryIndex { + removedEntry, ok := c.store.entry(i - neighborCacheSize) + if !ok { + return fmt.Errorf("store.entry(%d) not found", i-neighborCacheSize) + } + wantEvents = append(wantEvents, testEntryEventInfo{ + EventType: entryTestRemoved, + NICID: 1, + Addr: removedEntry.Addr, + LinkAddr: removedEntry.LinkAddr, + State: Reachable, + }) + } + + wantEvents = append(wantEvents, testEntryEventInfo{ + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, testEntryEventInfo{ + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }) + + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + return fmt.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Expect to find only the most recent entries. The order of entries reported + // by entries() is undeterministic, so entries have to be sorted before + // comparison. 
+ wantUnsortedEntries := opts.wantStaticEntries + for i := c.store.size() - neighborCacheSize; i < c.store.size(); i++ { + entry, ok := c.store.entry(i) + if !ok { + return fmt.Errorf("c.store.entry(%d) not found", i) + } + wantEntry := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + wantUnsortedEntries = append(wantUnsortedEntries, wantEntry) + } + + if diff := cmp.Diff(c.neigh.entries(), wantUnsortedEntries, entryDiffOptsWithSort()...); diff != "" { + return fmt.Errorf("neighbor entries mismatch (-got, +want):\n%s", diff) + } + + // No more events should have been dispatched. + c.nudDisp.mu.Lock() + defer c.nudDisp.mu.Unlock() + if diff := cmp.Diff(c.nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + return fmt.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + return nil +} + +// TestNeighborCacheOverflow verifies that the LRU cache eviction strategy +// respects the dynamic entry count. +func TestNeighborCacheOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +// TestNeighborCacheRemoveEntryThenOverflow verifies that the LRU cache +// eviction strategy respects the dynamic entry count when an entry is removed. +func TestNeighborCacheRemoveEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a dynamic entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(c.neigh.config().RetransmitTimer) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Remove the entry + c.neigh.removeEntry(entry.Addr) + + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) 
+ c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +// TestNeighborCacheDuplicateStaticEntryWithSameLinkAddress verifies that +// adding a duplicate static entry with the same link address does not dispatch +// any events. +func TestNeighborCacheDuplicateStaticEntryWithSameLinkAddress(t *testing.T) { + config := DefaultNUDConfigurations() + c := newTestContext(config) + + // Add a static entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Remove the static entry that was just added + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + + // No more events should have been dispatched. + c.nudDisp.mu.Lock() + defer c.nudDisp.mu.Unlock() + if diff := cmp.Diff(c.nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +// TestNeighborCacheDuplicateStaticEntryWithDifferentLinkAddress verifies that +// adding a duplicate static entry with a different link address dispatches a +// change event. +func TestNeighborCacheDuplicateStaticEntryWithDifferentLinkAddress(t *testing.T) { + config := DefaultNUDConfigurations() + c := newTestContext(config) + + // Add a static entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Add a duplicate entry with a different link address + staticLinkAddr += "duplicate" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + defer c.nudDisp.mu.Unlock() + if diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } +} + +// TestNeighborCacheRemoveStaticEntryThenOverflow verifies that the LRU cache +// eviction strategy respects the dynamic entry count when a static entry is +// added then removed. In this case, the dynamic entry count shouldn't have +// been touched. 
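The two duplicate-static-entry tests above pin down the addStaticEntry contract described in neighbor_cache.go: re-adding an address with the same link address is a no-op, while a different link address updates the existing entry in place and dispatches only a change event. A hypothetical driver (illustration only; it reuses the newTestContext helper from this file, and the address and link-address literals are placeholders) showing the expected event sequence:

    func sketchStaticEntrySemantics() {
        c := newTestContext(DefaultNUDConfigurations())

        c.neigh.addStaticEntry("addr", "link1") // Added event, State: Static
        c.neigh.addStaticEntry("addr", "link1") // same address and link address: no event
        c.neigh.addStaticEntry("addr", "link2") // different link address: Changed event, entry updated in place
        c.neigh.removeEntry("addr")             // Removed event
    }
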
+func TestNeighborCacheRemoveStaticEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a static entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Remove the static entry that was just added + c.neigh.removeEntry(entry.Addr) + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +// TestNeighborCacheOverwriteWithStaticEntryThenOverflow verifies that the LRU +// cache eviction strategy keeps count of the dynamic entry count when an entry +// is overwritten by a static entry. Static entries should not count towards +// the size of the LRU cache. +func TestNeighborCacheOverwriteWithStaticEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a dynamic entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(typicalLatency) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Override the entry with a static one using the same address + staticLinkAddr := entry.LinkAddr + "static" + c.neigh.addStaticEntry(entry.Addr, staticLinkAddr) + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: staticLinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) 
+ c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 1, + wantStaticEntries: []NeighborEntry{ + { + Addr: entry.Addr, + LocalAddr: "", // static entries don't need a local address + LinkAddr: staticLinkAddr, + State: Static, + }, + }, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +func TestNeighborCacheNotifiesWaker(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + w := sleep.Waker{} + s := sleep.Sleeper{} + const wakerID = 1 + s.AddWaker(&w, wakerID) + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, &w) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, _ = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + if doneCh == nil { + t.Fatalf("expected done channel from neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr) + } + clock.advance(typicalLatency) + + select { + case <-doneCh: + default: + t.Fatal("expected notification from done channel") + } + + id, ok := s.Fetch(false /* block */) + if !ok { + t.Errorf("expected waker to be notified after neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr) + } + if id != wakerID { + t.Errorf("got s.Fetch(false) = %d, want = %d", id, wakerID) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheRemoveWaker(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + w := sleep.Waker{} + s := sleep.Sleeper{} + const wakerID = 1 + s.AddWaker(&w, wakerID) + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, &w) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, _) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + if doneCh == nil { + t.Fatalf("expected done channel from neigh.entry(%s, %s, _, _)", entry.Addr, entry.LocalAddr) + } + + // Remove the waker before the neighbor cache has the opportunity to send a + // notification. 
+ neigh.removeWaker(entry.Addr, &w) + clock.advance(typicalLatency) + + select { + case <-doneCh: + default: + t.Fatal("expected notification from done channel") + } + + if id, ok := s.Fetch(false /* block */); ok { + t.Errorf("unexpected notification from waker with id %d", id) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheAddStaticEntryThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + c.neigh.addStaticEntry(entry.Addr, entry.LinkAddr) + e, _, err := c.neigh.entry(entry.Addr, "", c.linkRes, nil) + if err != nil { + t.Errorf("unexpected error from c.neigh.entry(%s, \"\", _, nil): %s", entry.Addr, err) + } + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: "", // static entries don't need a local address + LinkAddr: entry.LinkAddr, + State: Static, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("c.neigh.entry(%s, \"\", _, nil) mismatch (-got, +want):\n%s", entry.Addr, diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Static, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + opts := overflowOptions{ + startAtEntryIndex: 1, + wantStaticEntries: []NeighborEntry{ + { + Addr: entry.Addr, + LocalAddr: "", // static entries don't need a local address + LinkAddr: entry.LinkAddr, + State: Static, + }, + }, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +func TestNeighborCacheClear(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + // Add a dynamic entry. + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) 
+ nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Add a static entry. + neigh.addStaticEntry(entryTestAddr1, entryTestLinkAddr1) + + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Static, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Clear shoud remove both dynamic and static entries. + neigh.clear() + + // Remove events dispatched from clear() have no deterministic order so they + // need to be sorted beforehand. + wantUnsortedEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Static, + }, + } + nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, wantUnsortedEvents, eventDiffOptsWithSort()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +// TestNeighborCacheClearThenOverflow verifies that the LRU cache eviction +// strategy keeps count of the dynamic entry count when all entries are +// cleared. +func TestNeighborCacheClearThenOverflow(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + c := newTestContext(config) + + // Add a dynamic entry + entry, ok := c.store.entry(0) + if !ok { + t.Fatalf("c.store.entry(0) not found") + } + _, _, err := c.neigh.entry(entry.Addr, entry.LocalAddr, c.linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got c.neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + c.clock.advance(typicalLatency) + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) + c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + + // Clear the cache. + c.neigh.clear() + { + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + c.nudDisp.mu.Lock() + diff := cmp.Diff(c.nudDisp.events, wantEvents, eventDiffOpts()...) 
+ c.nudDisp.events = nil + c.nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + opts := overflowOptions{ + startAtEntryIndex: 0, + } + if err := c.overflowCache(opts); err != nil { + t.Errorf("c.overflowCache(%+v): %s", opts, err) + } +} + +func TestNeighborCacheKeepFrequentlyUsed(t *testing.T) { + config := DefaultNUDConfigurations() + // Stay in Reachable so the cache can overflow + config.BaseReachableTime = infiniteDuration + config.MinRandomFactor = 1 + config.MaxRandomFactor = 1 + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + frequentlyUsedEntry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + + // The following logic is very similar to overflowCache, but + // periodically refreshes the frequently used entry. + + // Fill the neighbor cache to capacity + for i := 0; i < neighborCacheSize; i++ { + entry, ok := store.entry(i) + if !ok { + t.Fatalf("store.entry(%d) not found", i) + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) 
+ nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Keep adding more entries + for i := neighborCacheSize; i < store.size(); i++ { + // Periodically refresh the frequently used entry + if i%(neighborCacheSize/2) == 0 { + _, _, err := neigh.entry(frequentlyUsedEntry.Addr, frequentlyUsedEntry.LocalAddr, linkRes, nil) + if err != nil { + t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", frequentlyUsedEntry.Addr, frequentlyUsedEntry.LocalAddr, err) + } + } + + entry, ok := store.entry(i) + if !ok { + t.Fatalf("store.entry(%d) not found", i) + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + + // An entry should have been removed, as per the LRU eviction strategy + removedEntry, ok := store.entry(i - neighborCacheSize + 1) + if !ok { + t.Fatalf("store.entry(%d) not found", i-neighborCacheSize+1) + } + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestRemoved, + NICID: 1, + Addr: removedEntry.Addr, + LinkAddr: removedEntry.LinkAddr, + State: Reachable, + }, + { + EventType: entryTestAdded, + NICID: 1, + Addr: entry.Addr, + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: 1, + Addr: entry.Addr, + LinkAddr: entry.LinkAddr, + State: Reachable, + }, + } + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.events = nil + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } + + // Expect to find only the frequently used entry and the most recent entries. + // The order of entries reported by entries() is undeterministic, so entries + // have to be sorted before comparison. + wantUnsortedEntries := []NeighborEntry{ + { + Addr: frequentlyUsedEntry.Addr, + LocalAddr: frequentlyUsedEntry.LocalAddr, + LinkAddr: frequentlyUsedEntry.LinkAddr, + State: Reachable, + }, + } + + for i := store.size() - neighborCacheSize + 1; i < store.size(); i++ { + entry, ok := store.entry(i) + if !ok { + t.Fatalf("store.entry(%d) not found", i) + } + wantEntry := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + wantUnsortedEntries = append(wantUnsortedEntries, wantEntry) + } + + if diff := cmp.Diff(neigh.entries(), wantUnsortedEntries, entryDiffOptsWithSort()...); diff != "" { + t.Errorf("neighbor entries mismatch (-got, +want):\n%s", diff) + } + + // No more events should have been dispatched. 
+ nudDisp.mu.Lock() + defer nudDisp.mu.Unlock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheConcurrent(t *testing.T) { + const concurrentProcesses = 16 + + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + storeEntries := store.entries() + for _, entry := range storeEntries { + var wg sync.WaitGroup + for r := 0; r < concurrentProcesses; r++ { + wg.Add(1) + go func(entry NeighborEntry) { + defer wg.Done() + e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil && err != tcpip.ErrWouldBlock { + t.Errorf("got neigh.entry(%s, %s, _, nil) = (%+v, _, %s), want (_, _, nil) or (_, _, %s)", entry.Addr, entry.LocalAddr, e, err, tcpip.ErrWouldBlock) + } + }(entry) + } + + // Wait for all gorountines to send a request + wg.Wait() + + // Process all the requests for a single entry concurrently + clock.advance(typicalLatency) + } + + // All goroutines add in the same order and add more values than can fit in + // the cache. Our eviction strategy requires that the last entries are + // present, up to the size of the neighbor cache, and the rest are missing. + // The order of entries reported by entries() is undeterministic, so entries + // have to be sorted before comparison. + var wantUnsortedEntries []NeighborEntry + for i := store.size() - neighborCacheSize; i < store.size(); i++ { + entry, ok := store.entry(i) + if !ok { + t.Errorf("store.entry(%d) not found", i) + } + wantEntry := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + wantUnsortedEntries = append(wantUnsortedEntries, wantEntry) + } + + if diff := cmp.Diff(neigh.entries(), wantUnsortedEntries, entryDiffOptsWithSort()...); diff != "" { + t.Errorf("neighbor entries mismatch (-got, +want):\n%s", diff) + } +} + +func TestNeighborCacheReplace(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + // Add an entry + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + + // Verify the entry exists + e, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil { + t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + if doneCh != nil { + t.Errorf("unexpected done channel from neigh.entry(%s, %s, _, nil): %v", entry.Addr, entry.LocalAddr, doneCh) + } + if t.Failed() { + t.FailNow() + } + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: 
entry.LinkAddr, + State: Reachable, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LinkAddr, diff) + } + + // Notify of a link address change + var updatedLinkAddr tcpip.LinkAddress + { + entry, ok := store.entry(1) + if !ok { + t.Fatalf("store.entry(1) not found") + } + updatedLinkAddr = entry.LinkAddr + } + store.set(0, updatedLinkAddr) + neigh.HandleConfirmation(entry.Addr, updatedLinkAddr, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + + // Requesting the entry again should start address resolution + { + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(config.DelayFirstProbeTime + typicalLatency) + select { + case <-doneCh: + default: + t.Fatalf("expected notification from done channel returned by neigh.entry(%s, %s, _, nil)", entry.Addr, entry.LocalAddr) + } + } + + // Verify the entry's new link address + { + e, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + clock.advance(typicalLatency) + if err != nil { + t.Errorf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + want = NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: updatedLinkAddr, + State: Reachable, + } + if diff := cmp.Diff(e, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LocalAddr, diff) + } + } +} + +func TestNeighborCacheResolutionFailed(t *testing.T) { + config := DefaultNUDConfigurations() + + nudDisp := testNUDDispatcher{} + clock := newFakeClock() + neigh := newTestNeighborCache(&nudDisp, config, clock) + store := newTestEntryStore() + + var requestCount uint32 + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + onLinkAddressRequest: func() { + atomic.AddUint32(&requestCount, 1) + }, + } + + // First, sanity check that resolution is working + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + clock.advance(typicalLatency) + got, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != nil { + t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", entry.Addr, entry.LocalAddr, err) + } + want := NeighborEntry{ + Addr: entry.Addr, + LocalAddr: entry.LocalAddr, + LinkAddr: entry.LinkAddr, + State: Reachable, + } + if diff := cmp.Diff(got, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", entry.Addr, entry.LocalAddr, diff) + } + + // Verify that address resolution for an unknown address returns ErrNoLinkAddress + before := atomic.LoadUint32(&requestCount) + + entry.Addr += "2" + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + waitFor := config.DelayFirstProbeTime + typicalLatency*time.Duration(config.MaxMulticastProbes) + 
clock.advance(waitFor) + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress) + } + + maxAttempts := neigh.config().MaxUnicastProbes + if got, want := atomic.LoadUint32(&requestCount)-before, maxAttempts; got != want { + t.Errorf("got link address request count = %d, want = %d", got, want) + } +} + +// TestNeighborCacheResolutionTimeout simulates sending MaxMulticastProbes +// probes and not retrieving a confirmation before the duration defined by +// MaxMulticastProbes * RetransmitTimer. +func TestNeighborCacheResolutionTimeout(t *testing.T) { + config := DefaultNUDConfigurations() + config.RetransmitTimer = time.Millisecond // small enough to cause timeout + + clock := newFakeClock() + neigh := newTestNeighborCache(nil, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: time.Minute, // large enough to cause timeout + } + + entry, ok := store.entry(0) + if !ok { + t.Fatalf("store.entry(0) not found") + } + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrWouldBlock { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + waitFor := config.RetransmitTimer * time.Duration(config.MaxMulticastProbes) + clock.advance(waitFor) + if _, _, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil); err != tcpip.ErrNoLinkAddress { + t.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrNoLinkAddress) + } +} + +// TestNeighborCacheStaticResolution checks that static link addresses are +// resolved immediately and don't send resolution requests. +func TestNeighborCacheStaticResolution(t *testing.T) { + config := DefaultNUDConfigurations() + clock := newFakeClock() + neigh := newTestNeighborCache(nil, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: typicalLatency, + } + + got, _, err := neigh.entry(testEntryBroadcastAddr, testEntryLocalAddr, linkRes, nil) + if err != nil { + t.Fatalf("unexpected error from neigh.entry(%s, %s, _, nil): %s", testEntryBroadcastAddr, testEntryLocalAddr, err) + } + want := NeighborEntry{ + Addr: testEntryBroadcastAddr, + LocalAddr: testEntryLocalAddr, + LinkAddr: testEntryBroadcastLinkAddr, + State: Static, + } + if diff := cmp.Diff(got, want, entryDiffOpts()...); diff != "" { + t.Errorf("neigh.entry(%s, %s, _, nil) mismatch (-got, +want):\n%s", testEntryBroadcastAddr, testEntryLocalAddr, diff) + } +} + +func BenchmarkCacheClear(b *testing.B) { + b.StopTimer() + config := DefaultNUDConfigurations() + clock := &tcpip.StdClock{} + neigh := newTestNeighborCache(nil, config, clock) + store := newTestEntryStore() + linkRes := &testNeighborResolver{ + clock: clock, + neigh: neigh, + entries: store, + delay: 0, + } + + // Clear for every possible size of the cache + for cacheSize := 0; cacheSize < neighborCacheSize; cacheSize++ { + // Fill the neighbor cache to capacity. 
+ for i := 0; i < cacheSize; i++ { + entry, ok := store.entry(i) + if !ok { + b.Fatalf("store.entry(%d) not found", i) + } + _, doneCh, err := neigh.entry(entry.Addr, entry.LocalAddr, linkRes, nil) + if err != tcpip.ErrWouldBlock { + b.Fatalf("got neigh.entry(%s, %s, _, nil) = %v, want = %s", entry.Addr, entry.LocalAddr, err, tcpip.ErrWouldBlock) + } + if doneCh != nil { + <-doneCh + } + } + + b.StartTimer() + neigh.clear() + b.StopTimer() + } +} diff --git a/pkg/tcpip/stack/neighbor_entry.go b/pkg/tcpip/stack/neighbor_entry.go new file mode 100644 index 000000000..0068cacb8 --- /dev/null +++ b/pkg/tcpip/stack/neighbor_entry.go @@ -0,0 +1,482 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "fmt" + "sync" + "time" + + "gvisor.dev/gvisor/pkg/sleep" + "gvisor.dev/gvisor/pkg/tcpip" +) + +// NeighborEntry describes a neighboring device in the local network. +type NeighborEntry struct { + Addr tcpip.Address + LocalAddr tcpip.Address + LinkAddr tcpip.LinkAddress + State NeighborState + UpdatedAt time.Time +} + +// NeighborState defines the state of a NeighborEntry within the Neighbor +// Unreachability Detection state machine, as per RFC 4861 section 7.3.2. +type NeighborState uint8 + +const ( + // Unknown means reachability has not been verified yet. This is the initial + // state of entries that have been created automatically by the Neighbor + // Unreachability Detection state machine. + Unknown NeighborState = iota + // Incomplete means that there is an outstanding request to resolve the + // address. + Incomplete + // Reachable means the path to the neighbor is functioning properly for both + // receive and transmit paths. + Reachable + // Stale means reachability to the neighbor is unknown, but packets are still + // able to be transmitted to the possibly stale link address. + Stale + // Delay means reachability to the neighbor is unknown and pending + // confirmation from an upper-level protocol like TCP, but packets are still + // able to be transmitted to the possibly stale link address. + Delay + // Probe means a reachability confirmation is actively being sought by + // periodically retransmitting reachability probes until a reachability + // confirmation is received, or until the max amount of probes has been sent. + Probe + // Static describes entries that have been explicitly added by the user. They + // do not expire and are not deleted until explicitly removed. + Static + // Failed means traffic should not be sent to this neighbor since attempts of + // reachability have returned inconclusive. + Failed +) + +// neighborEntry implements a neighbor entry's individual node behavior, as per +// RFC 4861 section 7.3.3. Neighbor Unreachability Detection operates in +// parallel with the sending of packets to a neighbor, necessitating the +// entry's lock to be acquired for all operations. 
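The NeighborState values above are the states of the RFC 4861 Neighbor Unreachability Detection machine. As a reading aid for the entry logic that follows (a summary only; setStateLocked and the handle*Locked methods below are authoritative), the transitions implemented in this file are roughly:

    Unknown                    --packet queued-->            Incomplete (multicast probes start)
    Unknown/Incomplete/Failed  --probe received-->           Stale
    Incomplete                 --solicited confirmation-->   Reachable
    Incomplete                 --unsolicited confirmation--> Stale
    Reachable                  --reachable timer-->          Stale
    Stale                      --packet queued-->            Delay
    Delay                      --delay timer-->              Probe (unicast probes start)
    Probe                      --solicited confirmation-->   Reachable
    Incomplete/Probe           --max probes exceeded-->      Failed (removed after UnreachableTime)
    Static                     --changed only by explicit add or remove
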
+type neighborEntry struct { + neighborEntryEntry + + nic *NIC + protocol tcpip.NetworkProtocolNumber + + // linkRes provides the functionality to send reachability probes, used in + // Neighbor Unreachability Detection. + linkRes LinkAddressResolver + + // nudState points to the Neighbor Unreachability Detection configuration. + nudState *NUDState + + // mu protects the fields below. + mu sync.RWMutex + + neigh NeighborEntry + + // wakers is a set of waiters for address resolution result. Anytime state + // transitions out of incomplete these waiters are notified. It is nil iff + // address resolution is ongoing and no clients are waiting for the result. + wakers map[*sleep.Waker]struct{} + + // done is used to allow callers to wait on address resolution. It is nil + // iff nudState is not Reachable and address resolution is not yet in + // progress. + done chan struct{} + + isRouter bool + job *tcpip.Job +} + +// newNeighborEntry creates a neighbor cache entry starting at the default +// state, Unknown. Transition out of Unknown by calling either +// `handlePacketQueuedLocked` or `handleProbeLocked` on the newly created +// neighborEntry. +func newNeighborEntry(nic *NIC, remoteAddr tcpip.Address, localAddr tcpip.Address, nudState *NUDState, linkRes LinkAddressResolver) *neighborEntry { + return &neighborEntry{ + nic: nic, + linkRes: linkRes, + nudState: nudState, + neigh: NeighborEntry{ + Addr: remoteAddr, + LocalAddr: localAddr, + State: Unknown, + }, + } +} + +// newStaticNeighborEntry creates a neighbor cache entry starting at the Static +// state. The entry can only transition out of Static by directly calling +// `setStateLocked`. +func newStaticNeighborEntry(nic *NIC, addr tcpip.Address, linkAddr tcpip.LinkAddress, state *NUDState) *neighborEntry { + if nic.stack.nudDisp != nil { + nic.stack.nudDisp.OnNeighborAdded(nic.id, addr, linkAddr, Static, time.Now()) + } + return &neighborEntry{ + nic: nic, + nudState: state, + neigh: NeighborEntry{ + Addr: addr, + LinkAddr: linkAddr, + State: Static, + UpdatedAt: time.Now(), + }, + } +} + +// addWaker adds w to the list of wakers waiting for address resolution. +// Assumes the entry has already been appropriately locked. +func (e *neighborEntry) addWakerLocked(w *sleep.Waker) { + if w == nil { + return + } + if e.wakers == nil { + e.wakers = make(map[*sleep.Waker]struct{}) + } + e.wakers[w] = struct{}{} +} + +// notifyWakersLocked notifies those waiting for address resolution, whether it +// succeeded or failed. Assumes the entry has already been appropriately locked. +func (e *neighborEntry) notifyWakersLocked() { + for w := range e.wakers { + w.Assert() + } + e.wakers = nil + if ch := e.done; ch != nil { + close(ch) + e.done = nil + } +} + +// dispatchAddEventLocked signals to stack's NUD Dispatcher that the entry has +// been added. +func (e *neighborEntry) dispatchAddEventLocked(nextState NeighborState) { + if nudDisp := e.nic.stack.nudDisp; nudDisp != nil { + nudDisp.OnNeighborAdded(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, nextState, time.Now()) + } +} + +// dispatchChangeEventLocked signals to stack's NUD Dispatcher that the entry +// has changed state or link-layer address. +func (e *neighborEntry) dispatchChangeEventLocked(nextState NeighborState) { + if nudDisp := e.nic.stack.nudDisp; nudDisp != nil { + nudDisp.OnNeighborChanged(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, nextState, time.Now()) + } +} + +// dispatchRemoveEventLocked signals to stack's NUD Dispatcher that the entry +// has been removed. 
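The dispatch*EventLocked helpers forward entry lifecycle events to the stack's NUD dispatcher. A minimal consumer sketch (hypothetical; the NUDDispatcher interface is defined elsewhere in this change, the method signatures are only inferred from the call sites in this file, and the sketch assumes package stack context with a log import):

    // loggingNUDDispatcher is a hypothetical event consumer, shown only to
    // illustrate what the dispatch*EventLocked helpers deliver.
    type loggingNUDDispatcher struct{}

    func (loggingNUDDispatcher) OnNeighborAdded(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
        log.Printf("NIC %d: neighbor %v added in state %v", nicID, addr, state)
    }

    func (loggingNUDDispatcher) OnNeighborChanged(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
        log.Printf("NIC %d: neighbor %v is now %v at %v", nicID, addr, state, linkAddr)
    }

    func (loggingNUDDispatcher) OnNeighborRemoved(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
        log.Printf("NIC %d: neighbor %v removed", nicID, addr)
    }
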
+func (e *neighborEntry) dispatchRemoveEventLocked() { + if nudDisp := e.nic.stack.nudDisp; nudDisp != nil { + nudDisp.OnNeighborRemoved(e.nic.id, e.neigh.Addr, e.neigh.LinkAddr, e.neigh.State, time.Now()) + } +} + +// setStateLocked transitions the entry to the specified state immediately. +// +// Follows the logic defined in RFC 4861 section 7.3.3. +// +// e.mu MUST be locked. +func (e *neighborEntry) setStateLocked(next NeighborState) { + // Cancel the previously scheduled action, if there is one. Entries in + // Unknown, Stale, or Static state do not have scheduled actions. + if timer := e.job; timer != nil { + timer.Cancel() + } + + prev := e.neigh.State + e.neigh.State = next + e.neigh.UpdatedAt = time.Now() + config := e.nudState.Config() + + switch next { + case Incomplete: + var retryCounter uint32 + var sendMulticastProbe func() + + sendMulticastProbe = func() { + if retryCounter == config.MaxMulticastProbes { + // "If no Neighbor Advertisement is received after + // MAX_MULTICAST_SOLICIT solicitations, address resolution has failed. + // The sender MUST return ICMP destination unreachable indications with + // code 3 (Address Unreachable) for each packet queued awaiting address + // resolution." - RFC 4861 section 7.2.2 + // + // There is no need to send an ICMP destination unreachable indication + // since the failure to resolve the address is expected to only occur + // on this node. Thus, redirecting traffic is currently not supported. + // + // "If the error occurs on a node other than the node originating the + // packet, an ICMP error message is generated. If the error occurs on + // the originating node, an implementation is not required to actually + // create and send an ICMP error packet to the source, as long as the + // upper-layer sender is notified through an appropriate mechanism + // (e.g. return value from a procedure call). Note, however, that an + // implementation may find it convenient in some cases to return errors + // to the sender by taking the offending packet, generating an ICMP + // error message, and then delivering it (locally) through the generic + // error-handling routines.' - RFC 4861 section 2.1 + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, "", e.nic.linkEP); err != nil { + // There is no need to log the error here; the NUD implementation may + // assume a working link. A valid link should be the responsibility of + // the NIC/stack.LinkEndpoint. 
+ e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + retryCounter++ + e.job = e.nic.stack.newJob(&e.mu, sendMulticastProbe) + e.job.Schedule(config.RetransmitTimer) + } + + sendMulticastProbe() + + case Reachable: + e.job = e.nic.stack.newJob(&e.mu, func() { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + }) + e.job.Schedule(e.nudState.ReachableTime()) + + case Delay: + e.job = e.nic.stack.newJob(&e.mu, func() { + e.dispatchChangeEventLocked(Probe) + e.setStateLocked(Probe) + }) + e.job.Schedule(config.DelayFirstProbeTime) + + case Probe: + var retryCounter uint32 + var sendUnicastProbe func() + + sendUnicastProbe = func() { + if retryCounter == config.MaxUnicastProbes { + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + if err := e.linkRes.LinkAddressRequest(e.neigh.Addr, e.neigh.LocalAddr, e.neigh.LinkAddr, e.nic.linkEP); err != nil { + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + retryCounter++ + if retryCounter == config.MaxUnicastProbes { + e.dispatchRemoveEventLocked() + e.setStateLocked(Failed) + return + } + + e.job = e.nic.stack.newJob(&e.mu, sendUnicastProbe) + e.job.Schedule(config.RetransmitTimer) + } + + sendUnicastProbe() + + case Failed: + e.notifyWakersLocked() + e.job = e.nic.stack.newJob(&e.mu, func() { + e.nic.neigh.removeEntryLocked(e) + }) + e.job.Schedule(config.UnreachableTime) + + case Unknown, Stale, Static: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid state transition from %q to %q", prev, next)) + } +} + +// handlePacketQueuedLocked advances the state machine according to a packet +// being queued for outgoing transmission. +// +// Follows the logic defined in RFC 4861 section 7.3.3. +func (e *neighborEntry) handlePacketQueuedLocked() { + switch e.neigh.State { + case Unknown: + e.dispatchAddEventLocked(Incomplete) + e.setStateLocked(Incomplete) + + case Stale: + e.dispatchChangeEventLocked(Delay) + e.setStateLocked(Delay) + + case Incomplete, Reachable, Delay, Probe, Static, Failed: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) + } +} + +// handleProbeLocked processes an incoming neighbor probe (e.g. ARP request or +// Neighbor Solicitation for ARP or NDP, respectively). +// +// Follows the logic defined in RFC 4861 section 7.2.3. +func (e *neighborEntry) handleProbeLocked(remoteLinkAddr tcpip.LinkAddress) { + // Probes MUST be silently discarded if the target address is tentative, does + // not exist, or not bound to the NIC as per RFC 4861 section 7.2.3. These + // checks MUST be done by the NetworkEndpoint. + + switch e.neigh.State { + case Unknown, Incomplete, Failed: + e.neigh.LinkAddr = remoteLinkAddr + e.dispatchAddEventLocked(Stale) + e.setStateLocked(Stale) + e.notifyWakersLocked() + + case Reachable, Delay, Probe: + if e.neigh.LinkAddr != remoteLinkAddr { + e.neigh.LinkAddr = remoteLinkAddr + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } + + case Stale: + if e.neigh.LinkAddr != remoteLinkAddr { + e.neigh.LinkAddr = remoteLinkAddr + e.dispatchChangeEventLocked(Stale) + } + + case Static: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) + } +} + +// handleConfirmationLocked processes an incoming neighbor confirmation +// (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, respectively). +// +// Follows the state machine defined by RFC 4861 section 7.2.5. 
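+//
+// In short: the Solicited flag gates whether an entry may move to Reachable,
+// the Override flag gates whether a confirmation carrying a different
+// link-layer address may replace the cached one, and the IsRouter flag keeps
+// the entry's router state (and the Default Router List) in sync.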
+// +// TODO(gvisor.dev/issue/2277): To protect against ARP poisoning and other +// attacks against NDP functions, Secure Neighbor Discovery (SEND) Protocol +// should be deployed where preventing access to the broadcast segment might +// not be possible. SEND uses RSA key pairs to produce Cryptographically +// Generated Addresses (CGA), as defined in RFC 3972. This ensures that the +// claimed source of an NDP message is the owner of the claimed address. +func (e *neighborEntry) handleConfirmationLocked(linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) { + switch e.neigh.State { + case Incomplete: + if len(linkAddr) == 0 { + // "If the link layer has addresses and no Target Link-Layer Address + // option is included, the receiving node SHOULD silently discard the + // received advertisement." - RFC 4861 section 7.2.5 + break + } + + e.neigh.LinkAddr = linkAddr + if flags.Solicited { + e.dispatchChangeEventLocked(Reachable) + e.setStateLocked(Reachable) + } else { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } + e.isRouter = flags.IsRouter + e.notifyWakersLocked() + + // "Note that the Override flag is ignored if the entry is in the + // INCOMPLETE state." - RFC 4861 section 7.2.5 + + case Reachable, Stale, Delay, Probe: + sameLinkAddr := e.neigh.LinkAddr == linkAddr + + if !sameLinkAddr { + if !flags.Override { + if e.neigh.State == Reachable { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } + break + } + + e.neigh.LinkAddr = linkAddr + + if !flags.Solicited { + if e.neigh.State != Stale { + e.dispatchChangeEventLocked(Stale) + e.setStateLocked(Stale) + } else { + // Notify the LinkAddr change, even though NUD state hasn't changed. + e.dispatchChangeEventLocked(e.neigh.State) + } + break + } + } + + if flags.Solicited && (flags.Override || sameLinkAddr) { + if e.neigh.State != Reachable { + e.dispatchChangeEventLocked(Reachable) + } + // Set state to Reachable again to refresh timers. + e.setStateLocked(Reachable) + e.notifyWakersLocked() + } + + if e.isRouter && !flags.IsRouter { + // "In those cases where the IsRouter flag changes from TRUE to FALSE as + // a result of this update, the node MUST remove that router from the + // Default Router List and update the Destination Cache entries for all + // destinations using that neighbor as a router as specified in Section + // 7.3.3. This is needed to detect when a node that is used as a router + // stops forwarding packets due to being configured as a host." + // - RFC 4861 section 7.2.5 + e.nic.mu.Lock() + e.nic.mu.ndp.invalidateDefaultRouter(e.neigh.Addr) + e.nic.mu.Unlock() + } + e.isRouter = flags.IsRouter + + case Unknown, Failed, Static: + // Do nothing + + default: + panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State)) + } +} + +// handleUpperLevelConfirmationLocked processes an incoming upper-level protocol +// (e.g. TCP acknowledgements) reachability confirmation. +func (e *neighborEntry) handleUpperLevelConfirmationLocked() { + switch e.neigh.State { + case Reachable, Stale, Delay, Probe: + if e.neigh.State != Reachable { + e.dispatchChangeEventLocked(Reachable) + // Set state to Reachable again to refresh timers. 
+ }
+ e.setStateLocked(Reachable)
+
+ case Unknown, Incomplete, Failed, Static:
+ // Do nothing
+
+ default:
+ panic(fmt.Sprintf("Invalid cache entry state: %s", e.neigh.State))
+ }
+}
diff --git a/pkg/tcpip/stack/neighbor_entry_test.go b/pkg/tcpip/stack/neighbor_entry_test.go
new file mode 100644
index 000000000..b769fb2fa
--- /dev/null
+++ b/pkg/tcpip/stack/neighbor_entry_test.go
@@ -0,0 +1,2870 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+ "fmt"
+ "math"
+ "math/rand"
+ "strings"
+ "sync"
+ "testing"
+ "time"
+
+ "github.com/google/go-cmp/cmp"
+ "github.com/google/go-cmp/cmp/cmpopts"
+ "gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+ entryTestNetNumber tcpip.NetworkProtocolNumber = math.MaxUint32
+
+ entryTestNICID tcpip.NICID = 1
+ entryTestAddr1 = tcpip.Address("\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+ entryTestAddr2 = tcpip.Address("\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+
+ entryTestLinkAddr1 = tcpip.LinkAddress("\x0a\x00\x00\x00\x00\x01")
+ entryTestLinkAddr2 = tcpip.LinkAddress("\x0a\x00\x00\x00\x00\x02")
+
+ // entryTestNetDefaultMTU is the MTU, in bytes, used throughout the tests,
+ // except where another value is explicitly used. It is chosen to match the
+ // MTU of loopback interfaces on Linux systems.
+ entryTestNetDefaultMTU = 65536
+)
+
+// eventDiffOpts are the options passed to cmp.Diff to compare entry events.
+// The UpdatedAt field is ignored due to a lack of a deterministic method to
+// predict the time that an event will be dispatched.
+func eventDiffOpts() []cmp.Option {
+ return []cmp.Option{
+ cmpopts.IgnoreFields(testEntryEventInfo{}, "UpdatedAt"),
+ }
+}
+
+// eventDiffOptsWithSort is like eventDiffOpts but also includes an option to
+// sort slices of events for cases where ordering must be ignored.
+func eventDiffOptsWithSort() []cmp.Option {
+ return []cmp.Option{
+ cmpopts.IgnoreFields(testEntryEventInfo{}, "UpdatedAt"),
+ cmpopts.SortSlices(func(a, b testEntryEventInfo) bool {
+ return strings.Compare(string(a.Addr), string(b.Addr)) < 0
+ }),
+ }
+}
+
+// The following unit tests exercise every state transition and verify its
+// behavior against RFC 4861.
+// +// | From | To | Cause | Action | Event | +// | ========== | ========== | ========================================== | =============== | ======= | +// | Unknown | Unknown | Confirmation w/ unknown address | | Added | +// | Unknown | Incomplete | Packet queued to unknown address | Send probe | Added | +// | Unknown | Stale | Probe w/ unknown address | | Added | +// | Incomplete | Incomplete | Retransmit timer expired | Send probe | Changed | +// | Incomplete | Reachable | Solicited confirmation | Notify wakers | Changed | +// | Incomplete | Stale | Unsolicited confirmation | Notify wakers | Changed | +// | Incomplete | Failed | Max probes sent without reply | Notify wakers | Removed | +// | Reachable | Reachable | Confirmation w/ different isRouter flag | Update IsRouter | | +// | Reachable | Stale | Reachable timer expired | | Changed | +// | Reachable | Stale | Probe or confirmation w/ different address | | Changed | +// | Stale | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Stale | Stale | Override confirmation | Update LinkAddr | Changed | +// | Stale | Stale | Probe w/ different address | Update LinkAddr | Changed | +// | Stale | Delay | Packet sent | | Changed | +// | Delay | Reachable | Upper-layer confirmation | | Changed | +// | Delay | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Delay | Stale | Probe or confirmation w/ different address | | Changed | +// | Delay | Probe | Delay timer expired | Send probe | Changed | +// | Probe | Reachable | Solicited override confirmation | Update LinkAddr | Changed | +// | Probe | Reachable | Solicited confirmation w/ same address | Notify wakers | Changed | +// | Probe | Stale | Probe or confirmation w/ different address | | Changed | +// | Probe | Probe | Retransmit timer expired | Send probe | Changed | +// | Probe | Failed | Max probes sent without reply | Notify wakers | Removed | +// | Failed | | Unreachability timer expired | Delete entry | | + +type testEntryEventType uint8 + +const ( + entryTestAdded testEntryEventType = iota + entryTestChanged + entryTestRemoved +) + +func (t testEntryEventType) String() string { + switch t { + case entryTestAdded: + return "add" + case entryTestChanged: + return "change" + case entryTestRemoved: + return "remove" + default: + return fmt.Sprintf("unknown (%d)", t) + } +} + +// Fields are exported for use with cmp.Diff. +type testEntryEventInfo struct { + EventType testEntryEventType + NICID tcpip.NICID + Addr tcpip.Address + LinkAddr tcpip.LinkAddress + State NeighborState + UpdatedAt time.Time +} + +func (e testEntryEventInfo) String() string { + return fmt.Sprintf("%s event for NIC #%d, addr=%q, linkAddr=%q, state=%q", e.EventType, e.NICID, e.Addr, e.LinkAddr, e.State) +} + +// testNUDDispatcher implements NUDDispatcher to validate the dispatching of +// events upon certain NUD state machine events. 
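The testNUDDispatcher described above records dispatched events for the assertions in these tests, and the transition table is what those assertions follow. Outside of tests, any type implementing the same three callbacks can observe neighbor-table changes; the sketch below is a hypothetical logging observer built only from the OnNeighborAdded/OnNeighborChanged/OnNeighborRemoved signatures that appear in this file. It assumes the standard library log package, and wiring such a dispatcher into a Stack is out of scope here.

    // loggingNUDDispatcher is an illustrative observer, not part of this change.
    type loggingNUDDispatcher struct{}

    var _ NUDDispatcher = (*loggingNUDDispatcher)(nil)

    func (*loggingNUDDispatcher) OnNeighborAdded(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
        log.Printf("NIC %d: neighbor %s added in state %s (link address %q)", nicID, addr, state, linkAddr)
    }

    func (*loggingNUDDispatcher) OnNeighborChanged(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
        log.Printf("NIC %d: neighbor %s changed to state %s (link address %q)", nicID, addr, state, linkAddr)
    }

    func (*loggingNUDDispatcher) OnNeighborRemoved(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) {
        log.Printf("NIC %d: neighbor %s removed (last state %s)", nicID, addr, state)
    }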
+type testNUDDispatcher struct { + mu sync.Mutex + events []testEntryEventInfo +} + +var _ NUDDispatcher = (*testNUDDispatcher)(nil) + +func (d *testNUDDispatcher) queueEvent(e testEntryEventInfo) { + d.mu.Lock() + defer d.mu.Unlock() + d.events = append(d.events, e) +} + +func (d *testNUDDispatcher) OnNeighborAdded(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) { + d.queueEvent(testEntryEventInfo{ + EventType: entryTestAdded, + NICID: nicID, + Addr: addr, + LinkAddr: linkAddr, + State: state, + UpdatedAt: updatedAt, + }) +} + +func (d *testNUDDispatcher) OnNeighborChanged(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) { + d.queueEvent(testEntryEventInfo{ + EventType: entryTestChanged, + NICID: nicID, + Addr: addr, + LinkAddr: linkAddr, + State: state, + UpdatedAt: updatedAt, + }) +} + +func (d *testNUDDispatcher) OnNeighborRemoved(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) { + d.queueEvent(testEntryEventInfo{ + EventType: entryTestRemoved, + NICID: nicID, + Addr: addr, + LinkAddr: linkAddr, + State: state, + UpdatedAt: updatedAt, + }) +} + +type entryTestLinkResolver struct { + mu sync.Mutex + probes []entryTestProbeInfo +} + +var _ LinkAddressResolver = (*entryTestLinkResolver)(nil) + +type entryTestProbeInfo struct { + RemoteAddress tcpip.Address + RemoteLinkAddress tcpip.LinkAddress + LocalAddress tcpip.Address +} + +func (p entryTestProbeInfo) String() string { + return fmt.Sprintf("probe with RemoteAddress=%q, RemoteLinkAddress=%q, LocalAddress=%q", p.RemoteAddress, p.RemoteLinkAddress, p.LocalAddress) +} + +// LinkAddressRequest sends a request for the LinkAddress of addr. Broadcasts +// to the local network if linkAddr is the zero value. +func (r *entryTestLinkResolver) LinkAddressRequest(addr, localAddr tcpip.Address, linkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error { + p := entryTestProbeInfo{ + RemoteAddress: addr, + RemoteLinkAddress: linkAddr, + LocalAddress: localAddr, + } + r.mu.Lock() + defer r.mu.Unlock() + r.probes = append(r.probes, p) + return nil +} + +// ResolveStaticAddress attempts to resolve address without sending requests. +// It either resolves the name immediately or returns the empty LinkAddress. +func (r *entryTestLinkResolver) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + return "", false +} + +// LinkAddressProtocol returns the network protocol of the addresses this +// resolver can resolve. +func (r *entryTestLinkResolver) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return entryTestNetNumber +} + +func entryTestSetup(c NUDConfigurations) (*neighborEntry, *testNUDDispatcher, *entryTestLinkResolver, *fakeClock) { + clock := newFakeClock() + disp := testNUDDispatcher{} + nic := NIC{ + id: entryTestNICID, + linkEP: nil, // entryTestLinkResolver doesn't use a LinkEndpoint + stack: &Stack{ + clock: clock, + nudDisp: &disp, + }, + } + + rng := rand.New(rand.NewSource(time.Now().UnixNano())) + nudState := NewNUDState(c, rng) + linkRes := entryTestLinkResolver{} + entry := newNeighborEntry(&nic, entryTestAddr1 /* remoteAddr */, entryTestAddr2 /* localAddr */, nudState, &linkRes) + + // Stub out ndpState to verify modification of default routers. + nic.mu.ndp = ndpState{ + nic: &nic, + defaultRouters: make(map[tcpip.Address]defaultRouterState), + } + + // Stub out the neighbor cache to verify deletion from the cache. 
+ nic.neigh = &neighborCache{ + nic: &nic, + state: nudState, + cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), + } + nic.neigh.cache[entryTestAddr1] = entry + + return entry, &disp, &linkRes, clock +} + +// TestEntryInitiallyUnknown verifies that the state of a newly created +// neighborEntry is Unknown. +func TestEntryInitiallyUnknown(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + if got, want := e.neigh.State, Unknown; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.RetransmitTimer) + + // No probes should have been sent. + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, []entryTestProbeInfo(nil)) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + // No events should have been dispatched. + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryUnknownToUnknownWhenConfirmationWithUnknownAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Unknown; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(time.Hour) + + // No probes should have been sent. + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, []entryTestProbeInfo(nil)) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + // No events should have been dispatched. + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, []testEntryEventInfo(nil)); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryUnknownToIncomplete(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + } + { + nudDisp.mu.Lock() + diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...) + nudDisp.mu.Unlock() + if diff != "" { + t.Fatalf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + } +} + +func TestEntryUnknownToStale(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handleProbeLocked(entryTestLinkAddr1) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + // No probes should have been sent. 
+ linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, []entryTestProbeInfo(nil)) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToIncompleteDoesNotChangeUpdatedAt(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + updatedAt := e.neigh.UpdatedAt + e.mu.Unlock() + + clock.advance(c.RetransmitTimer) + + // UpdatedAt should remain the same during address resolution. + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.probes = nil + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.UpdatedAt, updatedAt; got != want { + t.Errorf("got e.neigh.UpdatedAt = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.RetransmitTimer) + + // UpdatedAt should change after failing address resolution. Timing out after + // sending the last probe transitions the entry to Failed. 
+ { + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + } + + clock.advance(c.RetransmitTimer) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, notWant := e.neigh.UpdatedAt, updatedAt; got == notWant { + t.Errorf("expected e.neigh.UpdatedAt to change, got = %q", got) + } + e.mu.Unlock() +} + +func TestEntryIncompleteToReachable(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +// TestEntryAddsAndClearsWakers verifies that wakers are added when +// addWakerLocked is called and cleared when address resolution finishes. In +// this case, address resolution will finish when transitioning from Incomplete +// to Reachable. 
+func TestEntryAddsAndClearsWakers(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + w := sleep.Waker{} + s := sleep.Sleeper{} + s.AddWaker(&w, 123) + defer s.Done() + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got := e.wakers; got != nil { + t.Errorf("got e.wakers = %v, want = nil", got) + } + e.addWakerLocked(&w) + if got, want := w.IsAsserted(), false; got != want { + t.Errorf("waker.IsAsserted() = %t, want = %t", got, want) + } + if e.wakers == nil { + t.Error("expected e.wakers to be non-nil") + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if e.wakers != nil { + t.Errorf("got e.wakers = %v, want = nil", e.wakers) + } + if got, want := w.IsAsserted(), true; got != want { + t.Errorf("waker.IsAsserted() = %t, want = %t", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToReachableWithRouterFlag(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: true, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.isRouter, true; got != want { + t.Errorf("got e.isRouter = %t, want = %t", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + if diff := cmp.Diff(linkRes.probes, wantProbes); diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + linkRes.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToStale(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + 
e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryIncompleteToFailed(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Incomplete; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + waitFor := c.RetransmitTimer * time.Duration(c.MaxMulticastProbes) + clock.advance(waitFor) + + wantProbes := []entryTestProbeInfo{ + // The Incomplete-to-Incomplete state transition is tested here by + // verifying that 3 reachability probes were sent. 
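+ // With the RFC 4861 default RETRANS_TIMER of one second, that works out
+ // to three probes roughly a second apart before the entry is marked
+ // Failed.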
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Failed; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +type testLocker struct{} + +var _ sync.Locker = (*testLocker)(nil) + +func (*testLocker) Lock() {} +func (*testLocker) Unlock() {} + +func TestEntryStaysReachableWhenConfirmationWithRouterFlag(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: true, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.isRouter, true; got != want { + t.Errorf("got e.isRouter = %t, want = %t", got, want) + } + e.nic.mu.ndp.defaultRouters[entryTestAddr1] = defaultRouterState{ + invalidationJob: e.nic.stack.newJob(&testLocker{}, func() {}), + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.isRouter, false; got != want { + t.Errorf("got e.isRouter = %t, want = %t", got, want) + } + if _, ok := e.nic.mu.ndp.defaultRouters[entryTestAddr1]; ok { + t.Errorf("unexpected defaultRouter for %s", entryTestAddr1) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func 
TestEntryStaysReachableWhenProbeWithSameAddress(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ e, nudDisp, linkRes, _ := entryTestSetup(c)
+
+ e.mu.Lock()
+ e.handlePacketQueuedLocked()
+ e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{
+ Solicited: true,
+ Override: false,
+ IsRouter: false,
+ })
+ if got, want := e.neigh.State, Reachable; got != want {
+ t.Errorf("got e.neigh.State = %q, want = %q", got, want)
+ }
+ e.handleProbeLocked(entryTestLinkAddr1)
+ if got, want := e.neigh.State, Reachable; got != want {
+ t.Errorf("got e.neigh.State = %q, want = %q", got, want)
+ }
+ if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want {
+ t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want)
+ }
+ e.mu.Unlock()
+
+ wantProbes := []entryTestProbeInfo{
+ {
+ RemoteAddress: entryTestAddr1,
+ RemoteLinkAddress: tcpip.LinkAddress(""),
+ LocalAddress: entryTestAddr2,
+ },
+ }
+ linkRes.mu.Lock()
+ diff := cmp.Diff(linkRes.probes, wantProbes)
+ linkRes.mu.Unlock()
+ if diff != "" {
+ t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff)
+ }
+
+ wantEvents := []testEntryEventInfo{
+ {
+ EventType: entryTestAdded,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: tcpip.LinkAddress(""),
+ State: Incomplete,
+ },
+ {
+ EventType: entryTestChanged,
+ NICID: entryTestNICID,
+ Addr: entryTestAddr1,
+ LinkAddr: entryTestLinkAddr1,
+ State: Reachable,
+ },
+ }
+ nudDisp.mu.Lock()
+ if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" {
+ t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff)
+ }
+ nudDisp.mu.Unlock()
+}
+
+func TestEntryReachableToStaleWhenTimeout(t *testing.T) {
+ c := DefaultNUDConfigurations()
+ // Eliminate random factors from ReachableTime computation so the transition
+ // from Reachable to Stale will only take BaseReachableTime duration.
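+ // ReachableTime is derived from BaseReachableTime multiplied by a random
+ // factor drawn from [MinRandomFactor, MaxRandomFactor] (RFC 4861), so
+ // pinning both factors to 1 makes the timeout deterministic.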
+ c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryReachableToStaleWhenProbeWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryReachableToStaleWhenConfirmationWithDifferentAddress(t *testing.T) { + c := 
DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryReachableToStaleWhenConfirmationWithDifferentAddressAndOverride(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch 
(-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryStaysStaleWhenProbeWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr1) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + } + 
nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToStaleWhenOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToStaleWhenProbeUpdateAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, 
+ NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaleToDelay(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToReachableWhenUpperLevelConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. 
+ c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleUpperLevelConfirmationLocked() + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 1 + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. 
+ c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryStaysDelayWhenOverrideConfirmationWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: 
entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToStaleWhenProbeWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToStaleWhenConfirmationWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, _ := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantProbes := []entryTestProbeInfo{ + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + 
EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryDelayToProbe(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + if got, want := e.neigh.State, Delay; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryProbeToStaleWhenProbeWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. 
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleProbeLocked(entryTestLinkAddr2) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryProbeToStaleWhenConfirmationWithDifferentAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. 
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Stale; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryStaysProbeWhenOverrideConfirmationWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. 
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr1; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +// TestEntryUnknownToStaleToProbeToReachable exercises the following scenario: +// 1. Probe is received +// 2. Entry is created in Stale +// 3. Packet is queued on the entry +// 4. Entry transitions to Delay then Probe +// 5. Probe is sent +func TestEntryUnknownToStaleToProbeToReachable(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Probe to Reachable will only take BaseReachableTime duration. 
+ c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handleProbeLocked(entryTestLinkAddr1) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // Probe caused by the Delay-to-Probe transition + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryProbeToReachableWhenSolicitedOverrideConfirmation(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. 
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr2, ReachabilityConfirmationFlags{ + Solicited: true, + Override: true, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + if got, want := e.neigh.LinkAddr, entryTestLinkAddr2; got != want { + t.Errorf("got e.neigh.LinkAddr = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr2, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryProbeToReachableWhenSolicitedConfirmationWithSameAddress(t *testing.T) { + c := DefaultNUDConfigurations() + // Eliminate random factors from ReachableTime computation so the transition + // from Stale to Reachable will only take BaseReachableTime duration. + c.MinRandomFactor = 1 + c.MaxRandomFactor = 1 + + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + clock.advance(c.DelayFirstProbeTime) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The second probe is caused by the Delay-to-Probe transition. 
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + e.mu.Lock() + if got, want := e.neigh.State, Probe; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: true, + Override: false, + IsRouter: false, + }) + if got, want := e.neigh.State, Reachable; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() + + clock.advance(c.BaseReachableTime) + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Reachable, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() +} + +func TestEntryProbeToFailed(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + c.MaxUnicastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) + clock.advance(waitFor) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The next three probe are caused by the Delay-to-Probe transition. 
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + e.mu.Lock() + if got, want := e.neigh.State, Failed; got != want { + t.Errorf("got e.neigh.State = %q, want = %q", got, want) + } + e.mu.Unlock() +} + +func TestEntryFailedGetsDeleted(t *testing.T) { + c := DefaultNUDConfigurations() + c.MaxMulticastProbes = 3 + c.MaxUnicastProbes = 3 + e, nudDisp, linkRes, clock := entryTestSetup(c) + + // Verify the cache contains the entry. + if _, ok := e.nic.neigh.cache[entryTestAddr1]; !ok { + t.Errorf("expected entry %q to exist in the neighbor cache", entryTestAddr1) + } + + e.mu.Lock() + e.handlePacketQueuedLocked() + e.handleConfirmationLocked(entryTestLinkAddr1, ReachabilityConfirmationFlags{ + Solicited: false, + Override: false, + IsRouter: false, + }) + e.handlePacketQueuedLocked() + e.mu.Unlock() + + waitFor := c.DelayFirstProbeTime + c.RetransmitTimer*time.Duration(c.MaxUnicastProbes) + c.UnreachableTime + clock.advance(waitFor) + + wantProbes := []entryTestProbeInfo{ + // The first probe is caused by the Unknown-to-Incomplete transition. + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: tcpip.LinkAddress(""), + LocalAddress: entryTestAddr2, + }, + // The next three probe are caused by the Delay-to-Probe transition. 
+ { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + { + RemoteAddress: entryTestAddr1, + RemoteLinkAddress: entryTestLinkAddr1, + LocalAddress: entryTestAddr2, + }, + } + linkRes.mu.Lock() + diff := cmp.Diff(linkRes.probes, wantProbes) + linkRes.mu.Unlock() + if diff != "" { + t.Fatalf("link address resolver probes mismatch (-got, +want):\n%s", diff) + } + + wantEvents := []testEntryEventInfo{ + { + EventType: entryTestAdded, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: tcpip.LinkAddress(""), + State: Incomplete, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Stale, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Delay, + }, + { + EventType: entryTestChanged, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + { + EventType: entryTestRemoved, + NICID: entryTestNICID, + Addr: entryTestAddr1, + LinkAddr: entryTestLinkAddr1, + State: Probe, + }, + } + nudDisp.mu.Lock() + if diff := cmp.Diff(nudDisp.events, wantEvents, eventDiffOpts()...); diff != "" { + t.Errorf("nud dispatcher events mismatch (-got, +want):\n%s", diff) + } + nudDisp.mu.Unlock() + + // Verify the cache no longer contains the entry. + if _, ok := e.nic.neigh.cache[entryTestAddr1]; ok { + t.Errorf("entry %q should have been deleted from the neighbor cache", entryTestAddr1) + } +} diff --git a/pkg/tcpip/stack/neighborstate_string.go b/pkg/tcpip/stack/neighborstate_string.go new file mode 100644 index 000000000..aa7311ec6 --- /dev/null +++ b/pkg/tcpip/stack/neighborstate_string.go @@ -0,0 +1,44 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by "stringer -type NeighborState"; DO NOT EDIT. + +package stack + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
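The generated guard function whose body follows works through constant array indexing: x is a one-element array, so each _ = x[State-N] statement compiles only while the corresponding constant still has the value N. If, for example, Delay were renumbered from 4 to 5, _ = x[Delay-4] would index x[1] and trigger the "invalid array index" compiler error mentioned in the comment until stringer is re-run.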
+ var x [1]struct{} + _ = x[Unknown-0] + _ = x[Incomplete-1] + _ = x[Reachable-2] + _ = x[Stale-3] + _ = x[Delay-4] + _ = x[Probe-5] + _ = x[Static-6] + _ = x[Failed-7] +} + +const _NeighborState_name = "UnknownIncompleteReachableStaleDelayProbeStaticFailed" + +var _NeighborState_index = [...]uint8{0, 7, 17, 26, 31, 36, 41, 47, 53} + +func (i NeighborState) String() string { + if i >= NeighborState(len(_NeighborState_index)-1) { + return "NeighborState(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _NeighborState_name[_NeighborState_index[i]:_NeighborState_index[i+1]] +} diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go index 54103fdb3..2315ea5b9 100644 --- a/pkg/tcpip/stack/nic.go +++ b/pkg/tcpip/stack/nic.go @@ -16,6 +16,7 @@ package stack import ( "fmt" + "math/rand" "reflect" "sort" "strings" @@ -45,6 +46,7 @@ type NIC struct { context NICContext stats NICStats + neigh *neighborCache mu struct { sync.RWMutex @@ -141,6 +143,16 @@ func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICC nic.mu.packetEPs[netProto.Number()] = []PacketEndpoint{} } + // Check for Neighbor Unreachability Detection support. + if ep.Capabilities()&CapabilityResolutionRequired != 0 && len(stack.linkAddrResolvers) != 0 { + rng := rand.New(rand.NewSource(stack.clock.NowNanoseconds())) + nic.neigh = &neighborCache{ + nic: nic, + state: NewNUDState(stack.nudConfigs, rng), + cache: make(map[tcpip.Address]*neighborEntry, neighborCacheSize), + } + } + nic.linkEP.Attach(nic) return nic @@ -181,7 +193,7 @@ func (n *NIC) disableLocked() *tcpip.Error { return nil } - // TODO(b/147015577): Should Routes that are currently bound to n be + // TODO(gvisor.dev/issue/1491): Should Routes that are currently bound to n be // invalidated? Currently, Routes will continue to work when a NIC is enabled // again, and applications may not know that the underlying NIC was ever // disabled. @@ -205,6 +217,11 @@ func (n *NIC) disableLocked() *tcpip.Error { } if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok { + // The NIC may have already left the multicast group. + if err := n.leaveGroupLocked(header.IPv4AllSystems, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress { + return err + } + // The address may have already been removed. if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress { return err @@ -243,6 +260,13 @@ func (n *NIC) enable() *tcpip.Error { if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil { return err } + + // As per RFC 1122 section 3.3.7, all hosts should join the all-hosts + // multicast group. Note, the IANA calls the all-hosts multicast group the + // all-systems multicast group. + if err := n.joinGroupLocked(header.IPv4ProtocolNumber, header.IPv4AllSystems); err != nil { + return err + } } // Join the IPv6 All-Nodes Multicast group if the stack is configured to @@ -457,8 +481,20 @@ type ipv6AddrCandidate struct { // remoteAddr must be a valid IPv6 address. func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint { n.mu.RLock() - defer n.mu.RUnlock() + ref := n.primaryIPv6EndpointRLocked(remoteAddr) + n.mu.RUnlock() + return ref +} +// primaryIPv6EndpointLocked returns an IPv6 endpoint following Source Address +// Selection (RFC 6724 section 5). +// +// Note, only rules 1-3 and 7 are followed. +// +// remoteAddr must be a valid IPv6 address. 
+// +// n.mu MUST be read locked. +func (n *NIC) primaryIPv6EndpointRLocked(remoteAddr tcpip.Address) *referencedNetworkEndpoint { primaryAddrs := n.mu.primary[header.IPv6ProtocolNumber] if len(primaryAddrs) == 0 { @@ -568,11 +604,6 @@ const ( // promiscuous indicates that the NIC's promiscuous flag should be observed // when getting a NIC's referenced network endpoint. promiscuous - - // forceSpoofing indicates that the NIC should be assumed to be spoofing, - // regardless of what the NIC's spoofing flag is when getting a NIC's - // referenced network endpoint. - forceSpoofing ) func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint { @@ -590,9 +621,10 @@ func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.A // If none exists a temporary one may be created if we are in promiscuous mode // or spoofing. Promiscuous mode will only be checked if promiscuous is true. // Similarly, spoofing will only be checked if spoofing is true. +// +// If the address is the IPv4 broadcast address for an endpoint's network, that +// endpoint will be returned. func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getRefBehaviour) *referencedNetworkEndpoint { - id := NetworkEndpointID{address} - n.mu.RLock() var spoofingOrPromiscuous bool @@ -601,24 +633,28 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t spoofingOrPromiscuous = n.mu.spoofing case promiscuous: spoofingOrPromiscuous = n.mu.promiscuous - case forceSpoofing: - spoofingOrPromiscuous = true } - if ref, ok := n.mu.endpoints[id]; ok { + if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok { // An endpoint with this id exists, check if it can be used and return it. - switch ref.getKind() { - case permanentExpired: - if !spoofingOrPromiscuous { - n.mu.RUnlock() - return nil - } - fallthrough - case temporary, permanent: - if ref.tryIncRef() { - n.mu.RUnlock() - return ref - } + if !ref.isAssignedRLocked(spoofingOrPromiscuous) { + n.mu.RUnlock() + return nil + } + + if ref.tryIncRef() { + n.mu.RUnlock() + return ref + } + } + + // Check if address is a broadcast address for the endpoint's network. + // + // Only IPv4 has a notion of broadcast addresses. + if protocol == header.IPv4ProtocolNumber { + if ref := n.getRefForBroadcastRLocked(address); ref != nil { + n.mu.RUnlock() + return ref } } @@ -654,11 +690,44 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t // endpoint, create a new "temporary" endpoint. It will only exist while // there's a route through it. n.mu.Lock() - if ref, ok := n.mu.endpoints[id]; ok { + ref := n.getRefOrCreateTempLocked(protocol, address, peb) + n.mu.Unlock() + return ref +} + +// getRefForBroadcastLocked returns an endpoint where address is the IPv4 +// broadcast address for the endpoint's network. +// +// n.mu MUST be read locked. +func (n *NIC) getRefForBroadcastRLocked(address tcpip.Address) *referencedNetworkEndpoint { + for _, ref := range n.mu.endpoints { + // Only IPv4 has a notion of broadcast addresses. + if ref.protocol != header.IPv4ProtocolNumber { + continue + } + + addr := ref.addrWithPrefix() + subnet := addr.Subnet() + if subnet.IsBroadcast(address) && ref.tryIncRef() { + return ref + } + } + + return nil +} + +/// getRefOrCreateTempLocked returns an existing endpoint for address or creates +/// and returns a temporary endpoint. 
+// +// If the address is the IPv4 broadcast address for an endpoint's network, that +// endpoint will be returned. +// +// n.mu must be write locked. +func (n *NIC) getRefOrCreateTempLocked(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint { + if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok { // No need to check the type as we are ok with expired endpoints at this // point. if ref.tryIncRef() { - n.mu.Unlock() return ref } // tryIncRef failing means the endpoint is scheduled to be removed once the @@ -667,10 +736,18 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t n.removeEndpointLocked(ref) } + // Check if address is a broadcast address for an endpoint's network. + // + // Only IPv4 has a notion of broadcast addresses. + if protocol == header.IPv4ProtocolNumber { + if ref := n.getRefForBroadcastRLocked(address); ref != nil { + return ref + } + } + // Add a new temporary endpoint. netProto, ok := n.stack.networkProtocols[protocol] if !ok { - n.mu.Unlock() return nil } ref, _ := n.addAddressLocked(tcpip.ProtocolAddress{ @@ -680,8 +757,6 @@ func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address t PrefixLen: netProto.DefaultPrefixLen(), }, }, peb, temporary, static, false) - - n.mu.Unlock() return ref } @@ -1153,7 +1228,7 @@ func (n *NIC) isInGroup(addr tcpip.Address) bool { return joins != 0 } -func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt PacketBuffer) { +func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt *PacketBuffer) { r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */) r.RemoteLinkAddress = remotelinkAddr @@ -1167,7 +1242,7 @@ func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, // Note that the ownership of the slice backing vv is retained by the caller. // This rule applies only to the slice itself, not to the items of the slice; // the ownership of the items is not retained by the caller. -func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) { +func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { n.mu.RLock() enabled := n.mu.enabled // If the NIC is not yet enabled, don't receive any packets. @@ -1197,27 +1272,34 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link // Are any packet sockets listening for this network protocol? packetEPs := n.mu.packetEPs[protocol] - // Check whether there are packet sockets listening for every protocol. - // If we received a packet with protocol EthernetProtocolAll, then the - // previous for loop will have handled it. - if protocol != header.EthernetProtocolAll { - packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...) - } + // Add any other packet sockets that maybe listening for all protocols. + packetEPs = append(packetEPs, n.mu.packetEPs[header.EthernetProtocolAll]...) 
n.mu.RUnlock() for _, ep := range packetEPs { - ep.HandlePacket(n.id, local, protocol, pkt.Clone()) + p := pkt.Clone() + p.PktType = tcpip.PacketHost + ep.HandlePacket(n.id, local, protocol, p) } if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber { n.stack.stats.IP.PacketsReceived.Increment() } - netHeader, ok := pkt.Data.PullUp(netProto.MinimumPacketSize()) + // Parse headers. + transProtoNum, hasTransportHdr, ok := netProto.Parse(pkt) if !ok { + // The packet is too small to contain a network header. n.stack.stats.MalformedRcvdPackets.Increment() return } - src, dst := netProto.ParseAddresses(netHeader) + if hasTransportHdr { + // Parse the transport header if present. + if state, ok := n.stack.transportProtocols[transProtoNum]; ok { + state.proto.Parse(pkt) + } + } + + src, dst := netProto.ParseAddresses(pkt.NetworkHeader().View()) if n.stack.handleLocal && !n.isLoopback() && n.getRef(protocol, src) != nil { // The source address is one of our own, so we never should have gotten a @@ -1229,18 +1311,19 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link } // TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet. - if protocol == header.IPv4ProtocolNumber { + // Loopback traffic skips the prerouting chain. + if protocol == header.IPv4ProtocolNumber && !n.isLoopback() { // iptables filtering. ipt := n.stack.IPTables() address := n.primaryAddress(protocol) - if ok := ipt.Check(Prerouting, &pkt, nil, nil, address.Address, ""); !ok { + if ok := ipt.Check(Prerouting, pkt, nil, nil, address.Address, ""); !ok { // iptables is telling us to drop the packet. return } } if ref := n.getRef(protocol, dst); ref != nil { - handlePacket(protocol, dst, src, linkEP.LinkAddress(), remote, ref, pkt) + handlePacket(protocol, dst, src, n.linkEP.LinkAddress(), remote, ref, pkt) return } @@ -1298,24 +1381,50 @@ func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.Link } } -func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) { - // TODO(b/143425874) Decrease the TTL field in forwarded packets. - if linkHeaderLen := int(n.linkEP.MaxHeaderLength()); linkHeaderLen != 0 { - pkt.Header = buffer.NewPrependable(linkHeaderLen) +// DeliverOutboundPacket implements NetworkDispatcher.DeliverOutboundPacket. +func (n *NIC) DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + n.mu.RLock() + // We do not deliver to protocol specific packet endpoints as on Linux + // only ETH_P_ALL endpoints get outbound packets. + // Add any other packet sockets that maybe listening for all protocols. + packetEPs := n.mu.packetEPs[header.EthernetProtocolAll] + n.mu.RUnlock() + for _, ep := range packetEPs { + p := pkt.Clone() + p.PktType = tcpip.PacketOutgoing + // Add the link layer header as outgoing packets are intercepted + // before the link layer header is created. + n.linkEP.AddHeader(local, remote, protocol, p) + ep.HandlePacket(n.id, local, protocol, p) } +} - if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil { +func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + // TODO(b/143425874) Decrease the TTL field in forwarded packets. + + // pkt may have set its header and may not have enough headroom for link-layer + // header for the other link to prepend. Here we create a new packet to + // forward. 
+ fwdPkt := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: int(n.linkEP.MaxHeaderLength()), + Data: buffer.NewVectorisedView(pkt.Size(), pkt.Views()), + }) + + // WritePacket takes ownership of fwdPkt, calculate numBytes first. + numBytes := fwdPkt.Size() + + if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, fwdPkt); err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() return } n.stats.Tx.Packets.Increment() - n.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size())) + n.stats.Tx.Bytes.IncrementBy(uint64(numBytes)) } // DeliverTransportPacket delivers the packets to the appropriate transport // protocol endpoint. -func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) { +func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) { state, ok := n.stack.transportProtocols[protocol] if !ok { n.stack.stats.UnknownProtocolRcvdPackets.Increment() @@ -1329,13 +1438,31 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN // validly formed. n.stack.demux.deliverRawPacket(r, protocol, pkt) - transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize()) - if !ok { + // TransportHeader is empty only when pkt is an ICMP packet or was reassembled + // from fragments. + if pkt.TransportHeader().View().IsEmpty() { + // TODO(gvisor.dev/issue/170): ICMP packets don't have their TransportHeader + // fields set yet, parse it here. See icmp/protocol.go:protocol.Parse for a + // full explanation. + if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber { + // ICMP packets may be longer, but until icmp.Parse is implemented, here + // we parse it using the minimum size. + if _, ok := pkt.TransportHeader().Consume(transProto.MinimumPacketSize()); !ok { + n.stack.stats.MalformedRcvdPackets.Increment() + return + } + } else { + // This is either a bad packet or was re-assembled from fragments. + transProto.Parse(pkt) + } + } + + if pkt.TransportHeader().View().Size() < transProto.MinimumPacketSize() { n.stack.stats.MalformedRcvdPackets.Increment() return } - srcPort, dstPort, err := transProto.ParsePorts(transHeader) + srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader().View()) if err != nil { n.stack.stats.MalformedRcvdPackets.Increment() return @@ -1362,7 +1489,7 @@ func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolN // DeliverTransportControlPacket delivers control packets to the appropriate // transport protocol endpoint. -func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer) { +func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer) { state, ok := n.stack.transportProtocols[trans] if !ok { return @@ -1477,6 +1604,27 @@ func (n *NIC) setNDPConfigs(c NDPConfigurations) { n.mu.Unlock() } +// NUDConfigs gets the NUD configurations for n. +func (n *NIC) NUDConfigs() (NUDConfigurations, *tcpip.Error) { + if n.neigh == nil { + return NUDConfigurations{}, tcpip.ErrNotSupported + } + return n.neigh.config(), nil +} + +// setNUDConfigs sets the NUD configurations for n. 
+// +// Note, if c contains invalid NUD configuration values, it will be fixed to +// use default values for the erroneous values. +func (n *NIC) setNUDConfigs(c NUDConfigurations) *tcpip.Error { + if n.neigh == nil { + return tcpip.ErrNotSupported + } + c.resetInvalidFields() + n.neigh.setConfig(c) + return nil +} + // handleNDPRA handles an NDP Router Advertisement message that arrived on n. func (n *NIC) handleNDPRA(ip tcpip.Address, ra header.NDPRouterAdvert) { n.mu.Lock() @@ -1611,8 +1759,8 @@ func (r *referencedNetworkEndpoint) setKind(kind networkEndpointKind) { } // isValidForOutgoing returns true if the endpoint can be used to send out a -// packet. It requires the endpoint to not be marked expired (i.e., its address -// has been removed), or the NIC to be in spoofing mode. +// packet. It requires the endpoint to not be marked expired (i.e., its address) +// has been removed) unless the NIC is in spoofing mode, or temporary. func (r *referencedNetworkEndpoint) isValidForOutgoing() bool { r.nic.mu.RLock() defer r.nic.mu.RUnlock() @@ -1620,13 +1768,28 @@ func (r *referencedNetworkEndpoint) isValidForOutgoing() bool { return r.isValidForOutgoingRLocked() } -// isValidForOutgoingRLocked returns true if the endpoint can be used to send -// out a packet. It requires the endpoint to not be marked expired (i.e., its -// address has been removed), or the NIC to be in spoofing mode. -// -// r's NIC must be read locked. +// isValidForOutgoingRLocked is the same as isValidForOutgoing but requires +// r.nic.mu to be read locked. func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool { - return r.nic.mu.enabled && (r.getKind() != permanentExpired || r.nic.mu.spoofing) + if !r.nic.mu.enabled { + return false + } + + return r.isAssignedRLocked(r.nic.mu.spoofing) +} + +// isAssignedRLocked returns true if r is considered to be assigned to the NIC. +// +// r.nic.mu must be read locked. +func (r *referencedNetworkEndpoint) isAssignedRLocked(spoofingOrPromiscuous bool) bool { + switch r.getKind() { + case permanentTentative: + return false + case permanentExpired: + return spoofingOrPromiscuous + default: + return true + } } // expireLocked decrements the reference count and marks the permanent endpoint diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go index d672fc157..0870c8d9c 100644 --- a/pkg/tcpip/stack/nic_test.go +++ b/pkg/tcpip/stack/nic_test.go @@ -15,11 +15,278 @@ package stack import ( + "math" "testing" + "time" + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" ) +var _ LinkEndpoint = (*testLinkEndpoint)(nil) + +// A LinkEndpoint that throws away outgoing packets. +// +// We use this instead of the channel endpoint as the channel package depends on +// the stack package which this test lives in, causing a cyclic dependency. +type testLinkEndpoint struct { + dispatcher NetworkDispatcher +} + +// Attach implements LinkEndpoint.Attach. +func (e *testLinkEndpoint) Attach(dispatcher NetworkDispatcher) { + e.dispatcher = dispatcher +} + +// IsAttached implements LinkEndpoint.IsAttached. +func (e *testLinkEndpoint) IsAttached() bool { + return e.dispatcher != nil +} + +// MTU implements LinkEndpoint.MTU. +func (*testLinkEndpoint) MTU() uint32 { + return math.MaxUint16 +} + +// Capabilities implements LinkEndpoint.Capabilities. +func (*testLinkEndpoint) Capabilities() LinkEndpointCapabilities { + return CapabilityResolutionRequired +} + +// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength. 
+func (*testLinkEndpoint) MaxHeaderLength() uint16 { + return 0 +} + +// LinkAddress returns the link address of this endpoint. +func (*testLinkEndpoint) LinkAddress() tcpip.LinkAddress { + return "" +} + +// Wait implements LinkEndpoint.Wait. +func (*testLinkEndpoint) Wait() {} + +// WritePacket implements LinkEndpoint.WritePacket. +func (e *testLinkEndpoint) WritePacket(*Route, *GSO, tcpip.NetworkProtocolNumber, *PacketBuffer) *tcpip.Error { + return nil +} + +// WritePackets implements LinkEndpoint.WritePackets. +func (e *testLinkEndpoint) WritePackets(*Route, *GSO, PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) { + // Our tests don't use this so we don't support it. + return 0, tcpip.ErrNotSupported +} + +// WriteRawPacket implements LinkEndpoint.WriteRawPacket. +func (e *testLinkEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error { + // Our tests don't use this so we don't support it. + return tcpip.ErrNotSupported +} + +// ARPHardwareType implements stack.LinkEndpoint.ARPHardwareType. +func (*testLinkEndpoint) ARPHardwareType() header.ARPHardwareType { + panic("not implemented") +} + +// AddHeader implements stack.LinkEndpoint.AddHeader. +func (e *testLinkEndpoint) AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) { + panic("not implemented") +} + +var _ NetworkEndpoint = (*testIPv6Endpoint)(nil) + +// An IPv6 NetworkEndpoint that throws away outgoing packets. +// +// We use this instead of ipv6.endpoint because the ipv6 package depends on +// the stack package which this test lives in, causing a cyclic dependency. +type testIPv6Endpoint struct { + nicID tcpip.NICID + id NetworkEndpointID + prefixLen int + linkEP LinkEndpoint + protocol *testIPv6Protocol +} + +// DefaultTTL implements NetworkEndpoint.DefaultTTL. +func (*testIPv6Endpoint) DefaultTTL() uint8 { + return 0 +} + +// MTU implements NetworkEndpoint.MTU. +func (e *testIPv6Endpoint) MTU() uint32 { + return e.linkEP.MTU() - header.IPv6MinimumSize +} + +// Capabilities implements NetworkEndpoint.Capabilities. +func (e *testIPv6Endpoint) Capabilities() LinkEndpointCapabilities { + return e.linkEP.Capabilities() +} + +// MaxHeaderLength implements NetworkEndpoint.MaxHeaderLength. +func (e *testIPv6Endpoint) MaxHeaderLength() uint16 { + return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize +} + +// WritePacket implements NetworkEndpoint.WritePacket. +func (*testIPv6Endpoint) WritePacket(*Route, *GSO, NetworkHeaderParams, *PacketBuffer) *tcpip.Error { + return nil +} + +// WritePackets implements NetworkEndpoint.WritePackets. +func (*testIPv6Endpoint) WritePackets(*Route, *GSO, PacketBufferList, NetworkHeaderParams) (int, *tcpip.Error) { + // Our tests don't use this so we don't support it. + return 0, tcpip.ErrNotSupported +} + +// WriteHeaderIncludedPacket implements +// NetworkEndpoint.WriteHeaderIncludedPacket. +func (*testIPv6Endpoint) WriteHeaderIncludedPacket(*Route, *PacketBuffer) *tcpip.Error { + // Our tests don't use this so we don't support it. + return tcpip.ErrNotSupported +} + +// ID implements NetworkEndpoint.ID. +func (e *testIPv6Endpoint) ID() *NetworkEndpointID { + return &e.id +} + +// PrefixLen implements NetworkEndpoint.PrefixLen. +func (e *testIPv6Endpoint) PrefixLen() int { + return e.prefixLen +} + +// NICID implements NetworkEndpoint.NICID. +func (e *testIPv6Endpoint) NICID() tcpip.NICID { + return e.nicID +} + +// HandlePacket implements NetworkEndpoint.HandlePacket. 
+func (*testIPv6Endpoint) HandlePacket(*Route, *PacketBuffer) { +} + +// Close implements NetworkEndpoint.Close. +func (*testIPv6Endpoint) Close() {} + +// NetworkProtocolNumber implements NetworkEndpoint.NetworkProtocolNumber. +func (*testIPv6Endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { + return header.IPv6ProtocolNumber +} + +var _ NetworkProtocol = (*testIPv6Protocol)(nil) + +// An IPv6 NetworkProtocol that supports the bare minimum to make a stack +// believe it supports IPv6. +// +// We use this instead of ipv6.protocol because the ipv6 package depends on +// the stack package which this test lives in, causing a cyclic dependency. +type testIPv6Protocol struct{} + +// Number implements NetworkProtocol.Number. +func (*testIPv6Protocol) Number() tcpip.NetworkProtocolNumber { + return header.IPv6ProtocolNumber +} + +// MinimumPacketSize implements NetworkProtocol.MinimumPacketSize. +func (*testIPv6Protocol) MinimumPacketSize() int { + return header.IPv6MinimumSize +} + +// DefaultPrefixLen implements NetworkProtocol.DefaultPrefixLen. +func (*testIPv6Protocol) DefaultPrefixLen() int { + return header.IPv6AddressSize * 8 +} + +// ParseAddresses implements NetworkProtocol.ParseAddresses. +func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { + h := header.IPv6(v) + return h.SourceAddress(), h.DestinationAddress() +} + +// NewEndpoint implements NetworkProtocol.NewEndpoint. +func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) { + return &testIPv6Endpoint{ + nicID: nicID, + id: NetworkEndpointID{LocalAddress: addrWithPrefix.Address}, + prefixLen: addrWithPrefix.PrefixLen, + linkEP: linkEP, + protocol: p, + }, nil +} + +// SetOption implements NetworkProtocol.SetOption. +func (*testIPv6Protocol) SetOption(interface{}) *tcpip.Error { + return nil +} + +// Option implements NetworkProtocol.Option. +func (*testIPv6Protocol) Option(interface{}) *tcpip.Error { + return nil +} + +// Close implements NetworkProtocol.Close. +func (*testIPv6Protocol) Close() {} + +// Wait implements NetworkProtocol.Wait. +func (*testIPv6Protocol) Wait() {} + +// Parse implements NetworkProtocol.Parse. +func (*testIPv6Protocol) Parse(*PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) { + return 0, false, false +} + +var _ LinkAddressResolver = (*testIPv6Protocol)(nil) + +// LinkAddressProtocol implements LinkAddressResolver. +func (*testIPv6Protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber { + return header.IPv6ProtocolNumber +} + +// LinkAddressRequest implements LinkAddressResolver. +func (*testIPv6Protocol) LinkAddressRequest(_, _ tcpip.Address, _ tcpip.LinkAddress, _ LinkEndpoint) *tcpip.Error { + return nil +} + +// ResolveStaticAddress implements LinkAddressResolver. +func (*testIPv6Protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) { + if header.IsV6MulticastAddress(addr) { + return header.EthernetAddressFromMulticastIPv6Address(addr), true + } + return "", false +} + +// Test the race condition where a NIC is removed and an RS timer fires at the +// same time. 
+func TestRemoveNICWhileHandlingRSTimer(t *testing.T) { + const ( + nicID = 1 + + maxRtrSolicitations = 5 + ) + + e := testLinkEndpoint{} + s := New(Options{ + NetworkProtocols: []NetworkProtocol{&testIPv6Protocol{}}, + NDPConfigs: NDPConfigurations{ + MaxRtrSolicitations: maxRtrSolicitations, + RtrSolicitationInterval: minimumRtrSolicitationInterval, + }, + }) + + if err := s.CreateNIC(nicID, &e); err != nil { + t.Fatalf("s.CreateNIC(%d, _) = %s", nicID, err) + } + + s.mu.Lock() + // Wait for the router solicitation timer to fire and block trying to obtain + // the stack lock when doing link address resolution. + time.Sleep(minimumRtrSolicitationInterval * 2) + if err := s.removeNICLocked(nicID); err != nil { + t.Fatalf("s.removeNICLocked(%d) = %s", nicID, err) + } + s.mu.Unlock() +} + func TestDisabledRxStatsWhenNICDisabled(t *testing.T) { // When the NIC is disabled, the only field that matters is the stats field. // This test is limited to stats counter checks. @@ -44,7 +311,9 @@ func TestDisabledRxStatsWhenNICDisabled(t *testing.T) { t.FailNow() } - nic.DeliverNetworkPacket(nil, "", "", 0, PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()}) + nic.DeliverNetworkPacket("", "", 0, NewPacketBuffer(PacketBufferOptions{ + Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView(), + })) if got := nic.stats.DisabledRx.Packets.Value(); got != 1 { t.Errorf("got DisabledRx.Packets = %d, want = 1", got) diff --git a/pkg/tcpip/stack/nud.go b/pkg/tcpip/stack/nud.go new file mode 100644 index 000000000..e1ec15487 --- /dev/null +++ b/pkg/tcpip/stack/nud.go @@ -0,0 +1,466 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "math" + "sync" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" +) + +const ( + // defaultBaseReachableTime is the default base duration for computing the + // random reachable time. + // + // Reachable time is the duration for which a neighbor is considered + // reachable after a positive reachability confirmation is received. It is a + // function of a uniformly distributed random value between the minimum and + // maximum random factors, multiplied by the base reachable time. Using a + // random component eliminates the possibility that Neighbor Unreachability + // Detection messages will synchronize with each other. + // + // Default taken from REACHABLE_TIME of RFC 4861 section 10. + defaultBaseReachableTime = 30 * time.Second + + // minimumBaseReachableTime is the minimum base duration for computing the + // random reachable time. + // + // Minimum = 1ms + minimumBaseReachableTime = time.Millisecond + + // defaultMinRandomFactor is the default minimum value of the random factor + // used for computing reachable time. + // + // Default taken from MIN_RANDOM_FACTOR of RFC 4861 section 10. + defaultMinRandomFactor = 0.5 + + // defaultMaxRandomFactor is the default maximum value of the random factor + // used for computing reachable time. 
+ // + // The default value depends on the value of MinRandomFactor. + // If MinRandomFactor is less than MAX_RANDOM_FACTOR of RFC 4861 section 10, + // the value from the RFC will be used; otherwise, the default is + // MinRandomFactor multiplied by three. + defaultMaxRandomFactor = 1.5 + + // defaultRetransmitTimer is the default amount of time to wait between + // sending reachability probes. + // + // Default taken from RETRANS_TIMER of RFC 4861 section 10. + defaultRetransmitTimer = time.Second + + // minimumRetransmitTimer is the minimum amount of time to wait between + // sending reachability probes. + // + // Note, RFC 4861 does not impose a minimum Retransmit Timer, but we do here + // to make sure the messages are not sent all at once. We also come to this + // value because in the RetransmitTimer field of a Router Advertisement, a + // value of 0 means unspecified, so the smallest valid value is 1. Note, the + // unit of the RetransmitTimer field in the Router Advertisement is + // milliseconds. + minimumRetransmitTimer = time.Millisecond + + // defaultDelayFirstProbeTime is the default duration to wait for a + // non-Neighbor-Discovery related protocol to reconfirm reachability after + // entering the DELAY state. After this time, a reachability probe will be + // sent and the entry will transition to the PROBE state. + // + // Default taken from DELAY_FIRST_PROBE_TIME of RFC 4861 section 10. + defaultDelayFirstProbeTime = 5 * time.Second + + // defaultMaxMulticastProbes is the default number of reachabililty probes + // to send before concluding negative reachability and deleting the neighbor + // entry from the INCOMPLETE state. + // + // Default taken from MAX_MULTICAST_SOLICIT of RFC 4861 section 10. + defaultMaxMulticastProbes = 3 + + // defaultMaxUnicastProbes is the default number of reachability probes to + // send before concluding retransmission from within the PROBE state should + // cease and the entry SHOULD be deleted. + // + // Default taken from MAX_UNICASE_SOLICIT of RFC 4861 section 10. + defaultMaxUnicastProbes = 3 + + // defaultMaxAnycastDelayTime is the default time in which the stack SHOULD + // delay sending a response for a random time between 0 and this time, if the + // target address is an anycast address. + // + // Default taken from MAX_ANYCAST_DELAY_TIME of RFC 4861 section 10. + defaultMaxAnycastDelayTime = time.Second + + // defaultMaxReachbilityConfirmations is the default amount of unsolicited + // reachability confirmation messages a node MAY send to all-node multicast + // address when it determines its link-layer address has changed. + // + // Default taken from MAX_NEIGHBOR_ADVERTISEMENT of RFC 4861 section 10. + defaultMaxReachbilityConfirmations = 3 + + // defaultUnreachableTime is the default duration for how long an entry will + // remain in the FAILED state before being removed from the neighbor cache. + // + // Note, there is no equivalent protocol constant defined in RFC 4861. It + // leaves the specifics of any garbage collection mechanism up to the + // implementation. + defaultUnreachableTime = 5 * time.Second +) + +// NUDDispatcher is the interface integrators of netstack must implement to +// receive and handle NUD related events. +type NUDDispatcher interface { + // OnNeighborAdded will be called when a new entry is added to a NIC's (with + // ID nicID) neighbor table. + // + // This function is permitted to block indefinitely without interfering with + // the stack's operation. + // + // May be called concurrently. 
+ OnNeighborAdded(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) + + // OnNeighborChanged will be called when an entry in a NIC's (with ID nicID) + // neighbor table changes state and/or link address. + // + // This function is permitted to block indefinitely without interfering with + // the stack's operation. + // + // May be called concurrently. + OnNeighborChanged(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) + + // OnNeighborRemoved will be called when an entry is removed from a NIC's + // (with ID nicID) neighbor table. + // + // This function is permitted to block indefinitely without interfering with + // the stack's operation. + // + // May be called concurrently. + OnNeighborRemoved(nicID tcpip.NICID, ipAddr tcpip.Address, linkAddr tcpip.LinkAddress, state NeighborState, updatedAt time.Time) +} + +// ReachabilityConfirmationFlags describes the flags used within a reachability +// confirmation (e.g. ARP reply or Neighbor Advertisement for ARP or NDP, +// respectively). +type ReachabilityConfirmationFlags struct { + // Solicited indicates that the advertisement was sent in response to a + // reachability probe. + Solicited bool + + // Override indicates that the reachability confirmation should override an + // existing neighbor cache entry and update the cached link-layer address. + // When Override is not set the confirmation will not update a cached + // link-layer address, but will update an existing neighbor cache entry for + // which no link-layer address is known. + Override bool + + // IsRouter indicates that the sender is a router. + IsRouter bool +} + +// NUDHandler communicates external events to the Neighbor Unreachability +// Detection state machine, which is implemented per-interface. This is used by +// network endpoints to inform the Neighbor Cache of probes and confirmations. +type NUDHandler interface { + // HandleProbe processes an incoming neighbor probe (e.g. ARP request or + // Neighbor Solicitation for ARP or NDP, respectively). Validation of the + // probe needs to be performed before calling this function since the + // Neighbor Cache doesn't have access to view the NIC's assigned addresses. + HandleProbe(remoteAddr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, remoteLinkAddr tcpip.LinkAddress, linkRes LinkAddressResolver) + + // HandleConfirmation processes an incoming neighbor confirmation (e.g. ARP + // reply or Neighbor Advertisement for ARP or NDP, respectively). + HandleConfirmation(addr tcpip.Address, linkAddr tcpip.LinkAddress, flags ReachabilityConfirmationFlags) + + // HandleUpperLevelConfirmation processes an incoming upper-level protocol + // (e.g. TCP acknowledgements) reachability confirmation. + HandleUpperLevelConfirmation(addr tcpip.Address) +} + +// NUDConfigurations is the NUD configurations for the netstack. This is used +// by the neighbor cache to operate the NUD state machine on each device in the +// local network. +type NUDConfigurations struct { + // BaseReachableTime is the base duration for computing the random reachable + // time. + // + // Reachable time is the duration for which a neighbor is considered + // reachable after a positive reachability confirmation is received. It is a + // function of uniformly distributed random value between minRandomFactor and + // maxRandomFactor multiplied by baseReachableTime. 
Using a random component + // eliminates the possibility that Neighbor Unreachability Detection messages + // will synchronize with each other. + // + // After this time, a neighbor entry will transition from REACHABLE to STALE + // state. + // + // Must be greater than 0. + BaseReachableTime time.Duration + + // LearnBaseReachableTime enables learning BaseReachableTime during runtime + // from the neighbor discovery protocol, if supported. + // + // TODO(gvisor.dev/issue/2240): Implement this NUD configuration option. + LearnBaseReachableTime bool + + // MinRandomFactor is the minimum value of the random factor used for + // computing reachable time. + // + // See BaseReachbleTime for more information on computing the reachable time. + // + // Must be greater than 0. + MinRandomFactor float32 + + // MaxRandomFactor is the maximum value of the random factor used for + // computing reachabile time. + // + // See BaseReachbleTime for more information on computing the reachable time. + // + // Must be great than or equal to MinRandomFactor. + MaxRandomFactor float32 + + // RetransmitTimer is the duration between retransmission of reachability + // probes in the PROBE state. + RetransmitTimer time.Duration + + // LearnRetransmitTimer enables learning RetransmitTimer during runtime from + // the neighbor discovery protocol, if supported. + // + // TODO(gvisor.dev/issue/2241): Implement this NUD configuration option. + LearnRetransmitTimer bool + + // DelayFirstProbeTime is the duration to wait for a non-Neighbor-Discovery + // related protocol to reconfirm reachability after entering the DELAY state. + // After this time, a reachability probe will be sent and the entry will + // transition to the PROBE state. + // + // Must be greater than 0. + DelayFirstProbeTime time.Duration + + // MaxMulticastProbes is the number of reachability probes to send before + // concluding negative reachability and deleting the neighbor entry from the + // INCOMPLETE state. + // + // Must be greater than 0. + MaxMulticastProbes uint32 + + // MaxUnicastProbes is the number of reachability probes to send before + // concluding retransmission from within the PROBE state should cease and + // entry SHOULD be deleted. + // + // Must be greater than 0. + MaxUnicastProbes uint32 + + // MaxAnycastDelayTime is the time in which the stack SHOULD delay sending a + // response for a random time between 0 and this time, if the target address + // is an anycast address. + // + // TODO(gvisor.dev/issue/2242): Use this option when sending solicited + // neighbor confirmations to anycast addresses and proxying neighbor + // confirmations. + MaxAnycastDelayTime time.Duration + + // MaxReachabilityConfirmations is the number of unsolicited reachability + // confirmation messages a node MAY send to all-node multicast address when + // it determines its link-layer address has changed. + // + // TODO(gvisor.dev/issue/2246): Discuss if implementation of this NUD + // configuration option is necessary. + MaxReachabilityConfirmations uint32 + + // UnreachableTime describes how long an entry will remain in the FAILED + // state before being removed from the neighbor cache. + UnreachableTime time.Duration +} + +// DefaultNUDConfigurations returns a NUDConfigurations populated with default +// values defined by RFC 4861 section 10. 
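Editor's note: the reachable time these fields control is BaseReachableTime multiplied by a random factor drawn uniformly from [MinRandomFactor, MaxRandomFactor] (RFC 4861 section 6.3.2). A standalone sketch of that formula, deliberately ignoring the overflow and precision handling the real code performs; all names here are illustrative:

package main

import (
	"fmt"
	"time"
)

// sketchReachableTime mirrors the RFC 4861 section 6.3.2 formula:
// reachable time = BaseReachableTime * uniform(MinRandomFactor, MaxRandomFactor).
// rnd stands in for a random value in [0.0, 1.0).
func sketchReachableTime(base time.Duration, minFactor, maxFactor, rnd float32) time.Duration {
	factor := minFactor + rnd*(maxFactor-minFactor)
	return time.Duration(float32(base) * factor)
}

func main() {
	// With the defaults (30s base, factors in [0.5, 1.5]) and rnd = 0.5 the
	// factor is exactly 1, i.e. roughly the base time. The real implementation
	// special-cases a factor of 1 (and guards against overflow) to avoid the
	// float32 rounding this naive version exhibits.
	fmt.Println(sketchReachableTime(30*time.Second, 0.5, 1.5, 0.5))
}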
+func DefaultNUDConfigurations() NUDConfigurations { + return NUDConfigurations{ + BaseReachableTime: defaultBaseReachableTime, + LearnBaseReachableTime: true, + MinRandomFactor: defaultMinRandomFactor, + MaxRandomFactor: defaultMaxRandomFactor, + RetransmitTimer: defaultRetransmitTimer, + LearnRetransmitTimer: true, + DelayFirstProbeTime: defaultDelayFirstProbeTime, + MaxMulticastProbes: defaultMaxMulticastProbes, + MaxUnicastProbes: defaultMaxUnicastProbes, + MaxAnycastDelayTime: defaultMaxAnycastDelayTime, + MaxReachabilityConfirmations: defaultMaxReachbilityConfirmations, + UnreachableTime: defaultUnreachableTime, + } +} + +// resetInvalidFields modifies an invalid NDPConfigurations with valid values. +// If invalid values are present in c, the corresponding default values will be +// used instead. This is needed to check, and conditionally fix, user-specified +// NUDConfigurations. +func (c *NUDConfigurations) resetInvalidFields() { + if c.BaseReachableTime < minimumBaseReachableTime { + c.BaseReachableTime = defaultBaseReachableTime + } + if c.MinRandomFactor <= 0 { + c.MinRandomFactor = defaultMinRandomFactor + } + if c.MaxRandomFactor < c.MinRandomFactor { + c.MaxRandomFactor = calcMaxRandomFactor(c.MinRandomFactor) + } + if c.RetransmitTimer < minimumRetransmitTimer { + c.RetransmitTimer = defaultRetransmitTimer + } + if c.DelayFirstProbeTime == 0 { + c.DelayFirstProbeTime = defaultDelayFirstProbeTime + } + if c.MaxMulticastProbes == 0 { + c.MaxMulticastProbes = defaultMaxMulticastProbes + } + if c.MaxUnicastProbes == 0 { + c.MaxUnicastProbes = defaultMaxUnicastProbes + } + if c.UnreachableTime == 0 { + c.UnreachableTime = defaultUnreachableTime + } +} + +// calcMaxRandomFactor calculates the maximum value of the random factor used +// for computing reachable time. This function is necessary for when the +// default specified in RFC 4861 section 10 is less than the current +// MinRandomFactor. +// +// Assumes minRandomFactor is positive since validation of the minimum value +// should come before the validation of the maximum. +func calcMaxRandomFactor(minRandomFactor float32) float32 { + if minRandomFactor > defaultMaxRandomFactor { + return minRandomFactor * 3 + } + return defaultMaxRandomFactor +} + +// A Rand is a source of random numbers. +type Rand interface { + // Float32 returns, as a float32, a pseudo-random number in [0.0,1.0). + Float32() float32 +} + +// NUDState stores states needed for calculating reachable time. +type NUDState struct { + rng Rand + + // mu protects the fields below. + // + // It is necessary for NUDState to handle its own locking since neighbor + // entries may access the NUD state from within the goroutine spawned by + // time.AfterFunc(). This goroutine may run concurrently with the main + // process for controlling the neighbor cache and would otherwise introduce + // race conditions if NUDState was not locked properly. + mu sync.RWMutex + + config NUDConfigurations + + // reachableTime is the duration to wait for a REACHABLE entry to + // transition into STALE after inactivity. This value is calculated with + // the algorithm defined in RFC 4861 section 6.3.2. + reachableTime time.Duration + + expiration time.Time + prevBaseReachableTime time.Duration + prevMinRandomFactor float32 + prevMaxRandomFactor float32 +} + +// NewNUDState returns new NUDState using c as configuration and the specified +// random number generator for use in recomputing ReachableTime. 
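Editor's note: the fallback performed by resetInvalidFields together with calcMaxRandomFactor can be restated compactly. The sketch below is an illustrative restatement only, and assumes MinRandomFactor has already been validated as positive (the real code validates it first):

package main

import "fmt"

// sketchFixMaxRandomFactor restates the MaxRandomFactor fallback: keep a valid
// value, otherwise fall back to the RFC 4861 default (1.5), unless the minimum
// already exceeds that default, in which case use three times the minimum.
func sketchFixMaxRandomFactor(minFactor, maxFactor float32) float32 {
	const rfcDefaultMax = 1.5 // MAX_RANDOM_FACTOR from RFC 4861 section 10
	if maxFactor >= minFactor {
		return maxFactor // already valid
	}
	if minFactor > rfcDefaultMax {
		return minFactor * 3
	}
	return rfcDefaultMax
}

func main() {
	fmt.Println(sketchFixMaxRandomFactor(0.5, 1.5)) // 1.5 (valid, unchanged)
	fmt.Println(sketchFixMaxRandomFactor(1.0, 0.5)) // 1.5 (reset to RFC default)
	fmt.Println(sketchFixMaxRandomFactor(3, 1.5))   // 9 (three times the minimum)
}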
+func NewNUDState(c NUDConfigurations, rng Rand) *NUDState { + s := &NUDState{ + rng: rng, + } + s.config = c + return s +} + +// Config returns the NUD configuration. +func (s *NUDState) Config() NUDConfigurations { + s.mu.RLock() + defer s.mu.RUnlock() + return s.config +} + +// SetConfig replaces the existing NUD configurations with c. +func (s *NUDState) SetConfig(c NUDConfigurations) { + s.mu.Lock() + defer s.mu.Unlock() + s.config = c +} + +// ReachableTime returns the duration to wait for a REACHABLE entry to +// transition into STALE after inactivity. This value is recalculated for new +// values of BaseReachableTime, MinRandomFactor, and MaxRandomFactor using the +// algorithm defined in RFC 4861 section 6.3.2. +func (s *NUDState) ReachableTime() time.Duration { + s.mu.Lock() + defer s.mu.Unlock() + + if time.Now().After(s.expiration) || + s.config.BaseReachableTime != s.prevBaseReachableTime || + s.config.MinRandomFactor != s.prevMinRandomFactor || + s.config.MaxRandomFactor != s.prevMaxRandomFactor { + return s.recomputeReachableTimeLocked() + } + return s.reachableTime +} + +// recomputeReachableTimeLocked forces a recalculation of ReachableTime using +// the algorithm defined in RFC 4861 section 6.3.2. +// +// This SHOULD automatically be invoked during certain situations, as per +// RFC 4861 section 6.3.4: +// +// If the received Reachable Time value is non-zero, the host SHOULD set its +// BaseReachableTime variable to the received value. If the new value +// differs from the previous value, the host SHOULD re-compute a new random +// ReachableTime value. ReachableTime is computed as a uniformly +// distributed random value between MIN_RANDOM_FACTOR and MAX_RANDOM_FACTOR +// times the BaseReachableTime. Using a random component eliminates the +// possibility that Neighbor Unreachability Detection messages will +// synchronize with each other. +// +// In most cases, the advertised Reachable Time value will be the same in +// consecutive Router Advertisements, and a host's BaseReachableTime rarely +// changes. In such cases, an implementation SHOULD ensure that a new +// random value gets re-computed at least once every few hours. +// +// s.mu MUST be locked for writing. +func (s *NUDState) recomputeReachableTimeLocked() time.Duration { + s.prevBaseReachableTime = s.config.BaseReachableTime + s.prevMinRandomFactor = s.config.MinRandomFactor + s.prevMaxRandomFactor = s.config.MaxRandomFactor + + randomFactor := s.config.MinRandomFactor + s.rng.Float32()*(s.config.MaxRandomFactor-s.config.MinRandomFactor) + + // Check for overflow, given that minRandomFactor and maxRandomFactor are + // guaranteed to be positive numbers. + if float32(math.MaxInt64)/randomFactor < float32(s.config.BaseReachableTime) { + s.reachableTime = time.Duration(math.MaxInt64) + } else if randomFactor == 1 { + // Avoid loss of precision when a large base reachable time is used. + s.reachableTime = s.config.BaseReachableTime + } else { + reachableTime := int64(float32(s.config.BaseReachableTime) * randomFactor) + s.reachableTime = time.Duration(reachableTime) + } + + s.expiration = time.Now().Add(2 * time.Hour) + return s.reachableTime +} diff --git a/pkg/tcpip/stack/nud_test.go b/pkg/tcpip/stack/nud_test.go new file mode 100644 index 000000000..2494ee610 --- /dev/null +++ b/pkg/tcpip/stack/nud_test.go @@ -0,0 +1,795 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack_test + +import ( + "math" + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" +) + +const ( + defaultBaseReachableTime = 30 * time.Second + minimumBaseReachableTime = time.Millisecond + defaultMinRandomFactor = 0.5 + defaultMaxRandomFactor = 1.5 + defaultRetransmitTimer = time.Second + minimumRetransmitTimer = time.Millisecond + defaultDelayFirstProbeTime = 5 * time.Second + defaultMaxMulticastProbes = 3 + defaultMaxUnicastProbes = 3 + defaultMaxAnycastDelayTime = time.Second + defaultMaxReachbilityConfirmations = 3 + defaultUnreachableTime = 5 * time.Second + + defaultFakeRandomNum = 0.5 +) + +// fakeRand is a deterministic random number generator. +type fakeRand struct { + num float32 +} + +var _ stack.Rand = (*fakeRand)(nil) + +func (f *fakeRand) Float32() float32 { + return f.num +} + +// TestSetNUDConfigurationFailsForBadNICID tests to make sure we get an error if +// we attempt to update NUD configurations using an invalid NICID. +func TestSetNUDConfigurationFailsForBadNICID(t *testing.T) { + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The networking + // stack will only allocate neighbor caches if a protocol providing link + // address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + }) + + // No NIC with ID 1 yet. + config := stack.NUDConfigurations{} + if err := s.SetNUDConfigurations(1, config); err != tcpip.ErrUnknownNICID { + t.Fatalf("got s.SetNDPConfigurations(1, %+v) = %v, want = %s", config, err, tcpip.ErrUnknownNICID) + } +} + +// TestNUDConfigurationFailsForNotSupported tests to make sure we get a +// NotSupported error if we attempt to retrieve NUD configurations when the +// stack doesn't support NUD. +// +// The stack will report to not support NUD if a neighbor cache for a given NIC +// is not allocated. The networking stack will only allocate neighbor caches if +// a protocol providing link address resolution is specified (e.g. ARP, IPv6). +func TestNUDConfigurationFailsForNotSupported(t *testing.T) { + const nicID = 1 + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + NUDConfigs: stack.DefaultNUDConfigurations(), + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + if _, err := s.NUDConfigurations(nicID); err != tcpip.ErrNotSupported { + t.Fatalf("got s.NDPConfigurations(%d) = %v, want = %s", nicID, err, tcpip.ErrNotSupported) + } +} + +// TestNUDConfigurationFailsForNotSupported tests to make sure we get a +// NotSupported error if we attempt to set NUD configurations when the stack +// doesn't support NUD. +// +// The stack will report to not support NUD if a neighbor cache for a given NIC +// is not allocated. The networking stack will only allocate neighbor caches if +// a protocol providing link address resolution is specified (e.g. ARP, IPv6). 
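Editor's note: as these tests stress, per-NIC NUD configurations only take effect on NICs that have a neighbor cache, which the stack allocates only when a protocol providing link address resolution (e.g. ARP or IPv6) is registered. A hedged usage sketch of the new getters and setters, roughly following the setup used by these tests; the NIC ID, MTU, and link address are placeholders:

package main

import (
	"log"
	"time"

	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

func main() {
	const nicID = 1
	linkAddr := tcpip.LinkAddress("\x02\x03\x04\x05\x06\x07")

	// ipv6 provides link address resolution (NDP), so the NIC gets a neighbor
	// cache and NUDConfigs takes effect; without such a protocol the calls
	// below return tcpip.ErrNotSupported.
	s := stack.New(stack.Options{
		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
		NUDConfigs:       stack.DefaultNUDConfigurations(),
	})

	ep := channel.New(0, 1280, linkAddr)
	ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired
	if err := s.CreateNIC(nicID, ep); err != nil {
		log.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
	}

	// Read, tweak, and write back the per-NIC NUD configuration.
	c, err := s.NUDConfigurations(nicID)
	if err != nil {
		log.Fatalf("NUDConfigurations(%d) = %s", nicID, err)
	}
	c.BaseReachableTime = time.Minute
	if err := s.SetNUDConfigurations(nicID, c); err != nil {
		log.Fatalf("SetNUDConfigurations(%d, _) = %s", nicID, err)
	}
}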
+func TestSetNUDConfigurationFailsForNotSupported(t *testing.T) { + const nicID = 1 + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + NUDConfigs: stack.DefaultNUDConfigurations(), + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + + config := stack.NUDConfigurations{} + if err := s.SetNUDConfigurations(nicID, config); err != tcpip.ErrNotSupported { + t.Fatalf("got s.SetNDPConfigurations(%d, %+v) = %v, want = %s", nicID, config, err, tcpip.ErrNotSupported) + } +} + +// TestDefaultNUDConfigurationIsValid verifies that calling +// resetInvalidFields() on the result of DefaultNUDConfigurations() does not +// change anything. DefaultNUDConfigurations() should return a valid +// NUDConfigurations. +func TestDefaultNUDConfigurations(t *testing.T) { + const nicID = 1 + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The networking + // stack will only allocate neighbor caches if a protocol providing link + // address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: stack.DefaultNUDConfigurations(), + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + c, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got, want := c, stack.DefaultNUDConfigurations(); got != want { + t.Errorf("got stack.NUDConfigurations(%d) = %+v, want = %+v", nicID, got, want) + } +} + +func TestNUDConfigurationsBaseReachableTime(t *testing.T) { + tests := []struct { + name string + baseReachableTime time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + baseReachableTime: 0, + want: defaultBaseReachableTime, + }, + // Valid cases + { + name: "MoreThanZero", + baseReachableTime: time.Millisecond, + want: time.Millisecond, + }, + { + name: "MoreThanDefaultBaseReachableTime", + baseReachableTime: 2 * defaultBaseReachableTime, + want: 2 * defaultBaseReachableTime, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.BaseReachableTime = test.baseReachableTime + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). 
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.BaseReachableTime; got != test.want { + t.Errorf("got BaseReachableTime = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMinRandomFactor(t *testing.T) { + tests := []struct { + name string + minRandomFactor float32 + want float32 + }{ + // Invalid cases + { + name: "LessThanZero", + minRandomFactor: -1, + want: defaultMinRandomFactor, + }, + { + name: "EqualToZero", + minRandomFactor: 0, + want: defaultMinRandomFactor, + }, + // Valid cases + { + name: "MoreThanZero", + minRandomFactor: 1, + want: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MinRandomFactor = test.minRandomFactor + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MinRandomFactor; got != test.want { + t.Errorf("got MinRandomFactor = %f, want = %f", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMaxRandomFactor(t *testing.T) { + tests := []struct { + name string + minRandomFactor float32 + maxRandomFactor float32 + want float32 + }{ + // Invalid cases + { + name: "LessThanZero", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: -1, + want: defaultMaxRandomFactor, + }, + { + name: "EqualToZero", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: 0, + want: defaultMaxRandomFactor, + }, + { + name: "LessThanMinRandomFactor", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: defaultMinRandomFactor * 0.99, + want: defaultMaxRandomFactor, + }, + { + name: "MoreThanMinRandomFactorWhenMinRandomFactorIsLargerThanMaxRandomFactorDefault", + minRandomFactor: defaultMaxRandomFactor * 2, + maxRandomFactor: defaultMaxRandomFactor, + want: defaultMaxRandomFactor * 6, + }, + // Valid cases + { + name: "EqualToMinRandomFactor", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: defaultMinRandomFactor, + want: defaultMinRandomFactor, + }, + { + name: "MoreThanMinRandomFactor", + minRandomFactor: defaultMinRandomFactor, + maxRandomFactor: defaultMinRandomFactor * 1.1, + want: defaultMinRandomFactor * 1.1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MinRandomFactor = test.minRandomFactor + c.MaxRandomFactor = test.maxRandomFactor + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. 
The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MaxRandomFactor; got != test.want { + t.Errorf("got MaxRandomFactor = %f, want = %f", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsRetransmitTimer(t *testing.T) { + tests := []struct { + name string + retransmitTimer time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + retransmitTimer: 0, + want: defaultRetransmitTimer, + }, + { + name: "LessThanMinimumRetransmitTimer", + retransmitTimer: minimumRetransmitTimer - time.Nanosecond, + want: defaultRetransmitTimer, + }, + // Valid cases + { + name: "EqualToMinimumRetransmitTimer", + retransmitTimer: minimumRetransmitTimer, + want: minimumBaseReachableTime, + }, + { + name: "LargetThanMinimumRetransmitTimer", + retransmitTimer: 2 * minimumBaseReachableTime, + want: 2 * minimumBaseReachableTime, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.RetransmitTimer = test.retransmitTimer + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.RetransmitTimer; got != test.want { + t.Errorf("got RetransmitTimer = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsDelayFirstProbeTime(t *testing.T) { + tests := []struct { + name string + delayFirstProbeTime time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + delayFirstProbeTime: 0, + want: defaultDelayFirstProbeTime, + }, + // Valid cases + { + name: "MoreThanZero", + delayFirstProbeTime: time.Millisecond, + want: time.Millisecond, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.DelayFirstProbeTime = test.delayFirstProbeTime + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). 
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.DelayFirstProbeTime; got != test.want { + t.Errorf("got DelayFirstProbeTime = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMaxMulticastProbes(t *testing.T) { + tests := []struct { + name string + maxMulticastProbes uint32 + want uint32 + }{ + // Invalid cases + { + name: "EqualToZero", + maxMulticastProbes: 0, + want: defaultMaxMulticastProbes, + }, + // Valid cases + { + name: "MoreThanZero", + maxMulticastProbes: 1, + want: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MaxMulticastProbes = test.maxMulticastProbes + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MaxMulticastProbes; got != test.want { + t.Errorf("got MaxMulticastProbes = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsMaxUnicastProbes(t *testing.T) { + tests := []struct { + name string + maxUnicastProbes uint32 + want uint32 + }{ + // Invalid cases + { + name: "EqualToZero", + maxUnicastProbes: 0, + want: defaultMaxUnicastProbes, + }, + // Valid cases + { + name: "MoreThanZero", + maxUnicastProbes: 1, + want: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.MaxUnicastProbes = test.maxUnicastProbes + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). 
+ NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.MaxUnicastProbes; got != test.want { + t.Errorf("got MaxUnicastProbes = %q, want = %q", got, test.want) + } + }) + } +} + +func TestNUDConfigurationsUnreachableTime(t *testing.T) { + tests := []struct { + name string + unreachableTime time.Duration + want time.Duration + }{ + // Invalid cases + { + name: "EqualToZero", + unreachableTime: 0, + want: defaultUnreachableTime, + }, + // Valid cases + { + name: "MoreThanZero", + unreachableTime: time.Millisecond, + want: time.Millisecond, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + const nicID = 1 + + c := stack.DefaultNUDConfigurations() + c.UnreachableTime = test.unreachableTime + + e := channel.New(0, 1280, linkAddr1) + e.LinkEPCapabilities |= stack.CapabilityResolutionRequired + + s := stack.New(stack.Options{ + // A neighbor cache is required to store NUDConfigurations. The + // networking stack will only allocate neighbor caches if a protocol + // providing link address resolution is specified (e.g. ARP or IPv6). + NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, + NUDConfigs: c, + }) + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _) = %s", nicID, err) + } + sc, err := s.NUDConfigurations(nicID) + if err != nil { + t.Fatalf("got stack.NUDConfigurations(%d) = %s", nicID, err) + } + if got := sc.UnreachableTime; got != test.want { + t.Errorf("got UnreachableTime = %q, want = %q", got, test.want) + } + }) + } +} + +// TestNUDStateReachableTime verifies the correctness of the ReachableTime +// computation. 
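Editor's note: the expectations in the following table fall straight out of the section 6.3.2 formula with the random value pinned at defaultFakeRandomNum (0.5). For instance, the MinAndMaxRandomFactorsDifferent case can be reproduced in isolation as in this fragment, which reuses the fakeRand helper and defaultFakeRandomNum constant defined above:

// factor = 1 + 0.5*(2-1) = 1.5, so ReachableTime = 1.5 * 1s = 1.5s.
c := stack.NUDConfigurations{
	BaseReachableTime: time.Second,
	MinRandomFactor:   1,
	MaxRandomFactor:   2,
}
s := stack.NewNUDState(c, &fakeRand{num: defaultFakeRandomNum})
if got := s.ReachableTime(); got != 1500*time.Millisecond {
	// Would indicate a mismatch with the formula above.
}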
+func TestNUDStateReachableTime(t *testing.T) { + tests := []struct { + name string + baseReachableTime time.Duration + minRandomFactor float32 + maxRandomFactor float32 + want time.Duration + }{ + { + name: "AllZeros", + baseReachableTime: 0, + minRandomFactor: 0, + maxRandomFactor: 0, + want: 0, + }, + { + name: "ZeroMaxRandomFactor", + baseReachableTime: time.Second, + minRandomFactor: 0, + maxRandomFactor: 0, + want: 0, + }, + { + name: "ZeroMinRandomFactor", + baseReachableTime: time.Second, + minRandomFactor: 0, + maxRandomFactor: 1, + want: time.Duration(defaultFakeRandomNum * float32(time.Second)), + }, + { + name: "FractionalRandomFactor", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 0.001, + maxRandomFactor: 0.002, + want: time.Duration((0.001 + (0.001 * defaultFakeRandomNum)) * float32(math.MaxInt64)), + }, + { + name: "MinAndMaxRandomFactorsEqual", + baseReachableTime: time.Second, + minRandomFactor: 1, + maxRandomFactor: 1, + want: time.Second, + }, + { + name: "MinAndMaxRandomFactorsDifferent", + baseReachableTime: time.Second, + minRandomFactor: 1, + maxRandomFactor: 2, + want: time.Duration((1.0 + defaultFakeRandomNum) * float32(time.Second)), + }, + { + name: "MaxInt64", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 1, + maxRandomFactor: 1, + want: time.Duration(math.MaxInt64), + }, + { + name: "Overflow", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 1.5, + maxRandomFactor: 1.5, + want: time.Duration(math.MaxInt64), + }, + { + name: "DoubleOverflow", + baseReachableTime: time.Duration(math.MaxInt64), + minRandomFactor: 2.5, + maxRandomFactor: 2.5, + want: time.Duration(math.MaxInt64), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + c := stack.NUDConfigurations{ + BaseReachableTime: test.baseReachableTime, + MinRandomFactor: test.minRandomFactor, + MaxRandomFactor: test.maxRandomFactor, + } + // A fake random number generator is used to ensure deterministic + // results. + rng := fakeRand{ + num: defaultFakeRandomNum, + } + s := stack.NewNUDState(c, &rng) + if got, want := s.ReachableTime(), test.want; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + }) + } +} + +// TestNUDStateRecomputeReachableTime exercises the ReachableTime function +// twice to verify recomputation of reachable time when the min random factor, +// max random factor, or base reachable time changes. 
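Editor's note: the recomputation exercised by this test hinges on the caching done by recomputeReachableTimeLocked: a computed value is reused until the configuration changes or the two-hour expiration passes. A short sketch of that behaviour, again reusing the fakeRand helper from above (not a complete test):

s := stack.NewNUDState(stack.DefaultNUDConfigurations(), &fakeRand{num: defaultFakeRandomNum})
first := s.ReachableTime()
second := s.ReachableTime() // cached: nothing changed since the first call

c := s.Config()
c.BaseReachableTime *= 2
s.SetConfig(c)
third := s.ReachableTime() // recomputed against the new BaseReachableTime
_, _, _ = first, second, third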
+func TestNUDStateRecomputeReachableTime(t *testing.T) { + const defaultBase = time.Second + const defaultMin = 2.0 * defaultMaxRandomFactor + const defaultMax = 3.0 * defaultMaxRandomFactor + + tests := []struct { + name string + baseReachableTime time.Duration + minRandomFactor float32 + maxRandomFactor float32 + want time.Duration + }{ + { + name: "BaseReachableTime", + baseReachableTime: 2 * defaultBase, + minRandomFactor: defaultMin, + maxRandomFactor: defaultMax, + want: time.Duration((defaultMin + (defaultMax-defaultMin)*defaultFakeRandomNum) * float32(2*defaultBase)), + }, + { + name: "MinRandomFactor", + baseReachableTime: defaultBase, + minRandomFactor: defaultMax, + maxRandomFactor: defaultMax, + want: time.Duration(defaultMax * float32(defaultBase)), + }, + { + name: "MaxRandomFactor", + baseReachableTime: defaultBase, + minRandomFactor: defaultMin, + maxRandomFactor: defaultMin, + want: time.Duration(defaultMin * float32(defaultBase)), + }, + { + name: "BothRandomFactor", + baseReachableTime: defaultBase, + minRandomFactor: 2 * defaultMin, + maxRandomFactor: 2 * defaultMax, + want: time.Duration((2*defaultMin + (2*defaultMax-2*defaultMin)*defaultFakeRandomNum) * float32(defaultBase)), + }, + { + name: "BaseReachableTimeAndBothRandomFactors", + baseReachableTime: 2 * defaultBase, + minRandomFactor: 2 * defaultMin, + maxRandomFactor: 2 * defaultMax, + want: time.Duration((2*defaultMin + (2*defaultMax-2*defaultMin)*defaultFakeRandomNum) * float32(2*defaultBase)), + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + c := stack.DefaultNUDConfigurations() + c.BaseReachableTime = defaultBase + c.MinRandomFactor = defaultMin + c.MaxRandomFactor = defaultMax + + // A fake random number generator is used to ensure deterministic + // results. + rng := fakeRand{ + num: defaultFakeRandomNum, + } + s := stack.NewNUDState(c, &rng) + old := s.ReachableTime() + + if got, want := s.ReachableTime(), old; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + + // Check for recomputation when changing the min random factor, the max + // random factor, the base reachability time, or any permutation of those + // three options. + c.BaseReachableTime = test.baseReachableTime + c.MinRandomFactor = test.minRandomFactor + c.MaxRandomFactor = test.maxRandomFactor + s.SetConfig(c) + + if got, want := s.ReachableTime(), test.want; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + + // Verify that ReachableTime isn't recomputed when none of the + // configuration options change. The random factor is changed so that if + // a recompution were to occur, ReachableTime would change. + rng.num = defaultFakeRandomNum / 2.0 + if got, want := s.ReachableTime(), test.want; got != want { + t.Errorf("got ReachableTime = %q, want = %q", got, want) + } + }) + } +} diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go index 926df4d7b..17b8beebb 100644 --- a/pkg/tcpip/stack/packet_buffer.go +++ b/pkg/tcpip/stack/packet_buffer.go @@ -14,50 +14,76 @@ package stack import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" ) +type headerType int + +const ( + linkHeader headerType = iota + networkHeader + transportHeader + numHeaderType +) + +// PacketBufferOptions specifies options for PacketBuffer creation. +type PacketBufferOptions struct { + // ReserveHeaderBytes is the number of bytes to reserve for headers. 
Total + // number of bytes pushed onto the headers must not exceed this value. + ReserveHeaderBytes int + + // Data is the initial unparsed data for the new packet. If set, it will be + // owned by the new packet. + Data buffer.VectorisedView +} + // A PacketBuffer contains all the data of a network packet. // // As a PacketBuffer traverses up the stack, it may be necessary to pass it to -// multiple endpoints. Clone() should be called in such cases so that -// modifications to the Data field do not affect other copies. +// multiple endpoints. +// +// The whole packet is expected to be a series of bytes in the following order: +// LinkHeader, NetworkHeader, TransportHeader, and Data. Any of them can be +// empty. Use of PacketBuffer in any other order is unsupported. +// +// PacketBuffer must be created with NewPacketBuffer. type PacketBuffer struct { + _ sync.NoCopy + // PacketBufferEntry is used to build an intrusive list of // PacketBuffers. PacketBufferEntry - // Data holds the payload of the packet. For inbound packets, it also - // holds the headers, which are consumed as the packet moves up the - // stack. Headers are guaranteed not to be split across views. + // Data holds the payload of the packet. + // + // For inbound packets, Data is initially the whole packet. Then gets moved to + // headers via PacketHeader.Consume, when the packet is being parsed. // - // The bytes backing Data are immutable, but Data itself may be trimmed - // or otherwise modified. + // For outbound packets, Data is the innermost layer, defined by the protocol. + // Headers are pushed in front of it via PacketHeader.Push. + // + // The bytes backing Data are immutable, a.k.a. users shouldn't write to its + // backing storage. Data buffer.VectorisedView - // Header holds the headers of outbound packets. As a packet is passed - // down the stack, each layer adds to Header. Note that forwarded - // packets don't populate Headers on their way out -- their headers and - // payload are never parsed out and remain in Data. - // - // TODO(gvisor.dev/issue/170): Forwarded packets don't currently - // populate Header, but should. This will be doable once early parsing - // (https://github.com/google/gvisor/pull/1995) is supported. - Header buffer.Prependable + // headers stores metadata about each header. + headers [numHeaderType]headerInfo - // These fields are used by both inbound and outbound packets. They - // typically overlap with the Data and Header fields. - // - // The bytes backing these views are immutable. Each field may be nil - // if either it has not been set yet or no such header exists (e.g. - // packets sent via loopback may not have a link header). + // header is the internal storage for outbound packets. Headers will be pushed + // (prepended) on this storage as the packet is being constructed. // - // These fields may be Views into other slices (either Data or Header). - // SR dosen't support this, so deep copies are necessary in some cases. - LinkHeader buffer.View - NetworkHeader buffer.View - TransportHeader buffer.View + // TODO(gvisor.dev/issue/2404): Switch to an implementation that header and + // data are held in the same underlying buffer storage. + header buffer.Prependable + + // NetworkProtocol is only valid when NetworkHeader is set. + // TODO(gvisor.dev/issue/3574): Remove the separately passed protocol + // numbers in registration APIs that take a PacketBuffer. + NetworkProtocolNumber tcpip.NetworkProtocolNumber // Hash is the transport layer hash of this packet. 
A value of zero // indicates no valid hash has been set. @@ -69,20 +95,205 @@ type PacketBuffer struct { // The following fields are only set by the qdisc layer when the packet // is added to a queue. - EgressRoute *Route - GSOOptions *GSO - NetworkProtocolNumber tcpip.NetworkProtocolNumber + EgressRoute *Route + GSOOptions *GSO // NatDone indicates if the packet has been manipulated as per NAT // iptables rule. NatDone bool + + // PktType indicates the SockAddrLink.PacketType of the packet as defined in + // https://www.man7.org/linux/man-pages/man7/packet.7.html. + PktType tcpip.PacketType } -// Clone makes a copy of pk. It clones the Data field, which creates a new -// VectorisedView but does not deep copy the underlying bytes. -// -// Clone also does not deep copy any of its other fields. -func (pk PacketBuffer) Clone() PacketBuffer { - pk.Data = pk.Data.Clone(nil) +// NewPacketBuffer creates a new PacketBuffer with opts. +func NewPacketBuffer(opts PacketBufferOptions) *PacketBuffer { + pk := &PacketBuffer{ + Data: opts.Data, + } + if opts.ReserveHeaderBytes != 0 { + pk.header = buffer.NewPrependable(opts.ReserveHeaderBytes) + } return pk } + +// ReservedHeaderBytes returns the number of bytes initially reserved for +// headers. +func (pk *PacketBuffer) ReservedHeaderBytes() int { + return pk.header.UsedLength() + pk.header.AvailableLength() +} + +// AvailableHeaderBytes returns the number of bytes currently available for +// headers. This is relevant to PacketHeader.Push method only. +func (pk *PacketBuffer) AvailableHeaderBytes() int { + return pk.header.AvailableLength() +} + +// LinkHeader returns the handle to link-layer header. +func (pk *PacketBuffer) LinkHeader() PacketHeader { + return PacketHeader{ + pk: pk, + typ: linkHeader, + } +} + +// NetworkHeader returns the handle to network-layer header. +func (pk *PacketBuffer) NetworkHeader() PacketHeader { + return PacketHeader{ + pk: pk, + typ: networkHeader, + } +} + +// TransportHeader returns the handle to transport-layer header. +func (pk *PacketBuffer) TransportHeader() PacketHeader { + return PacketHeader{ + pk: pk, + typ: transportHeader, + } +} + +// HeaderSize returns the total size of all headers in bytes. +func (pk *PacketBuffer) HeaderSize() int { + // Note for inbound packets (Consume called), headers are not stored in + // pk.header. Thus, calculation of size of each header is needed. + var size int + for i := range pk.headers { + size += len(pk.headers[i].buf) + } + return size +} + +// Size returns the size of packet in bytes. +func (pk *PacketBuffer) Size() int { + return pk.HeaderSize() + pk.Data.Size() +} + +// Views returns the underlying storage of the whole packet. +func (pk *PacketBuffer) Views() []buffer.View { + // Optimization for outbound packets that headers are in pk.header. + useHeader := true + for i := range pk.headers { + if !canUseHeader(&pk.headers[i]) { + useHeader = false + break + } + } + + dataViews := pk.Data.Views() + + var vs []buffer.View + if useHeader { + vs = make([]buffer.View, 0, 1+len(dataViews)) + vs = append(vs, pk.header.View()) + } else { + vs = make([]buffer.View, 0, len(pk.headers)+len(dataViews)) + for i := range pk.headers { + if v := pk.headers[i].buf; len(v) > 0 { + vs = append(vs, v) + } + } + } + return append(vs, dataViews...) +} + +func canUseHeader(h *headerInfo) bool { + // h.offset will be negative if the header was pushed in to prependable + // portion, or doesn't matter when it's empty. 
+ return len(h.buf) == 0 || h.offset < 0 +} + +func (pk *PacketBuffer) push(typ headerType, size int) buffer.View { + h := &pk.headers[typ] + if h.buf != nil { + panic(fmt.Sprintf("push must not be called twice: type %s", typ)) + } + h.buf = buffer.View(pk.header.Prepend(size)) + h.offset = -pk.header.UsedLength() + return h.buf +} + +func (pk *PacketBuffer) consume(typ headerType, size int) (v buffer.View, consumed bool) { + h := &pk.headers[typ] + if h.buf != nil { + panic(fmt.Sprintf("consume must not be called twice: type %s", typ)) + } + v, ok := pk.Data.PullUp(size) + if !ok { + return + } + pk.Data.TrimFront(size) + h.buf = v + return h.buf, true +} + +// Clone makes a shallow copy of pk. +// +// Clone should be called in such cases so that no modifications is done to +// underlying packet payload. +func (pk *PacketBuffer) Clone() *PacketBuffer { + newPk := &PacketBuffer{ + PacketBufferEntry: pk.PacketBufferEntry, + Data: pk.Data.Clone(nil), + headers: pk.headers, + header: pk.header, + Hash: pk.Hash, + Owner: pk.Owner, + EgressRoute: pk.EgressRoute, + GSOOptions: pk.GSOOptions, + NetworkProtocolNumber: pk.NetworkProtocolNumber, + NatDone: pk.NatDone, + } + return newPk +} + +// headerInfo stores metadata about a header in a packet. +type headerInfo struct { + // buf is the memorized slice for both prepended and consumed header. + // When header is prepended, buf serves as memorized value, which is a slice + // of pk.header. When header is consumed, buf is the slice pulled out from + // pk.Data, which is the only place to hold this header. + buf buffer.View + + // offset will be a negative number denoting the offset where this header is + // from the end of pk.header, if it is prepended. Otherwise, zero. + offset int +} + +// PacketHeader is a handle object to a header in the underlying packet. +type PacketHeader struct { + pk *PacketBuffer + typ headerType +} + +// View returns the underlying storage of h. +func (h PacketHeader) View() buffer.View { + return h.pk.headers[h.typ].buf +} + +// Push pushes size bytes in the front of its residing packet, and returns the +// backing storage. Callers may only call one of Push or Consume once on each +// header in the lifetime of the underlying packet. +func (h PacketHeader) Push(size int) buffer.View { + return h.pk.push(h.typ, size) +} + +// Consume moves the first size bytes of the unparsed data portion in the packet +// to h, and returns the backing storage. In the case of data is shorter than +// size, consumed will be false, and the state of h will not be affected. +// Callers may only call one of Push or Consume once on each header in the +// lifetime of the underlying packet. +func (h PacketHeader) Consume(size int) (v buffer.View, consumed bool) { + return h.pk.consume(h.typ, size) +} + +// PayloadSince returns packet payload starting from and including a particular +// header. This method isn't optimized and should be used in test only. +func PayloadSince(h PacketHeader) buffer.View { + var v buffer.View + for _, hinfo := range h.pk.headers[h.typ:] { + v = append(v, hinfo.buf...) + } + return append(v, h.pk.Data.ToView()...) +} diff --git a/pkg/tcpip/stack/packet_buffer_test.go b/pkg/tcpip/stack/packet_buffer_test.go new file mode 100644 index 000000000..c6fa8da5f --- /dev/null +++ b/pkg/tcpip/stack/packet_buffer_test.go @@ -0,0 +1,397 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at // +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import ( + "bytes" + "testing" + + "gvisor.dev/gvisor/pkg/tcpip/buffer" +) + +func TestPacketHeaderPush(t *testing.T) { + for _, test := range []struct { + name string + reserved int + link []byte + network []byte + transport []byte + data []byte + }{ + { + name: "construct empty packet", + }, + { + name: "construct link header only packet", + reserved: 60, + link: makeView(10), + }, + { + name: "construct link and network header only packet", + reserved: 60, + link: makeView(10), + network: makeView(20), + }, + { + name: "construct header only packet", + reserved: 60, + link: makeView(10), + network: makeView(20), + transport: makeView(30), + }, + { + name: "construct data only packet", + data: makeView(40), + }, + { + name: "construct L3 packet", + reserved: 60, + network: makeView(20), + transport: makeView(30), + data: makeView(40), + }, + { + name: "construct L2 packet", + reserved: 60, + link: makeView(10), + network: makeView(20), + transport: makeView(30), + data: makeView(40), + }, + } { + t.Run(test.name, func(t *testing.T) { + pk := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: test.reserved, + // Make a copy of data to make sure our truth data won't be taint by + // PacketBuffer. + Data: buffer.NewViewFromBytes(test.data).ToVectorisedView(), + }) + + allHdrSize := len(test.link) + len(test.network) + len(test.transport) + + // Check the initial values for packet. + checkInitialPacketBuffer(t, pk, PacketBufferOptions{ + ReserveHeaderBytes: test.reserved, + Data: buffer.View(test.data).ToVectorisedView(), + }) + + // Push headers. + if v := test.transport; len(v) > 0 { + copy(pk.TransportHeader().Push(len(v)), v) + } + if v := test.network; len(v) > 0 { + copy(pk.NetworkHeader().Push(len(v)), v) + } + if v := test.link; len(v) > 0 { + copy(pk.LinkHeader().Push(len(v)), v) + } + + // Check the after values for packet. + if got, want := pk.ReservedHeaderBytes(), test.reserved; got != want { + t.Errorf("After pk.ReservedHeaderBytes() = %d, want %d", got, want) + } + if got, want := pk.AvailableHeaderBytes(), test.reserved-allHdrSize; got != want { + t.Errorf("After pk.AvailableHeaderBytes() = %d, want %d", got, want) + } + if got, want := pk.HeaderSize(), allHdrSize; got != want { + t.Errorf("After pk.HeaderSize() = %d, want %d", got, want) + } + if got, want := pk.Size(), allHdrSize+len(test.data); got != want { + t.Errorf("After pk.Size() = %d, want %d", got, want) + } + checkViewEqual(t, "After pk.Data.Views()", concatViews(pk.Data.Views()...), test.data) + checkViewEqual(t, "After pk.Views()", concatViews(pk.Views()...), + concatViews(test.link, test.network, test.transport, test.data)) + // Check the after values for each header. + checkPacketHeader(t, "After pk.LinkHeader", pk.LinkHeader(), test.link) + checkPacketHeader(t, "After pk.NetworkHeader", pk.NetworkHeader(), test.network) + checkPacketHeader(t, "After pk.TransportHeader", pk.TransportHeader(), test.transport) + // Check the after values for PayloadSince. 
+ checkViewEqual(t, "After PayloadSince(LinkHeader)", + PayloadSince(pk.LinkHeader()), + concatViews(test.link, test.network, test.transport, test.data)) + checkViewEqual(t, "After PayloadSince(NetworkHeader)", + PayloadSince(pk.NetworkHeader()), + concatViews(test.network, test.transport, test.data)) + checkViewEqual(t, "After PayloadSince(TransportHeader)", + PayloadSince(pk.TransportHeader()), + concatViews(test.transport, test.data)) + }) + } +} + +func TestPacketHeaderConsume(t *testing.T) { + for _, test := range []struct { + name string + data []byte + link int + network int + transport int + }{ + { + name: "parse L2 packet", + data: concatViews(makeView(10), makeView(20), makeView(30), makeView(40)), + link: 10, + network: 20, + transport: 30, + }, + { + name: "parse L3 packet", + data: concatViews(makeView(20), makeView(30), makeView(40)), + network: 20, + transport: 30, + }, + } { + t.Run(test.name, func(t *testing.T) { + pk := NewPacketBuffer(PacketBufferOptions{ + // Make a copy of data to make sure our truth data won't be taint by + // PacketBuffer. + Data: buffer.NewViewFromBytes(test.data).ToVectorisedView(), + }) + + // Check the initial values for packet. + checkInitialPacketBuffer(t, pk, PacketBufferOptions{ + Data: buffer.View(test.data).ToVectorisedView(), + }) + + // Consume headers. + if size := test.link; size > 0 { + if _, ok := pk.LinkHeader().Consume(size); !ok { + t.Fatalf("pk.LinkHeader().Consume() = false, want true") + } + } + if size := test.network; size > 0 { + if _, ok := pk.NetworkHeader().Consume(size); !ok { + t.Fatalf("pk.NetworkHeader().Consume() = false, want true") + } + } + if size := test.transport; size > 0 { + if _, ok := pk.TransportHeader().Consume(size); !ok { + t.Fatalf("pk.TransportHeader().Consume() = false, want true") + } + } + + allHdrSize := test.link + test.network + test.transport + + // Check the after values for packet. + if got, want := pk.ReservedHeaderBytes(), 0; got != want { + t.Errorf("After pk.ReservedHeaderBytes() = %d, want %d", got, want) + } + if got, want := pk.AvailableHeaderBytes(), 0; got != want { + t.Errorf("After pk.AvailableHeaderBytes() = %d, want %d", got, want) + } + if got, want := pk.HeaderSize(), allHdrSize; got != want { + t.Errorf("After pk.HeaderSize() = %d, want %d", got, want) + } + if got, want := pk.Size(), len(test.data); got != want { + t.Errorf("After pk.Size() = %d, want %d", got, want) + } + // After state of pk. + var ( + link = test.data[:test.link] + network = test.data[test.link:][:test.network] + transport = test.data[test.link+test.network:][:test.transport] + payload = test.data[allHdrSize:] + ) + checkViewEqual(t, "After pk.Data.Views()", concatViews(pk.Data.Views()...), payload) + checkViewEqual(t, "After pk.Views()", concatViews(pk.Views()...), test.data) + // Check the after values for each header. + checkPacketHeader(t, "After pk.LinkHeader", pk.LinkHeader(), link) + checkPacketHeader(t, "After pk.NetworkHeader", pk.NetworkHeader(), network) + checkPacketHeader(t, "After pk.TransportHeader", pk.TransportHeader(), transport) + // Check the after values for PayloadSince. 
+ checkViewEqual(t, "After PayloadSince(LinkHeader)", + PayloadSince(pk.LinkHeader()), + concatViews(link, network, transport, payload)) + checkViewEqual(t, "After PayloadSince(NetworkHeader)", + PayloadSince(pk.NetworkHeader()), + concatViews(network, transport, payload)) + checkViewEqual(t, "After PayloadSince(TransportHeader)", + PayloadSince(pk.TransportHeader()), + concatViews(transport, payload)) + }) + } +} + +func TestPacketHeaderConsumeDataTooShort(t *testing.T) { + data := makeView(10) + + pk := NewPacketBuffer(PacketBufferOptions{ + // Make a copy of data to make sure our truth data won't be taint by + // PacketBuffer. + Data: buffer.NewViewFromBytes(data).ToVectorisedView(), + }) + + // Consume should fail if pkt.Data is too short. + if _, ok := pk.LinkHeader().Consume(11); ok { + t.Fatalf("pk.LinkHeader().Consume() = _, true; want _, false") + } + if _, ok := pk.NetworkHeader().Consume(11); ok { + t.Fatalf("pk.NetworkHeader().Consume() = _, true; want _, false") + } + if _, ok := pk.TransportHeader().Consume(11); ok { + t.Fatalf("pk.TransportHeader().Consume() = _, true; want _, false") + } + + // Check packet should look the same as initial packet. + checkInitialPacketBuffer(t, pk, PacketBufferOptions{ + Data: buffer.View(data).ToVectorisedView(), + }) +} + +func TestPacketHeaderPushCalledAtMostOnce(t *testing.T) { + const headerSize = 10 + + pk := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: headerSize * int(numHeaderType), + }) + + for _, h := range []PacketHeader{ + pk.TransportHeader(), + pk.NetworkHeader(), + pk.LinkHeader(), + } { + t.Run("PushedTwice/"+h.typ.String(), func(t *testing.T) { + h.Push(headerSize) + + defer func() { recover() }() + h.Push(headerSize) + t.Fatal("Second push should have panicked") + }) + } +} + +func TestPacketHeaderConsumeCalledAtMostOnce(t *testing.T) { + const headerSize = 10 + + pk := NewPacketBuffer(PacketBufferOptions{ + Data: makeView(headerSize * int(numHeaderType)).ToVectorisedView(), + }) + + for _, h := range []PacketHeader{ + pk.LinkHeader(), + pk.NetworkHeader(), + pk.TransportHeader(), + } { + t.Run("ConsumedTwice/"+h.typ.String(), func(t *testing.T) { + if _, ok := h.Consume(headerSize); !ok { + t.Fatal("First consume should succeed") + } + + defer func() { recover() }() + h.Consume(headerSize) + t.Fatal("Second consume should have panicked") + }) + } +} + +func TestPacketHeaderPushThenConsumePanics(t *testing.T) { + const headerSize = 10 + + pk := NewPacketBuffer(PacketBufferOptions{ + ReserveHeaderBytes: headerSize * int(numHeaderType), + }) + + for _, h := range []PacketHeader{ + pk.TransportHeader(), + pk.NetworkHeader(), + pk.LinkHeader(), + } { + t.Run(h.typ.String(), func(t *testing.T) { + h.Push(headerSize) + + defer func() { recover() }() + h.Consume(headerSize) + t.Fatal("Consume should have panicked") + }) + } +} + +func TestPacketHeaderConsumeThenPushPanics(t *testing.T) { + const headerSize = 10 + + pk := NewPacketBuffer(PacketBufferOptions{ + Data: makeView(headerSize * int(numHeaderType)).ToVectorisedView(), + }) + + for _, h := range []PacketHeader{ + pk.LinkHeader(), + pk.NetworkHeader(), + pk.TransportHeader(), + } { + t.Run(h.typ.String(), func(t *testing.T) { + h.Consume(headerSize) + + defer func() { recover() }() + h.Push(headerSize) + t.Fatal("Push should have panicked") + }) + } +} + +func checkInitialPacketBuffer(t *testing.T, pk *PacketBuffer, opts PacketBufferOptions) { + t.Helper() + reserved := opts.ReserveHeaderBytes + if got, want := pk.ReservedHeaderBytes(), reserved; got != want { 
+ t.Errorf("Initial pk.ReservedHeaderBytes() = %d, want %d", got, want) + } + if got, want := pk.AvailableHeaderBytes(), reserved; got != want { + t.Errorf("Initial pk.AvailableHeaderBytes() = %d, want %d", got, want) + } + if got, want := pk.HeaderSize(), 0; got != want { + t.Errorf("Initial pk.HeaderSize() = %d, want %d", got, want) + } + data := opts.Data.ToView() + if got, want := pk.Size(), len(data); got != want { + t.Errorf("Initial pk.Size() = %d, want %d", got, want) + } + checkViewEqual(t, "Initial pk.Data.Views()", concatViews(pk.Data.Views()...), data) + checkViewEqual(t, "Initial pk.Views()", concatViews(pk.Views()...), data) + // Check the initial values for each header. + checkPacketHeader(t, "Initial pk.LinkHeader", pk.LinkHeader(), nil) + checkPacketHeader(t, "Initial pk.NetworkHeader", pk.NetworkHeader(), nil) + checkPacketHeader(t, "Initial pk.TransportHeader", pk.TransportHeader(), nil) + // Check the initial valies for PayloadSince. + checkViewEqual(t, "Initial PayloadSince(LinkHeader)", + PayloadSince(pk.LinkHeader()), data) + checkViewEqual(t, "Initial PayloadSince(NetworkHeader)", + PayloadSince(pk.NetworkHeader()), data) + checkViewEqual(t, "Initial PayloadSince(TransportHeader)", + PayloadSince(pk.TransportHeader()), data) +} + +func checkPacketHeader(t *testing.T, name string, h PacketHeader, want []byte) { + t.Helper() + checkViewEqual(t, name+".View()", h.View(), want) +} + +func checkViewEqual(t *testing.T, what string, got, want buffer.View) { + t.Helper() + if !bytes.Equal(got, want) { + t.Errorf("%s = %x, want %x", what, got, want) + } +} + +func makeView(size int) buffer.View { + b := byte(size) + return bytes.Repeat([]byte{b}, size) +} + +func concatViews(views ...buffer.View) buffer.View { + var all buffer.View + for _, v := range views { + all = append(all, v...) + } + return all +} diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go index b331427c6..4570e8969 100644 --- a/pkg/tcpip/stack/registration.go +++ b/pkg/tcpip/stack/registration.go @@ -18,6 +18,7 @@ import ( "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/waiter" ) @@ -51,8 +52,11 @@ type TransportEndpointID struct { type ControlType int // The following are the allowed values for ControlType values. +// TODO(http://gvisor.dev/issue/3210): Support time exceeded messages. const ( - ControlPacketTooBig ControlType = iota + ControlNetworkUnreachable ControlType = iota + ControlNoRoute + ControlPacketTooBig ControlPortUnreachable ControlUnknown ) @@ -67,12 +71,12 @@ type TransportEndpoint interface { // this transport endpoint. It sets pkt.TransportHeader. // // HandlePacket takes ownership of pkt. - HandlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer) + HandlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) // HandleControlPacket is called by the stack when new control (e.g. // ICMP) packets arrive to this transport endpoint. // HandleControlPacket takes ownership of pkt. - HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer) + HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer) // Abort initiates an expedited endpoint teardown. It puts the endpoint // in a closed state and frees all resources associated with it. This @@ -100,7 +104,7 @@ type RawTransportEndpoint interface { // layer up. // // HandlePacket takes ownership of pkt. 
- HandlePacket(r *Route, pkt PacketBuffer) + HandlePacket(r *Route, pkt *PacketBuffer) } // PacketEndpoint is the interface that needs to be implemented by packet @@ -118,7 +122,7 @@ type PacketEndpoint interface { // should construct its own ethernet header for applications. // // HandlePacket takes ownership of pkt. - HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt PacketBuffer) + HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // TransportProtocol is the interface that needs to be implemented by transport @@ -150,7 +154,7 @@ type TransportProtocol interface { // stats purposes only). // // HandleUnknownDestinationPacket takes ownership of pkt. - HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt PacketBuffer) bool + HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool // SetOption allows enabling/disabling protocol specific features. // SetOption returns an error if the option is not supported or the @@ -168,6 +172,11 @@ type TransportProtocol interface { // Wait waits for any worker goroutines owned by the protocol to stop. Wait() + + // Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does + // neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() < + // MinimumPacketSize() + Parse(pkt *PacketBuffer) (ok bool) } // TransportDispatcher contains the methods used by the network stack to deliver @@ -180,7 +189,7 @@ type TransportDispatcher interface { // pkt.NetworkHeader must be set before calling DeliverTransportPacket. // // DeliverTransportPacket takes ownership of pkt. - DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) + DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) // DeliverTransportControlPacket delivers control packets to the // appropriate transport protocol endpoint. @@ -189,7 +198,7 @@ type TransportDispatcher interface { // DeliverTransportControlPacket. // // DeliverTransportControlPacket takes ownership of pkt. - DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer) + DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer) } // PacketLooping specifies where an outbound packet should be sent. @@ -240,17 +249,18 @@ type NetworkEndpoint interface { MaxHeaderLength() uint16 // WritePacket writes a packet to the given destination address and - // protocol. It sets pkt.NetworkHeader. pkt.TransportHeader must have + // protocol. It takes ownership of pkt. pkt.TransportHeader must have // already been set. - WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error + WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error // WritePackets writes packets to the given destination address and - // protocol. pkts must not be zero length. + // protocol. pkts must not be zero length. It takes ownership of pkts and + // underlying packets. WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) // WriteHeaderIncludedPacket writes a packet that includes a network - // header to the given destination address. 
- WriteHeaderIncludedPacket(r *Route, pkt PacketBuffer) *tcpip.Error + // header to the given destination address. It takes ownership of pkt. + WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error // ID returns the network protocol endpoint ID. ID() *NetworkEndpointID @@ -265,7 +275,7 @@ type NetworkEndpoint interface { // this network endpoint. It sets pkt.NetworkHeader. // // HandlePacket takes ownership of pkt. - HandlePacket(r *Route, pkt PacketBuffer) + HandlePacket(r *Route, pkt *PacketBuffer) // Close is called when the endpoint is reomved from a stack. Close() @@ -312,11 +322,18 @@ type NetworkProtocol interface { // Wait waits for any worker goroutines owned by the protocol to stop. Wait() + + // Parse sets pkt.NetworkHeader and trims pkt.Data appropriately. It + // returns: + // - The encapsulated protocol, if present. + // - Whether there is an encapsulated transport protocol payload (e.g. ARP + // does not encapsulate anything). + // - Whether pkt.Data was large enough to parse and set pkt.NetworkHeader. + Parse(pkt *PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) } // NetworkDispatcher contains the methods used by the network stack to deliver -// packets to the appropriate network endpoint after it has been handled by -// the data link layer. +// inbound/outbound packets to the appropriate network/packet(if any) endpoints. type NetworkDispatcher interface { // DeliverNetworkPacket finds the appropriate network protocol endpoint // and hands the packet over for further processing. @@ -326,7 +343,17 @@ type NetworkDispatcher interface { // packets sent via loopback), and won't have the field set. // // DeliverNetworkPacket takes ownership of pkt. - DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) + DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) + + // DeliverOutboundPacket is called by link layer when a packet is being + // sent out. + // + // pkt.LinkHeader may or may not be set before calling + // DeliverOutboundPacket. Some packets do not have link headers (e.g. + // packets sent via loopback), and won't have the field set. + // + // DeliverOutboundPacket takes ownership of pkt. + DeliverOutboundPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // LinkEndpointCapabilities is the type associated with the capabilities @@ -382,17 +409,17 @@ type LinkEndpoint interface { LinkAddress() tcpip.LinkAddress // WritePacket writes a packet with the given protocol through the - // given route. It sets pkt.LinkHeader if a link layer header exists. - // pkt.NetworkHeader and pkt.TransportHeader must have already been - // set. + // given route. It takes ownership of pkt. pkt.NetworkHeader and + // pkt.TransportHeader must have already been set. // // To participate in transparent bridging, a LinkEndpoint implementation // should call eth.Encode with header.EthernetFields.SrcAddr set to // r.LocalLinkAddress if it is provided. - WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) *tcpip.Error + WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error // WritePackets writes packets with the given protocol through the - // given route. pkts must not be zero length. + // given route. pkts must not be zero length. It takes ownership of pkts and + // underlying packets. 
// // Right now, WritePackets is used only when the software segmentation // offload is enabled. If it will be used for something else, it may @@ -400,7 +427,7 @@ type LinkEndpoint interface { WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) // WriteRawPacket writes a packet directly to the link. The packet - // should already have an ethernet header. + // should already have an ethernet header. It takes ownership of vv. WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error // Attach attaches the data link layer endpoint to the network-layer @@ -422,6 +449,15 @@ type LinkEndpoint interface { // Wait will not block if the endpoint hasn't started any goroutines // yet, even if it might later. Wait() + + // ARPHardwareType returns the ARPHRD_TYPE of the link endpoint. + // + // See: + // https://github.com/torvalds/linux/blob/aa0c9086b40c17a7ad94425b3b70dd1fdd7497bf/include/uapi/linux/if_arp.h#L30 + ARPHardwareType() header.ARPHardwareType + + // AddHeader adds a link layer header to pkt if required. + AddHeader(local, remote tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) } // InjectableLinkEndpoint is a LinkEndpoint where inbound packets are @@ -430,7 +466,7 @@ type InjectableLinkEndpoint interface { LinkEndpoint // InjectInbound injects an inbound packet. - InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt PacketBuffer) + InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) // InjectOutbound writes a fully formed outbound packet directly to the // link. @@ -442,12 +478,13 @@ type InjectableLinkEndpoint interface { // A LinkAddressResolver is an extension to a NetworkProtocol that // can resolve link addresses. type LinkAddressResolver interface { - // LinkAddressRequest sends a request for the LinkAddress of addr. - // The request is sent on linkEP with localAddr as the source. + // LinkAddressRequest sends a request for the LinkAddress of addr. Broadcasts + // the request on the local network if remoteLinkAddr is the zero value. The + // request is sent on linkEP with localAddr as the source. // // A valid response will cause the discovery protocol's network // endpoint to call AddLinkAddress. - LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error + LinkAddressRequest(addr, localAddr tcpip.Address, remoteLinkAddr tcpip.LinkAddress, linkEP LinkEndpoint) *tcpip.Error // ResolveStaticAddress attempts to resolve address without sending // requests. It either resolves the name immediately or returns the diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go index 150297ab9..e267bebb0 100644 --- a/pkg/tcpip/stack/route.go +++ b/pkg/tcpip/stack/route.go @@ -48,6 +48,10 @@ type Route struct { // Loop controls where WritePacket should send packets. Loop PacketLooping + + // directedBroadcast indicates whether this route is sending a directed + // broadcast packet. + directedBroadcast bool } // makeRoute initializes a new route. It takes ownership of the provided @@ -106,6 +110,12 @@ func (r *Route) GSOMaxSize() uint32 { return 0 } +// ResolveWith immediately resolves a route with the specified remote link +// address. +func (r *Route) ResolveWith(addr tcpip.LinkAddress) { + r.RemoteLinkAddress = addr +} + // Resolve attempts to resolve the link address if necessary. Returns ErrWouldBlock in // case address resolution requires blocking, e.g. wait for ARP reply. Waker is // notified when address resolution is complete (success or not). 
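
ResolveWith, added above, gives callers a way to short-circuit neighbor resolution when the remote link address is already known, while Resolve remains the asynchronous path. A minimal sketch, assuming imports of pkg/tcpip and pkg/tcpip/stack; the helper name and the known address are hypothetical stand-ins, not part of this change:

	func resolveRemote(r *stack.Route, known tcpip.LinkAddress) {
		if known != "" {
			// Link address learned out of band (e.g. from the frame being
			// replied to): skip the ARP/NDP round trip entirely.
			r.ResolveWith(known)
			return
		}
		if r.IsResolutionRequired() {
			// Asynchronous resolution: the returned channel is closed once
			// resolution completes (success or not).
			if ch, err := r.Resolve(nil /* waker */); err == tcpip.ErrWouldBlock {
				<-ch
			}
		}
	}
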
@@ -113,6 +123,8 @@ func (r *Route) GSOMaxSize() uint32 { // If address resolution is required, ErrNoLinkAddress and a notification channel is // returned for the top level caller to block. Channel is closed once address resolution // is complete (success or not). +// +// The NIC r uses must not be locked. func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) { if !r.IsResolutionRequired() { // Nothing to do if there is no cache (which does the resolution on cache miss) or @@ -148,22 +160,27 @@ func (r *Route) RemoveWaker(waker *sleep.Waker) { // IsResolutionRequired returns true if Resolve() must be called to resolve // the link address before the this route can be written to. +// +// The NIC r uses must not be locked. func (r *Route) IsResolutionRequired() bool { return r.ref.isValidForOutgoing() && r.ref.linkCache != nil && r.RemoteLinkAddress == "" } // WritePacket writes the packet through the given route. -func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt PacketBuffer) *tcpip.Error { +func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error { if !r.ref.isValidForOutgoing() { return tcpip.ErrInvalidEndpointState } + // WritePacket takes ownership of pkt, calculate numBytes first. + numBytes := pkt.Size() + err := r.ref.ep.WritePacket(r, gso, params, pkt) if err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() } else { r.ref.nic.stats.Tx.Packets.Increment() - r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Header.UsedLength() + pkt.Data.Size())) + r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes)) } return err } @@ -175,16 +192,18 @@ func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHead return 0, tcpip.ErrInvalidEndpointState } + // WritePackets takes ownership of pkt, calculate length first. + numPkts := pkts.Len() + n, err := r.ref.ep.WritePackets(r, gso, pkts, params) if err != nil { - r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(pkts.Len() - n)) + r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(numPkts - n)) } r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n)) writtenBytes := 0 for i, pb := 0, pkts.Front(); i < n && pb != nil; i, pb = i+1, pb.Next() { - writtenBytes += pb.Header.UsedLength() - writtenBytes += pb.Data.Size() + writtenBytes += pb.Size() } r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes)) @@ -193,17 +212,20 @@ func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHead // WriteHeaderIncludedPacket writes a packet already containing a network // header through the given route. -func (r *Route) WriteHeaderIncludedPacket(pkt PacketBuffer) *tcpip.Error { +func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) *tcpip.Error { if !r.ref.isValidForOutgoing() { return tcpip.ErrInvalidEndpointState } + // WriteHeaderIncludedPacket takes ownership of pkt, calculate numBytes first. + numBytes := pkt.Data.Size() + if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil { r.Stats().IP.OutgoingPacketErrors.Increment() return err } r.ref.nic.stats.Tx.Packets.Increment() - r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(pkt.Data.Size())) + r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes)) return nil } @@ -262,6 +284,26 @@ func (r *Route) Stack() *Stack { return r.ref.stack() } +// IsOutboundBroadcast returns true if the route is for an outbound broadcast +// packet. +func (r *Route) IsOutboundBroadcast() bool { + // Only IPv4 has a notion of broadcast. 
+ return r.directedBroadcast || r.RemoteAddress == header.IPv4Broadcast +} + +// IsInboundBroadcast returns true if the route is for an inbound broadcast +// packet. +func (r *Route) IsInboundBroadcast() bool { + // Only IPv4 has a notion of broadcast. + if r.LocalAddress == header.IPv4Broadcast { + return true + } + + addr := r.ref.addrWithPrefix() + subnet := addr.Subnet() + return subnet.IsBroadcast(r.LocalAddress) +} + // ReverseRoute returns new route with given source and destination address. func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route { return Route{ diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go index 0ab4c3e19..5b19c5d59 100644 --- a/pkg/tcpip/stack/stack.go +++ b/pkg/tcpip/stack/stack.go @@ -52,7 +52,7 @@ const ( type transportProtocolState struct { proto TransportProtocol - defaultHandler func(r *Route, id TransportEndpointID, pkt PacketBuffer) bool + defaultHandler func(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool } // TCPProbeFunc is the expected function type for a TCP probe function to be @@ -73,6 +73,16 @@ type TCPCubicState struct { WEst float64 } +// TCPRACKState is used to hold a copy of the internal RACK state when the +// TCPProbeFunc is invoked. +type TCPRACKState struct { + XmitTime time.Time + EndSequence seqnum.Value + FACK seqnum.Value + RTT time.Duration + Reord bool +} + // TCPEndpointID is the unique 4 tuple that identifies a given endpoint. type TCPEndpointID struct { // LocalPort is the local port associated with the endpoint. @@ -212,6 +222,9 @@ type TCPSenderState struct { // Cubic holds the state related to CUBIC congestion control. Cubic TCPCubicState + + // RACKState holds the state related to RACK loss detection algorithm. + RACKState TCPRACKState } // TCPSACKInfo holds TCP SACK related information for a given TCP endpoint. @@ -424,12 +437,9 @@ type Stack struct { // handleLocal allows non-loopback interfaces to loop packets. handleLocal bool - // tablesMu protects iptables. - tablesMu sync.RWMutex - - // tables are the iptables packet filtering and manipulation rules. The are - // protected by tablesMu.` - tables IPTables + // tables are the iptables packet filtering and manipulation rules. + // TODO(gvisor.dev/issue/170): S/R this field. + tables *IPTables // resumableEndpoints is a list of endpoints that need to be resumed if the // stack is being restored. @@ -448,6 +458,9 @@ type Stack struct { // ndpConfigs is the default NDP configurations used by interfaces. ndpConfigs NDPConfigurations + // nudConfigs is the default NUD configurations used by interfaces. + nudConfigs NUDConfigurations + // autoGenIPv6LinkLocal determines whether or not the stack will attempt // to auto-generate an IPv6 link-local address for newly enabled non-loopback // NICs. See the AutoGenIPv6LinkLocal field of Options for more details. @@ -457,6 +470,10 @@ type Stack struct { // integrator NDP related events. ndpDisp NDPDispatcher + // nudDisp is the NUD event dispatcher that is used to send the netstack + // integrator NUD related events. + nudDisp NUDDispatcher + // uniqueIDGenerator is a generator of unique identifiers. uniqueIDGenerator UniqueID @@ -475,6 +492,14 @@ type Stack struct { // randomGenerator is an injectable pseudo random generator that can be // used when a random number is required. randomGenerator *mathrand.Rand + + // sendBufferSize holds the min/default/max send buffer sizes for + // endpoints other than TCP. 
+ sendBufferSize SendBufferSizeOption + + // receiveBufferSize holds the min/default/max receive buffer sizes for + // endpoints other than TCP. + receiveBufferSize ReceiveBufferSizeOption } // UniqueID is an abstract generator of unique identifiers. @@ -513,6 +538,9 @@ type Options struct { // before assigning an address to a NIC. NDPConfigs NDPConfigurations + // NUDConfigs is the default NUD configurations used by interfaces. + NUDConfigs NUDConfigurations + // AutoGenIPv6LinkLocal determines whether or not the stack will attempt to // auto-generate an IPv6 link-local address for newly enabled non-loopback // NICs. @@ -531,6 +559,10 @@ type Options struct { // receive NDP related events. NDPDisp NDPDispatcher + // NUDDisp is the NUD event dispatcher that an integrator can provide to + // receive NUD related events. + NUDDisp NUDDispatcher + // RawFactory produces raw endpoints. Raw endpoints are enabled only if // this is non-nil. RawFactory RawFactory @@ -665,6 +697,8 @@ func New(opts Options) *Stack { // Make sure opts.NDPConfigs contains valid values only. opts.NDPConfigs.validate() + opts.NUDConfigs.resetInvalidFields() + s := &Stack{ transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState), networkProtocols: make(map[tcpip.NetworkProtocolNumber]NetworkProtocol), @@ -676,16 +710,29 @@ func New(opts Options) *Stack { clock: clock, stats: opts.Stats.FillIn(), handleLocal: opts.HandleLocal, + tables: DefaultTables(), icmpRateLimiter: NewICMPRateLimiter(), seed: generateRandUint32(), ndpConfigs: opts.NDPConfigs, + nudConfigs: opts.NUDConfigs, autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal, uniqueIDGenerator: opts.UniqueID, ndpDisp: opts.NDPDisp, + nudDisp: opts.NUDDisp, opaqueIIDOpts: opts.OpaqueIIDOpts, tempIIDSeed: opts.TempIIDSeed, forwarder: newForwardQueue(), randomGenerator: mathrand.New(randSrc), + sendBufferSize: SendBufferSizeOption{ + Min: MinBufferSize, + Default: DefaultBufferSize, + Max: DefaultMaxBufferSize, + }, + receiveBufferSize: ReceiveBufferSizeOption{ + Min: MinBufferSize, + Default: DefaultBufferSize, + Max: DefaultMaxBufferSize, + }, } // Add specified network protocols. @@ -712,6 +759,11 @@ func New(opts Options) *Stack { return s } +// newJob returns a tcpip.Job using the Stack clock. +func (s *Stack) newJob(l sync.Locker, f func()) *tcpip.Job { + return tcpip.NewJob(s.clock, l, f) +} + // UniqueID returns a unique identifier. func (s *Stack) UniqueID() uint64 { return s.uniqueIDGenerator.UniqueID() @@ -778,16 +830,17 @@ func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, // // It must be called only during initialization of the stack. Changing it as the // stack is operating is not supported. -func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, PacketBuffer) bool) { +func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, *PacketBuffer) bool) { state := s.transportProtocols[p] if state != nil { state.defaultHandler = h } } -// NowNanoseconds implements tcpip.Clock.NowNanoseconds. -func (s *Stack) NowNanoseconds() int64 { - return s.clock.NowNanoseconds() +// Clock returns the Stack's clock for retrieving the current time and +// scheduling work. +func (s *Stack) Clock() tcpip.Clock { + return s.clock } // Stats returns a mutable copy of the current stats. 
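
The send/receive buffer defaults installed in New above are exposed through the stack-wide Option/SetOption accessors added in stack_options.go later in this change. A minimal sketch of a caller tuning them; tuneDefaultBuffers is a hypothetical helper and the 64 KiB value is only an example:

	func tuneDefaultBuffers(s *stack.Stack) *tcpip.Error {
		var snd stack.SendBufferSizeOption
		if err := s.Option(&snd); err != nil {
			return err // only unknown option types fail here
		}
		// snd starts as {Min: MinBufferSize, Default: DefaultBufferSize, Max: DefaultMaxBufferSize}.
		snd.Default = 64 << 10
		// SetOption rejects a Default outside [Min, Max] or a Min below
		// MinBufferSize with tcpip.ErrInvalidOptionValue.
		return s.SetOption(snd)
	}
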
@@ -1020,6 +1073,13 @@ func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error { s.mu.Lock() defer s.mu.Unlock() + return s.removeNICLocked(id) +} + +// removeNICLocked removes NIC and all related routes from the network stack. +// +// s.mu must be locked. +func (s *Stack) removeNICLocked(id tcpip.NICID) *tcpip.Error { nic, ok := s.nics[id] if !ok { return tcpip.ErrUnknownNICID @@ -1029,14 +1089,14 @@ func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error { // Remove routes in-place. n tracks the number of routes written. n := 0 for i, r := range s.routeTable { + s.routeTable[i] = tcpip.Route{} if r.NIC != id { // Keep this route. - if i > n { - s.routeTable[n] = r - } + s.routeTable[n] = r n++ } } + s.routeTable = s.routeTable[:n] return nic.remove() @@ -1072,6 +1132,11 @@ type NICInfo struct { // Context is user-supplied data optionally supplied in CreateNICWithOptions. // See type NICOptions for more details. Context NICContext + + // ARPHardwareType holds the ARP Hardware type of the NIC. This is the + // value sent in haType field of an ARP Request sent by this NIC and the + // value expected in the haType field of an ARP response. + ARPHardwareType header.ARPHardwareType } // HasNIC returns true if the NICID is defined in the stack. @@ -1103,6 +1168,7 @@ func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo { MTU: nic.linkEP.MTU(), Stats: nic.stats, Context: nic.context, + ARPHardwareType: nic.linkEP.ARPHardwareType(), } } return nics @@ -1249,9 +1315,9 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n s.mu.RLock() defer s.mu.RUnlock() - isBroadcast := remoteAddr == header.IPv4Broadcast + isLocalBroadcast := remoteAddr == header.IPv4Broadcast isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr) - needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr)) + needRoute := !(isLocalBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr)) if id != 0 && !needRoute { if nic, ok := s.nics[id]; ok && nic.enabled() { if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil { @@ -1272,9 +1338,16 @@ func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, n } r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()) - if needRoute { - r.NextHop = route.Gateway + r.directedBroadcast = route.Destination.IsBroadcast(remoteAddr) + + if len(route.Gateway) > 0 { + if needRoute { + r.NextHop = route.Gateway + } + } else if r.directedBroadcast { + r.RemoteLinkAddress = header.EthernetBroadcastAddress } + return r, nil } } @@ -1400,25 +1473,31 @@ func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep. // transport dispatcher. Received packets that match the provided id will be // delivered to the given endpoint; specifying a nic is optional, but // nic-specific IDs have precedence over global ones. 
-func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error { - return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort, bindToDevice) +func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { + return s.demux.registerEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) +} + +// CheckRegisterTransportEndpoint checks if an endpoint can be registered with +// the stack transport dispatcher. +func (s *Stack) CheckRegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { + return s.demux.checkEndpoint(netProtos, protocol, id, flags, bindToDevice) } // UnregisterTransportEndpoint removes the endpoint with the given id from the // stack transport dispatcher. -func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) { - s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice) +func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { + s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // StartTransportEndpointCleanup removes the endpoint with the given id from // the stack transport dispatcher. It also transitions it to the cleanup stage. -func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) { +func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { s.mu.Lock() defer s.mu.Unlock() s.cleanupEndpoints[ep] = struct{}{} - s.demux.unregisterEndpoint(netProtos, protocol, id, ep, bindToDevice) + s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice) } // CompleteTransportEndpointCleanup removes the endpoint from the cleanup @@ -1741,18 +1820,8 @@ func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool, } // IPTables returns the stack's iptables. -func (s *Stack) IPTables() IPTables { - s.tablesMu.RLock() - t := s.tables - s.tablesMu.RUnlock() - return t -} - -// SetIPTables sets the stack's iptables. -func (s *Stack) SetIPTables(ipt IPTables) { - s.tablesMu.Lock() - s.tables = ipt - s.tablesMu.Unlock() +func (s *Stack) IPTables() *IPTables { + return s.tables } // ICMPLimit returns the maximum number of ICMP messages that can be sent @@ -1831,10 +1900,38 @@ func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip } nic.setNDPConfigs(c) - return nil } +// NUDConfigurations gets the per-interface NUD configurations. 
+func (s *Stack) NUDConfigurations(id tcpip.NICID) (NUDConfigurations, *tcpip.Error) { + s.mu.RLock() + nic, ok := s.nics[id] + s.mu.RUnlock() + + if !ok { + return NUDConfigurations{}, tcpip.ErrUnknownNICID + } + + return nic.NUDConfigs() +} + +// SetNUDConfigurations sets the per-interface NUD configurations. +// +// Note, if c contains invalid NUD configuration values, it will be fixed to +// use default values for the erroneous values. +func (s *Stack) SetNUDConfigurations(id tcpip.NICID, c NUDConfigurations) *tcpip.Error { + s.mu.RLock() + nic, ok := s.nics[id] + s.mu.RUnlock() + + if !ok { + return tcpip.ErrUnknownNICID + } + + return nic.setNUDConfigs(c) +} + // HandleNDPRA provides a NIC with ID id a validated NDP Router Advertisement // message that it needs to handle. func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error { @@ -1888,8 +1985,8 @@ func generateRandInt64() int64 { // FindNetworkEndpoint returns the network endpoint for the given address. func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, address tcpip.Address) (NetworkEndpoint, *tcpip.Error) { - s.mu.Lock() - defer s.mu.Unlock() + s.mu.RLock() + defer s.mu.RUnlock() for _, nic := range s.nics { id := NetworkEndpointID{address} @@ -1908,8 +2005,8 @@ func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, addres // FindNICNameFromID returns the name of the nic for the given NICID. func (s *Stack) FindNICNameFromID(id tcpip.NICID) string { - s.mu.Lock() - defer s.mu.Unlock() + s.mu.RLock() + defer s.mu.RUnlock() nic, ok := s.nics[id] if !ok { diff --git a/pkg/tcpip/stack/stack_options.go b/pkg/tcpip/stack/stack_options.go new file mode 100644 index 000000000..0b093e6c5 --- /dev/null +++ b/pkg/tcpip/stack/stack_options.go @@ -0,0 +1,106 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stack + +import "gvisor.dev/gvisor/pkg/tcpip" + +const ( + // MinBufferSize is the smallest size of a receive or send buffer. + MinBufferSize = 4 << 10 // 4 KiB + + // DefaultBufferSize is the default size of the send/recv buffer for a + // transport endpoint. + DefaultBufferSize = 212 << 10 // 212 KiB + + // DefaultMaxBufferSize is the default maximum permitted size of a + // send/receive buffer. + DefaultMaxBufferSize = 4 << 20 // 4 MiB +) + +// SendBufferSizeOption is used by stack.(Stack*).Option/SetOption to +// get/set the default, min and max send buffer sizes. +type SendBufferSizeOption struct { + Min int + Default int + Max int +} + +// ReceiveBufferSizeOption is used by stack.(Stack*).Option/SetOption to +// get/set the default, min and max receive buffer sizes. +type ReceiveBufferSizeOption struct { + Min int + Default int + Max int +} + +// SetOption allows setting stack wide options. 
+func (s *Stack) SetOption(option interface{}) *tcpip.Error { + switch v := option.(type) { + case SendBufferSizeOption: + // Make sure we don't allow lowering the buffer below minimum + // required for stack to work. + if v.Min < MinBufferSize { + return tcpip.ErrInvalidOptionValue + } + + if v.Default < v.Min || v.Default > v.Max { + return tcpip.ErrInvalidOptionValue + } + + s.mu.Lock() + s.sendBufferSize = v + s.mu.Unlock() + return nil + + case ReceiveBufferSizeOption: + // Make sure we don't allow lowering the buffer below minimum + // required for stack to work. + if v.Min < MinBufferSize { + return tcpip.ErrInvalidOptionValue + } + + if v.Default < v.Min || v.Default > v.Max { + return tcpip.ErrInvalidOptionValue + } + + s.mu.Lock() + s.receiveBufferSize = v + s.mu.Unlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } +} + +// Option allows retrieving stack wide options. +func (s *Stack) Option(option interface{}) *tcpip.Error { + switch v := option.(type) { + case *SendBufferSizeOption: + s.mu.RLock() + *v = s.sendBufferSize + s.mu.RUnlock() + return nil + + case *ReceiveBufferSizeOption: + s.mu.RLock() + *v = s.receiveBufferSize + s.mu.RUnlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } +} diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go index 1a2cf007c..0273b3c63 100644 --- a/pkg/tcpip/stack/stack_test.go +++ b/pkg/tcpip/stack/stack_test.go @@ -21,18 +21,21 @@ import ( "bytes" "fmt" "math" + "net" "sort" "strings" "testing" "time" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/network/arp" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" "gvisor.dev/gvisor/pkg/tcpip/stack" @@ -52,6 +55,10 @@ const ( // where another value is explicitly used. It is chosen to match the MTU // of loopback interfaces on linux systems. defaultMTU = 65536 + + dstAddrOffset = 0 + srcAddrOffset = 1 + protocolNumberOffset = 2 ) // fakeNetworkEndpoint is a network-layer protocol endpoint. It counts sent and @@ -90,30 +97,28 @@ func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID { return &f.id } -func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt stack.PacketBuffer) { +func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { // Increment the received packet count in the protocol descriptor. f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++ - // Consume the network header. - b, ok := pkt.Data.PullUp(fakeNetHeaderLen) - if !ok { - return - } - pkt.Data.TrimFront(fakeNetHeaderLen) - // Handle control packets. 
- if b[2] == uint8(fakeControlProtocol) { + if pkt.NetworkHeader().View()[protocolNumberOffset] == uint8(fakeControlProtocol) { nb, ok := pkt.Data.PullUp(fakeNetHeaderLen) if !ok { return } pkt.Data.TrimFront(fakeNetHeaderLen) - f.dispatcher.DeliverTransportControlPacket(tcpip.Address(nb[1:2]), tcpip.Address(nb[0:1]), fakeNetNumber, tcpip.TransportProtocolNumber(nb[2]), stack.ControlPortUnreachable, 0, pkt) + f.dispatcher.DeliverTransportControlPacket( + tcpip.Address(nb[srcAddrOffset:srcAddrOffset+1]), + tcpip.Address(nb[dstAddrOffset:dstAddrOffset+1]), + fakeNetNumber, + tcpip.TransportProtocolNumber(nb[protocolNumberOffset]), + stack.ControlPortUnreachable, 0, pkt) return } // Dispatch the packet to the transport protocol. - f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(b[2]), pkt) + f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader().View()[protocolNumberOffset]), pkt) } func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 { @@ -132,24 +137,19 @@ func (f *fakeNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumbe return f.proto.Number() } -func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt stack.PacketBuffer) *tcpip.Error { +func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error { // Increment the sent packet count in the protocol descriptor. f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++ // Add the protocol's header to the packet and send it to the link // endpoint. - b := pkt.Header.Prepend(fakeNetHeaderLen) - b[0] = r.RemoteAddress[0] - b[1] = f.id.LocalAddress[0] - b[2] = byte(params.Protocol) + hdr := pkt.NetworkHeader().Push(fakeNetHeaderLen) + hdr[dstAddrOffset] = r.RemoteAddress[0] + hdr[srcAddrOffset] = f.id.LocalAddress[0] + hdr[protocolNumberOffset] = byte(params.Protocol) if r.Loop&stack.PacketLoop != 0 { - views := make([]buffer.View, 1, 1+len(pkt.Data.Views())) - views[0] = pkt.Header.View() - views = append(views, pkt.Data.Views()...) - f.HandlePacket(r, stack.PacketBuffer{ - Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views), - }) + f.HandlePacket(r, pkt) } if r.Loop&stack.PacketOut == 0 { return nil @@ -163,7 +163,7 @@ func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts panic("not implemented") } -func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt stack.PacketBuffer) *tcpip.Error { +func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error { return tcpip.ErrNotSupported } @@ -205,7 +205,7 @@ func (f *fakeNetworkProtocol) PacketCount(intfAddr byte) int { } func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { - return tcpip.Address(v[1:2]), tcpip.Address(v[0:1]) + return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1]) } func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) { @@ -247,6 +247,15 @@ func (*fakeNetworkProtocol) Close() {} // Wait implements TransportProtocol.Wait. func (*fakeNetworkProtocol) Wait() {} +// Parse implements TransportProtocol.Parse. 
+func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) { + hdr, ok := pkt.NetworkHeader().Consume(fakeNetHeaderLen) + if !ok { + return 0, false, false + } + return tcpip.TransportProtocolNumber(hdr[protocolNumberOffset]), true, true +} + func fakeNetFactory() stack.NetworkProtocol { return &fakeNetworkProtocol{} } @@ -268,6 +277,17 @@ func (l *linkEPWithMockedAttach) isAttached() bool { return l.attached } +// Checks to see if list contains an address. +func containsAddr(list []tcpip.ProtocolAddress, item tcpip.ProtocolAddress) bool { + for _, i := range list { + if i == item { + return true + } + } + + return false +} + func TestNetworkReceive(t *testing.T) { // Create a stack with the fake network protocol, one nic, and two // addresses attached to it: 1 & 2. @@ -292,10 +312,10 @@ func TestNetworkReceive(t *testing.T) { buf := buffer.NewView(30) // Make sure packet with wrong address is not delivered. - buf[0] = 3 - ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + buf[dstAddrOffset] = 3 + ep.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeNet.packetCount[1] != 0 { t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 0) } @@ -304,10 +324,10 @@ func TestNetworkReceive(t *testing.T) { } // Make sure packet is delivered to first endpoint. - buf[0] = 1 - ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + buf[dstAddrOffset] = 1 + ep.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeNet.packetCount[1] != 1 { t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1) } @@ -316,10 +336,10 @@ func TestNetworkReceive(t *testing.T) { } // Make sure packet is delivered to second endpoint. - buf[0] = 2 - ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + buf[dstAddrOffset] = 2 + ep.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeNet.packetCount[1] != 1 { t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1) } @@ -328,9 +348,9 @@ func TestNetworkReceive(t *testing.T) { } // Make sure packet is not delivered if protocol number is wrong. - ep.InjectInbound(fakeNetNumber-1, stack.PacketBuffer{ + ep.InjectInbound(fakeNetNumber-1, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeNet.packetCount[1] != 1 { t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1) } @@ -340,9 +360,9 @@ func TestNetworkReceive(t *testing.T) { // Make sure packet that is too small is dropped. 
buf.CapLength(2) - ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + ep.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeNet.packetCount[1] != 1 { t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1) } @@ -361,11 +381,10 @@ func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Erro } func send(r stack.Route, payload buffer.View) *tcpip.Error { - hdr := buffer.NewPrependable(int(r.MaxHeaderLength())) - return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: payload.ToVectorisedView(), - }) + return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(r.MaxHeaderLength()), + Data: payload.ToVectorisedView(), + })) } func testSendTo(t *testing.T, s *stack.Stack, addr tcpip.Address, ep *channel.Endpoint, payload buffer.View) { @@ -420,9 +439,9 @@ func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte b func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) { t.Helper() - ep.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + ep.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if got := fakeNet.PacketCount(localAddrByte); got != want { t.Errorf("receive packet count: got = %d, want %d", got, want) } @@ -859,9 +878,9 @@ func TestRouteWithDownNIC(t *testing.T) { // Writes with Routes that use NIC1 after being brought up should // succeed. // - // TODO(b/147015577): Should we instead completely invalidate all - // Routes that were bound to a NIC that was brought down at some - // point? + // TODO(gvisor.dev/issue/1491): Should we instead completely + // invalidate all Routes that were bound to a NIC that was brought + // down at some point? if err := upFn(s, nicID1); err != nil { t.Fatalf("test.upFn(_, %d): %s", nicID1, err) } @@ -982,7 +1001,7 @@ func TestAddressRemoval(t *testing.T) { buf := buffer.NewView(30) // Send and receive packets, and verify they are received. - buf[0] = localAddrByte + buf[dstAddrOffset] = localAddrByte testRecv(t, fakeNet, localAddrByte, ep, buf) testSendTo(t, s, remoteAddr, ep, nil) @@ -1032,7 +1051,7 @@ func TestAddressRemovalWithRouteHeld(t *testing.T) { } // Send and receive packets, and verify they are received. - buf[0] = localAddrByte + buf[dstAddrOffset] = localAddrByte testRecv(t, fakeNet, localAddrByte, ep, buf) testSend(t, r, ep, nil) testSendTo(t, s, remoteAddr, ep, nil) @@ -1114,7 +1133,7 @@ func TestEndpointExpiration(t *testing.T) { fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol) buf := buffer.NewView(30) - buf[0] = localAddrByte + buf[dstAddrOffset] = localAddrByte if promiscuous { if err := s.SetPromiscuousMode(nicID, true); err != nil { @@ -1277,7 +1296,7 @@ func TestPromiscuousMode(t *testing.T) { // Write a packet, and check that it doesn't get delivered as we don't // have a matching endpoint. const localAddrByte byte = 0x01 - buf[0] = localAddrByte + buf[dstAddrOffset] = localAddrByte testFailingRecv(t, fakeNet, localAddrByte, ep, buf) // Set promiscuous mode, then check that packet is delivered. 
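
fakeNetworkProtocol.Parse above is the test-side counterpart of the Parse hooks added to NetworkProtocol and TransportProtocol in registration.go. Roughly, the receive path is expected to drive them in order; the sketch below is hypothetical wiring (the parseInbound helper and transportProtos map are illustrative, not the stack's actual dispatch code):

	func parseInbound(netProto stack.NetworkProtocol, transportProtos map[tcpip.TransportProtocolNumber]stack.TransportProtocol, pkt *stack.PacketBuffer) bool {
		transProto, hasTransportHdr, ok := netProto.Parse(pkt)
		if !ok {
			return false // pkt.Data too short for a network header
		}
		if !hasTransportHdr {
			return true // e.g. ARP encapsulates nothing further
		}
		if tp, ok := transportProtos[transProto]; ok {
			return tp.Parse(pkt) // sets pkt.TransportHeader and trims pkt.Data
		}
		return true
	}
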
@@ -1658,7 +1677,7 @@ func TestAddressRangeAcceptsMatchingPacket(t *testing.T) { buf := buffer.NewView(30) const localAddrByte byte = 0x01 - buf[0] = localAddrByte + buf[dstAddrOffset] = localAddrByte subnet, err := tcpip.NewSubnet(tcpip.Address("\x00"), tcpip.AddressMask("\xF0")) if err != nil { t.Fatal("NewSubnet failed:", err) @@ -1695,7 +1714,7 @@ func testNicForAddressRange(t *testing.T, nicID tcpip.NICID, s *stack.Stack, sub // Trying the next address should always fail since it is outside the range. if gotNicID := s.CheckLocalAddress(0, fakeNetNumber, tcpip.Address(addrBytes)); gotNicID != 0 { - t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = %d", fakeNetNumber, tcpip.Address(addrBytes), gotNicID, 0) + t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = 0", fakeNetNumber, tcpip.Address(addrBytes), gotNicID) } } @@ -1766,7 +1785,7 @@ func TestAddressRangeRejectsNonmatchingPacket(t *testing.T) { buf := buffer.NewView(30) const localAddrByte byte = 0x01 - buf[0] = localAddrByte + buf[dstAddrOffset] = localAddrByte subnet, err := tcpip.NewSubnet(tcpip.Address("\x10"), tcpip.AddressMask("\xF0")) if err != nil { t.Fatal("NewSubnet failed:", err) @@ -2263,9 +2282,9 @@ func TestNICStats(t *testing.T) { // Send a packet to address 1. buf := buffer.NewView(30) - ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + ep1.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want { t.Errorf("got Rx.Packets.Value() = %d, want = %d", got, want) } @@ -2344,10 +2363,10 @@ func TestNICForwarding(t *testing.T) { // Send a packet to dstAddr. buf := buffer.NewView(30) - buf[0] = dstAddr[0] - ep1.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + buf[dstAddrOffset] = dstAddr[0] + ep1.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) pkt, ok := ep2.Read() if !ok { @@ -2355,8 +2374,8 @@ func TestNICForwarding(t *testing.T) { } // Test that the link's MaxHeaderLength is honoured. - if capacity, want := pkt.Pkt.Header.AvailableLength(), int(test.headerLen); capacity != want { - t.Errorf("got Header.AvailableLength() = %d, want = %d", capacity, want) + if capacity, want := pkt.Pkt.AvailableHeaderBytes(), int(test.headerLen); capacity != want { + t.Errorf("got LinkHeader.AvailableLength() = %d, want = %d", capacity, want) } // Test that forwarding increments Tx stats correctly. 
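
The AvailableHeaderBytes check in the forwarding test above relies on the new outbound convention: the sender reserves all header space when the PacketBuffer is created and each layer then pushes its own header, innermost first. A minimal sketch with hypothetical inputs (reserve, netHdr, transHdr, payload), assuming imports of pkg/tcpip/buffer and pkg/tcpip/stack:

	func buildOutbound(reserve int, netHdr, transHdr, payload buffer.View) *stack.PacketBuffer {
		pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
			ReserveHeaderBytes: reserve, // room for link + network + transport headers
			Data:               payload.ToVectorisedView(),
		})
		// Push order is innermost first; each Push claims space immediately in
		// front of what was pushed before, giving [link][network][transport][payload].
		copy(pkt.TransportHeader().Push(len(transHdr)), transHdr)
		copy(pkt.NetworkHeader().Push(len(netHdr)), netHdr)
		// Whatever is left of the reservation is what AvailableHeaderBytes()
		// reports, e.g. space for a link endpoint to push its header later.
		return pkt
	}
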
@@ -3080,6 +3099,13 @@ func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) { func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) { const nicID = 1 + broadcastAddr := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: header.IPv4Broadcast, + PrefixLen: 32, + }, + } e := loopback.New() s := stack.New(stack.Options{ @@ -3090,49 +3116,41 @@ func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) { t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err) } - allStackAddrs := s.AllAddresses() - allNICAddrs, ok := allStackAddrs[nicID] - if !ok { - t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) - } - if l := len(allNICAddrs); l != 0 { - t.Fatalf("got len(allNICAddrs) = %d, want = 0", l) + { + allStackAddrs := s.AllAddresses() + if allNICAddrs, ok := allStackAddrs[nicID]; !ok { + t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) + } else if containsAddr(allNICAddrs, broadcastAddr) { + t.Fatalf("got allNICAddrs = %+v, don't want = %+v", allNICAddrs, broadcastAddr) + } } // Enabling the NIC should add the IPv4 broadcast address. if err := s.EnableNIC(nicID); err != nil { t.Fatalf("s.EnableNIC(%d): %s", nicID, err) } - allStackAddrs = s.AllAddresses() - allNICAddrs, ok = allStackAddrs[nicID] - if !ok { - t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) - } - if l := len(allNICAddrs); l != 1 { - t.Fatalf("got len(allNICAddrs) = %d, want = 1", l) - } - want := tcpip.ProtocolAddress{ - Protocol: header.IPv4ProtocolNumber, - AddressWithPrefix: tcpip.AddressWithPrefix{ - Address: header.IPv4Broadcast, - PrefixLen: 32, - }, - } - if allNICAddrs[0] != want { - t.Fatalf("got allNICAddrs[0] = %+v, want = %+v", allNICAddrs[0], want) + + { + allStackAddrs := s.AllAddresses() + if allNICAddrs, ok := allStackAddrs[nicID]; !ok { + t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) + } else if !containsAddr(allNICAddrs, broadcastAddr) { + t.Fatalf("got allNICAddrs = %+v, want = %+v", allNICAddrs, broadcastAddr) + } } // Disabling the NIC should remove the IPv4 broadcast address. 
if err := s.DisableNIC(nicID); err != nil { t.Fatalf("s.DisableNIC(%d): %s", nicID, err) } - allStackAddrs = s.AllAddresses() - allNICAddrs, ok = allStackAddrs[nicID] - if !ok { - t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) - } - if l := len(allNICAddrs); l != 0 { - t.Fatalf("got len(allNICAddrs) = %d, want = 0", l) + + { + allStackAddrs := s.AllAddresses() + if allNICAddrs, ok := allStackAddrs[nicID]; !ok { + t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs) + } else if containsAddr(allNICAddrs, broadcastAddr) { + t.Fatalf("got allNICAddrs = %+v, don't want = %+v", allNICAddrs, broadcastAddr) + } } } @@ -3180,50 +3198,93 @@ func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) { } } -func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) { +func TestJoinLeaveMulticastOnNICEnableDisable(t *testing.T) { const nicID = 1 - e := loopback.New() - s := stack.New(stack.Options{ - NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()}, - }) - nicOpts := stack.NICOptions{Disabled: true} - if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { - t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err) + tests := []struct { + name string + proto tcpip.NetworkProtocolNumber + addr tcpip.Address + }{ + { + name: "IPv6 All-Nodes", + proto: header.IPv6ProtocolNumber, + addr: header.IPv6AllNodesMulticastAddress, + }, + { + name: "IPv4 All-Systems", + proto: header.IPv4ProtocolNumber, + addr: header.IPv4AllSystems, + }, } - // Should not be in the IPv6 all-nodes multicast group yet because the NIC has - // not been enabled yet. - isInGroup, err := s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress) - if err != nil { - t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err) - } - if isInGroup { - t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress) - } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + e := loopback.New() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + }) + nicOpts := stack.NICOptions{Disabled: true} + if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil { + t.Fatalf("CreateNIC(%d, _, %+v) = %s", nicID, nicOpts, err) + } - // The all-nodes multicast group should be joined when the NIC is enabled. - if err := s.EnableNIC(nicID); err != nil { - t.Fatalf("s.EnableNIC(%d): %s", nicID, err) - } - isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress) - if err != nil { - t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err) - } - if !isInGroup { - t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, header.IPv6AllNodesMulticastAddress) - } + // Should not be in the multicast group yet because the NIC has not been + // enabled yet. + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, test.addr) + } - // The all-nodes multicast group should be left when the NIC is disabled. 
- if err := s.DisableNIC(nicID); err != nil { - t.Fatalf("s.DisableNIC(%d): %s", nicID, err) - } - isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress) - if err != nil { - t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err) - } - if isInGroup { - t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress) + // The all-nodes multicast group should be joined when the NIC is enabled. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("s.EnableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if !isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, test.addr) + } + + // The multicast group should be left when the NIC is disabled. + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("s.DisableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, test.addr) + } + + // The all-nodes multicast group should be joined when the NIC is enabled. + if err := s.EnableNIC(nicID); err != nil { + t.Fatalf("s.EnableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if !isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, test.addr) + } + + // Leaving the group before disabling the NIC should not cause an error. + if err := s.LeaveGroup(test.proto, nicID, test.addr); err != nil { + t.Fatalf("s.LeaveGroup(%d, %d, %s): %s", test.proto, nicID, test.addr, err) + } + + if err := s.DisableNIC(nicID); err != nil { + t.Fatalf("s.DisableNIC(%d): %s", nicID, err) + } + + if isInGroup, err := s.IsInGroup(nicID, test.addr); err != nil { + t.Fatalf("IsInGroup(%d, %s): %s", nicID, test.addr, err) + } else if isInGroup { + t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, test.addr) + } + }) } } @@ -3297,7 +3358,7 @@ func TestDoDADWhenNICEnabled(t *testing.T) { // Wait for DAD to resolve. select { - case <-time.After(dadTransmits*retransmitTimer + defaultAsyncEventTimeout): + case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout): t.Fatal("timed out waiting for DAD resolution") case e := <-ndpDisp.dadC: if diff := checkDADEvent(e, nicID, addr.AddressWithPrefix.Address, true, nil); diff != "" { @@ -3330,3 +3391,351 @@ func TestDoDADWhenNICEnabled(t *testing.T) { t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix) } } + +func TestStackReceiveBufferSizeOption(t *testing.T) { + const sMin = stack.MinBufferSize + testCases := []struct { + name string + rs stack.ReceiveBufferSizeOption + err *tcpip.Error + }{ + // Invalid configurations. 
+ {"min_below_zero", stack.ReceiveBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"min_zero", stack.ReceiveBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"default_below_min", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin - 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, + {"default_above_max", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"max_below_min", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, + + // Valid Configurations + {"in_ascending_order", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, nil}, + {"all_equal", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, nil}, + {"min_default_equal", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, nil}, + {"default_max_equal", stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, nil}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + s := stack.New(stack.Options{}) + defer s.Close() + if err := s.SetOption(tc.rs); err != tc.err { + t.Fatalf("s.SetOption(%#v) = %v, want: %v", tc.rs, err, tc.err) + } + var rs stack.ReceiveBufferSizeOption + if tc.err == nil { + if err := s.Option(&rs); err != nil { + t.Fatalf("s.Option(%#v) = %v, want: nil", rs, err) + } + if got, want := rs, tc.rs; got != want { + t.Fatalf("s.Option(..) returned unexpected value got: %#v, want: %#v", got, want) + } + } + }) + } +} + +func TestStackSendBufferSizeOption(t *testing.T) { + const sMin = stack.MinBufferSize + testCases := []struct { + name string + ss stack.SendBufferSizeOption + err *tcpip.Error + }{ + // Invalid configurations. + {"min_below_zero", stack.SendBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"min_zero", stack.SendBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"default_below_min", stack.SendBufferSizeOption{Min: 0, Default: sMin - 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, + {"default_above_max", stack.SendBufferSizeOption{Min: 0, Default: sMin + 1, Max: sMin}, tcpip.ErrInvalidOptionValue}, + {"max_below_min", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue}, + + // Valid Configurations + {"in_ascending_order", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, nil}, + {"all_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, nil}, + {"min_default_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, nil}, + {"default_max_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, nil}, + } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + s := stack.New(stack.Options{}) + defer s.Close() + if err := s.SetOption(tc.ss); err != tc.err { + t.Fatalf("s.SetOption(%+v) = %v, want: %v", tc.ss, err, tc.err) + } + var ss stack.SendBufferSizeOption + if tc.err == nil { + if err := s.Option(&ss); err != nil { + t.Fatalf("s.Option(%+v) = %v, want: nil", ss, err) + } + if got, want := ss, tc.ss; got != want { + t.Fatalf("s.Option(..) 
returned unexpected value got: %#v, want: %#v", got, want) + } + } + }) + } +} + +func TestOutgoingSubnetBroadcast(t *testing.T) { + const ( + unspecifiedNICID = 0 + nicID1 = 1 + ) + + defaultAddr := tcpip.AddressWithPrefix{ + Address: header.IPv4Any, + PrefixLen: 0, + } + defaultSubnet := defaultAddr.Subnet() + ipv4Addr := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 24, + } + ipv4Subnet := ipv4Addr.Subnet() + ipv4SubnetBcast := ipv4Subnet.Broadcast() + ipv4Gateway := tcpip.Address("\xc0\xa8\x01\x01") + ipv4AddrPrefix31 := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 31, + } + ipv4Subnet31 := ipv4AddrPrefix31.Subnet() + ipv4Subnet31Bcast := ipv4Subnet31.Broadcast() + ipv4AddrPrefix32 := tcpip.AddressWithPrefix{ + Address: "\xc0\xa8\x01\x3a", + PrefixLen: 32, + } + ipv4Subnet32 := ipv4AddrPrefix32.Subnet() + ipv4Subnet32Bcast := ipv4Subnet32.Broadcast() + ipv6Addr := tcpip.AddressWithPrefix{ + Address: "\x20\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", + PrefixLen: 64, + } + ipv6Subnet := ipv6Addr.Subnet() + ipv6SubnetBcast := ipv6Subnet.Broadcast() + remNetAddr := tcpip.AddressWithPrefix{ + Address: "\x64\x0a\x7b\x18", + PrefixLen: 24, + } + remNetSubnet := remNetAddr.Subnet() + remNetSubnetBcast := remNetSubnet.Broadcast() + + tests := []struct { + name string + nicAddr tcpip.ProtocolAddress + routes []tcpip.Route + remoteAddr tcpip.Address + expectedRoute stack.Route + }{ + // Broadcast to a locally attached subnet populates the broadcast MAC. + { + name: "IPv4 Broadcast to local subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet, + NIC: nicID1, + }, + }, + remoteAddr: ipv4SubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4Addr.Address, + RemoteAddress: ipv4SubnetBcast, + RemoteLinkAddress: header.EthernetBroadcastAddress, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to a locally attached /31 subnet does not populate the + // broadcast MAC. + { + name: "IPv4 Broadcast to local /31 subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4AddrPrefix31, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet31, + NIC: nicID1, + }, + }, + remoteAddr: ipv4Subnet31Bcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4AddrPrefix31.Address, + RemoteAddress: ipv4Subnet31Bcast, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to a locally attached /32 subnet does not populate the + // broadcast MAC. + { + name: "IPv4 Broadcast to local /32 subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4AddrPrefix32, + }, + routes: []tcpip.Route{ + { + Destination: ipv4Subnet32, + NIC: nicID1, + }, + }, + remoteAddr: ipv4Subnet32Bcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4AddrPrefix32.Address, + RemoteAddress: ipv4Subnet32Bcast, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // IPv6 has no notion of a broadcast. 
+ { + name: "IPv6 'Broadcast' to local subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv6ProtocolNumber, + AddressWithPrefix: ipv6Addr, + }, + routes: []tcpip.Route{ + { + Destination: ipv6Subnet, + NIC: nicID1, + }, + }, + remoteAddr: ipv6SubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv6Addr.Address, + RemoteAddress: ipv6SubnetBcast, + NetProto: header.IPv6ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to a remote subnet in the route table is send to the next-hop + // gateway. + { + name: "IPv4 Broadcast to remote subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: remNetSubnet, + Gateway: ipv4Gateway, + NIC: nicID1, + }, + }, + remoteAddr: remNetSubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4Addr.Address, + RemoteAddress: remNetSubnetBcast, + NextHop: ipv4Gateway, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + // Broadcast to an unknown subnet follows the default route. Note that this + // is essentially just routing an unknown destination IP, because w/o any + // subnet prefix information a subnet broadcast address is just a normal IP. + { + name: "IPv4 Broadcast to unknown subnet", + nicAddr: tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: ipv4Addr, + }, + routes: []tcpip.Route{ + { + Destination: defaultSubnet, + Gateway: ipv4Gateway, + NIC: nicID1, + }, + }, + remoteAddr: remNetSubnetBcast, + expectedRoute: stack.Route{ + LocalAddress: ipv4Addr.Address, + RemoteAddress: remNetSubnetBcast, + NextHop: ipv4Gateway, + NetProto: header.IPv4ProtocolNumber, + Loop: stack.PacketOut, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + }) + ep := channel.New(0, defaultMTU, "") + if err := s.CreateNIC(nicID1, ep); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID1, err) + } + if err := s.AddProtocolAddress(nicID1, test.nicAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID1, test.nicAddr, err) + } + + s.SetRouteTable(test.routes) + + var netProto tcpip.NetworkProtocolNumber + switch l := len(test.remoteAddr); l { + case header.IPv4AddressSize: + netProto = header.IPv4ProtocolNumber + case header.IPv6AddressSize: + netProto = header.IPv6ProtocolNumber + default: + t.Fatalf("got unexpected address length = %d bytes", l) + } + + if r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, test.remoteAddr, netProto, false /* multicastLoop */); err != nil { + t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, test.remoteAddr, netProto, err) + } else if diff := cmp.Diff(r, test.expectedRoute, cmpopts.IgnoreUnexported(r)); diff != "" { + t.Errorf("route mismatch (-want +got):\n%s", diff) + } + }) + } +} + +func TestResolveWith(t *testing.T) { + const ( + unspecifiedNICID = 0 + nicID = 1 + ) + + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()}, + }) + ep := channel.New(0, defaultMTU, "") + ep.LinkEPCapabilities |= stack.CapabilityResolutionRequired + if err := s.CreateNIC(nicID, ep); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + addr := tcpip.ProtocolAddress{ + Protocol: header.IPv4ProtocolNumber, + AddressWithPrefix: tcpip.AddressWithPrefix{ + Address: 
tcpip.Address(net.ParseIP("192.168.1.58").To4()), + PrefixLen: 24, + }, + } + if err := s.AddProtocolAddress(nicID, addr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err) + } + + s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: nicID}}) + + remoteAddr := tcpip.Address(net.ParseIP("192.168.1.59").To4()) + r, err := s.FindRoute(unspecifiedNICID, "" /* localAddr */, remoteAddr, header.IPv4ProtocolNumber, false /* multicastLoop */) + if err != nil { + t.Fatalf("FindRoute(%d, '', %s, %d): %s", unspecifiedNICID, remoteAddr, header.IPv4ProtocolNumber, err) + } + defer r.Release() + + // Should initially require resolution. + if !r.IsResolutionRequired() { + t.Fatal("got r.IsResolutionRequired() = false, want = true") + } + + // Manually resolving the route should no longer require resolution. + r.ResolveWith("\x01") + if r.IsResolutionRequired() { + t.Fatal("got r.IsResolutionRequired() = true, want = false") + } +} diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go index 9a33ed375..b902c6ca9 100644 --- a/pkg/tcpip/stack/transport_demuxer.go +++ b/pkg/tcpip/stack/transport_demuxer.go @@ -15,7 +15,6 @@ package stack import ( - "container/heap" "fmt" "math/rand" @@ -23,6 +22,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/ports" ) type protocolIDs struct { @@ -43,14 +43,14 @@ type transportEndpoints struct { // unregisterEndpoint unregisters the endpoint with the given id such that it // won't receive any more packets. -func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) { +func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { eps.mu.Lock() defer eps.mu.Unlock() epsByNIC, ok := eps.endpoints[id] if !ok { return } - if !epsByNIC.unregisterEndpoint(bindToDevice, ep) { + if !epsByNIC.unregisterEndpoint(bindToDevice, ep, flags) { return } delete(eps.endpoints, id) @@ -152,7 +152,7 @@ func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt PacketBuffer) { +func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) { epsByNIC.mu.RLock() mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()] @@ -183,7 +183,7 @@ func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, p } // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. -func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt PacketBuffer) { +func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer) { epsByNIC.mu.RLock() defer epsByNIC.mu.RUnlock() @@ -204,7 +204,7 @@ func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpoint // registerEndpoint returns true if it succeeds. It fails and returns // false if ep already has an element with the same key. 
-func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error { +func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { epsByNIC.mu.Lock() defer epsByNIC.mu.Unlock() @@ -214,23 +214,34 @@ func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto t demux: d, netProto: netProto, transProto: transProto, - reuse: reusePort, } epsByNIC.endpoints[bindToDevice] = multiPortEp } - return multiPortEp.singleRegisterEndpoint(t, reusePort) + return multiPortEp.singleRegisterEndpoint(t, flags) +} + +func (epsByNIC *endpointsByNIC) checkEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { + epsByNIC.mu.RLock() + defer epsByNIC.mu.RUnlock() + + multiPortEp, ok := epsByNIC.endpoints[bindToDevice] + if !ok { + return nil + } + + return multiPortEp.singleCheckEndpoint(flags) } // unregisterEndpoint returns true if endpointsByNIC has to be unregistered. -func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint) bool { +func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint, flags ports.Flags) bool { epsByNIC.mu.Lock() defer epsByNIC.mu.Unlock() multiPortEp, ok := epsByNIC.endpoints[bindToDevice] if !ok { return false } - if multiPortEp.unregisterEndpoint(t) { + if multiPortEp.unregisterEndpoint(t, flags) { delete(epsByNIC.endpoints, bindToDevice) } return len(epsByNIC.endpoints) == 0 @@ -251,7 +262,7 @@ type transportDemuxer struct { // the dispatcher to delivery packets to the QueuePacket method instead of // calling HandlePacket directly on the endpoint. type queuedTransportProtocol interface { - QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt PacketBuffer) + QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt *PacketBuffer) } func newTransportDemuxer(stack *Stack) *transportDemuxer { @@ -279,10 +290,10 @@ func newTransportDemuxer(stack *Stack) *transportDemuxer { // registerEndpoint registers the given endpoint with the dispatcher such that // packets that match the endpoint ID are delivered to it. 
-func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error { +func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { for i, n := range netProtos { - if err := d.singleRegisterEndpoint(n, protocol, id, ep, reusePort, bindToDevice); err != nil { - d.unregisterEndpoint(netProtos[:i], protocol, id, ep, bindToDevice) + if err := d.singleRegisterEndpoint(n, protocol, id, ep, flags, bindToDevice); err != nil { + d.unregisterEndpoint(netProtos[:i], protocol, id, ep, flags, bindToDevice) return err } } @@ -290,33 +301,15 @@ func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNum return nil } -type transportEndpointHeap []TransportEndpoint - -var _ heap.Interface = (*transportEndpointHeap)(nil) - -func (h *transportEndpointHeap) Len() int { - return len(*h) -} - -func (h *transportEndpointHeap) Less(i, j int) bool { - return (*h)[i].UniqueID() < (*h)[j].UniqueID() -} - -func (h *transportEndpointHeap) Swap(i, j int) { - (*h)[i], (*h)[j] = (*h)[j], (*h)[i] -} - -func (h *transportEndpointHeap) Push(x interface{}) { - *h = append(*h, x.(TransportEndpoint)) -} +// checkEndpoint checks if an endpoint can be registered with the dispatcher. +func (d *transportDemuxer) checkEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { + for _, n := range netProtos { + if err := d.singleCheckEndpoint(n, protocol, id, flags, bindToDevice); err != nil { + return err + } + } -func (h *transportEndpointHeap) Pop() interface{} { - old := *h - n := len(old) - x := old[n-1] - old[n-1] = nil - *h = old[:n-1] - return x + return nil } // multiPortEndpoint is a container for TransportEndpoints which are bound to @@ -334,9 +327,10 @@ type multiPortEndpoint struct { netProto tcpip.NetworkProtocolNumber transProto tcpip.TransportProtocolNumber - endpoints transportEndpointHeap - // reuse indicates if more than one endpoint is allowed. - reuse bool + // endpoints stores the transport endpoints in the order in which they + // were bound. This is required for UDP SO_REUSEADDR. 
+ endpoints []TransportEndpoint + flags ports.FlagCounter } func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint { @@ -362,6 +356,10 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32 return mpep.endpoints[0] } + if mpep.flags.IntersectionRefs().ToFlags().Effective().MostRecent { + return mpep.endpoints[len(mpep.endpoints)-1] + } + payload := []byte{ byte(id.LocalPort), byte(id.LocalPort >> 8), @@ -379,7 +377,7 @@ func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32 return mpep.endpoints[idx] } -func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt PacketBuffer) { +func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt *PacketBuffer) { ep.mu.RLock() queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}] // HandlePacket takes ownership of pkt, so each endpoint needs @@ -401,40 +399,63 @@ func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, p // singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint // list. The list might be empty already. -func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, reusePort bool) *tcpip.Error { +func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, flags ports.Flags) *tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() + bits := flags.Bits() & ports.MultiBindFlagMask + if len(ep.endpoints) != 0 { // If it was previously bound, we need to check if we can bind again. - if !ep.reuse || !reusePort { + if ep.flags.TotalRefs() > 0 && bits&ep.flags.IntersectionRefs() == 0 { return tcpip.ErrPortInUse } } - heap.Push(&ep.endpoints, t) + ep.endpoints = append(ep.endpoints, t) + ep.flags.AddRef(bits) + + return nil +} + +func (ep *multiPortEndpoint) singleCheckEndpoint(flags ports.Flags) *tcpip.Error { + ep.mu.RLock() + defer ep.mu.RUnlock() + + bits := flags.Bits() & ports.MultiBindFlagMask + + if len(ep.endpoints) != 0 { + // If it was previously bound, we need to check if we can bind again. + if ep.flags.TotalRefs() > 0 && bits&ep.flags.IntersectionRefs() == 0 { + return tcpip.ErrPortInUse + } + } return nil } // unregisterEndpoint returns true if multiPortEndpoint has to be unregistered. -func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool { +func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint, flags ports.Flags) bool { ep.mu.Lock() defer ep.mu.Unlock() for i, endpoint := range ep.endpoints { if endpoint == t { - heap.Remove(&ep.endpoints, i) + copy(ep.endpoints[i:], ep.endpoints[i+1:]) + ep.endpoints[len(ep.endpoints)-1] = nil + ep.endpoints = ep.endpoints[:len(ep.endpoints)-1] + + ep.flags.DropRef(flags.Bits() & ports.MultiBindFlagMask) break } } return len(ep.endpoints) == 0 } -func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error { +func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { if id.RemotePort != 0 { - // TODO(eyalsoha): Why? - reusePort = false + // SO_REUSEPORT only applies to bound/listening endpoints. 
+ flags.LoadBalanced = false } eps, ok := d.protocol[protocolIDs{netProto, protocol}] @@ -454,15 +475,42 @@ func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocol eps.endpoints[id] = epsByNIC } - return epsByNIC.registerEndpoint(d, netProto, protocol, ep, reusePort, bindToDevice) + return epsByNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice) +} + +func (d *transportDemuxer) singleCheckEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error { + if id.RemotePort != 0 { + // SO_REUSEPORT only applies to bound/listening endpoints. + flags.LoadBalanced = false + } + + eps, ok := d.protocol[protocolIDs{netProto, protocol}] + if !ok { + return tcpip.ErrUnknownProtocol + } + + eps.mu.RLock() + defer eps.mu.RUnlock() + + epsByNIC, ok := eps.endpoints[id] + if !ok { + return nil + } + + return epsByNIC.checkEndpoint(d, netProto, protocol, flags, bindToDevice) } // unregisterEndpoint unregisters the endpoint with the given id such that it // won't receive any more packets. -func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, bindToDevice tcpip.NICID) { +func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) { + if id.RemotePort != 0 { + // SO_REUSEPORT only applies to bound/listening endpoints. + flags.LoadBalanced = false + } + for _, n := range netProtos { if eps, ok := d.protocol[protocolIDs{n, protocol}]; ok { - eps.unregisterEndpoint(id, ep, bindToDevice) + eps.unregisterEndpoint(id, ep, flags, bindToDevice) } } } @@ -470,7 +518,7 @@ func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolN // deliverPacket attempts to find one or more matching transport endpoints, and // then, if matches are found, delivers the packet to them. Returns true if // the packet no longer needs to be handled. -func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer, id TransportEndpointID) bool { +func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}] if !ok { return false @@ -520,7 +568,7 @@ func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProto // deliverRawPacket attempts to deliver the given packet and returns whether it // was delivered successfully. -func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt PacketBuffer) bool { +func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool { eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}] if !ok { return false @@ -544,7 +592,7 @@ func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportPr // deliverControlPacket attempts to deliver the given control packet. Returns // true if it found an endpoint, false otherwise. 
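The demuxer changes above replace the single reusePort bool with a ports.Flags value plus a ports.FlagCounter, so SO_REUSEADDR and SO_REUSEPORT interactions can be tracked per bound port. A rough sketch of how a caller could map socket options onto the new registration signature, using only the Flags fields that appear in the hunks (MostRecent, LoadBalanced); the helper and the mapping are illustrative, not part of the change:

package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/ports"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
)

// registerUDPEndpoint is a hypothetical helper: it converts the endpoint's
// reuse settings into ports.Flags and registers it with the stack's demuxer.
func registerUDPEndpoint(s *stack.Stack, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID, ep stack.TransportEndpoint, reuseAddr, reusePort bool, bindToDevice tcpip.NICID) *tcpip.Error {
	flags := ports.Flags{
		MostRecent:   reuseAddr, // SO_REUSEADDR: the most recently bound endpoint receives packets.
		LoadBalanced: reusePort, // SO_REUSEPORT: delivery is load-balanced across bound endpoints.
	}
	return s.RegisterTransportEndpoint(0 /* nicID */, netProtos, udp.ProtocolNumber, id, ep, flags, bindToDevice)
}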
-func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt PacketBuffer, id TransportEndpointID) bool { +func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer, id TransportEndpointID) bool { eps, ok := d.protocol[protocolIDs{net, trans}] if !ok { return false diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go index 2474a7db3..1339edc2d 100644 --- a/pkg/tcpip/stack/transport_demuxer_test.go +++ b/pkg/tcpip/stack/transport_demuxer_test.go @@ -25,6 +25,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/tcpip/transport/udp" "gvisor.dev/gvisor/pkg/waiter" @@ -127,11 +128,10 @@ func (c *testContext) sendV4Packet(payload []byte, h *headers, linkEpID tcpip.NI u.SetChecksum(^u.CalculateChecksum(xsum)) // Inject packet. - c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{ - Data: buf.ToVectorisedView(), - NetworkHeader: buffer.View(ip), - TransportHeader: buffer.View(u), + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buf.ToVectorisedView(), }) + c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, pkt) } func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NICID) { @@ -165,11 +165,10 @@ func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NI u.SetChecksum(^u.CalculateChecksum(xsum)) // Inject packet. - c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{ - Data: buf.ToVectorisedView(), - NetworkHeader: buffer.View(ip), - TransportHeader: buffer.View(u), + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buf.ToVectorisedView(), }) + c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, pkt) } func TestTransportDemuxerRegister(t *testing.T) { @@ -195,7 +194,7 @@ func TestTransportDemuxerRegister(t *testing.T) { if !ok { t.Fatalf("%T does not implement stack.TransportEndpoint", ep) } - if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, false, 0), test.want; got != want { + if got, want := s.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{test.proto}, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, ports.Flags{}, 0), test.want; got != want { t.Fatalf("s.RegisterTransportEndpoint(...) 
= %s, want %s", got, want) } }) diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go index a611e44ab..6c6e44468 100644 --- a/pkg/tcpip/stack/transport_test.go +++ b/pkg/tcpip/stack/transport_test.go @@ -21,6 +21,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/link/channel" "gvisor.dev/gvisor/pkg/tcpip/link/loopback" + "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) @@ -83,15 +84,16 @@ func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions return 0, nil, tcpip.ErrNoRoute } - hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength())) v, err := p.FullPayload() if err != nil { return 0, nil, err } - if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: buffer.View(v).ToVectorisedView(), - }); err != nil { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(f.route.MaxHeaderLength()) + fakeTransHeaderLen, + Data: buffer.View(v).ToVectorisedView(), + }) + _ = pkt.TransportHeader().Push(fakeTransHeaderLen) + if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, pkt); err != nil { return 0, nil, err } @@ -153,7 +155,7 @@ func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // Try to register so that we can start receiving packets. f.ID.RemoteAddress = addr.Addr - err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, false /* reuse */, 0 /* bindToDevice */) + err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */) if err != nil { return err } @@ -198,8 +200,8 @@ func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error { fakeTransNumber, stack.TransportEndpointID{LocalAddress: a.Addr}, f, - false, /* reuse */ - 0, /* bindtoDevice */ + ports.Flags{}, + 0, /* bindtoDevice */ ); err != nil { return err } @@ -215,7 +217,7 @@ func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Erro return tcpip.FullAddress{}, nil } -func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ stack.PacketBuffer) { +func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ *stack.PacketBuffer) { // Increment the number of received packets. f.proto.packetCount++ if f.acceptQueue != nil { @@ -232,7 +234,7 @@ func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportE } } -func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, stack.PacketBuffer) { +func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, *stack.PacketBuffer) { // Increment the number of received control packets. 
f.proto.controlCount++ } @@ -289,7 +291,7 @@ func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcp return 0, 0, nil } -func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool { +func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool { return true } @@ -324,6 +326,12 @@ func (*fakeTransportProtocol) Close() {} // Wait implements TransportProtocol.Wait. func (*fakeTransportProtocol) Wait() {} +// Parse implements TransportProtocol.Parse. +func (*fakeTransportProtocol) Parse(pkt *stack.PacketBuffer) bool { + _, ok := pkt.TransportHeader().Consume(fakeTransHeaderLen) + return ok +} + func fakeTransFactory() stack.TransportProtocol { return &fakeTransportProtocol{} } @@ -369,9 +377,9 @@ func TestTransportReceive(t *testing.T) { // Make sure packet with wrong protocol is not delivered. buf[0] = 1 buf[2] = 0 - linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + linkEP.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeTrans.packetCount != 0 { t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0) } @@ -380,9 +388,9 @@ func TestTransportReceive(t *testing.T) { buf[0] = 1 buf[1] = 3 buf[2] = byte(fakeTransNumber) - linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + linkEP.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeTrans.packetCount != 0 { t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0) } @@ -391,9 +399,9 @@ func TestTransportReceive(t *testing.T) { buf[0] = 1 buf[1] = 2 buf[2] = byte(fakeTransNumber) - linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + linkEP.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeTrans.packetCount != 1 { t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 1) } @@ -446,9 +454,9 @@ func TestTransportControlReceive(t *testing.T) { buf[fakeNetHeaderLen+0] = 0 buf[fakeNetHeaderLen+1] = 1 buf[fakeNetHeaderLen+2] = 0 - linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + linkEP.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeTrans.controlCount != 0 { t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0) } @@ -457,9 +465,9 @@ func TestTransportControlReceive(t *testing.T) { buf[fakeNetHeaderLen+0] = 3 buf[fakeNetHeaderLen+1] = 1 buf[fakeNetHeaderLen+2] = byte(fakeTransNumber) - linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + linkEP.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeTrans.controlCount != 0 { t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0) } @@ -468,9 +476,9 @@ func TestTransportControlReceive(t *testing.T) { buf[fakeNetHeaderLen+0] = 2 buf[fakeNetHeaderLen+1] = 1 buf[fakeNetHeaderLen+2] = byte(fakeTransNumber) - linkEP.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + linkEP.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), - }) + })) if fakeTrans.controlCount != 1 { t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 1) } @@ -623,9 +631,9 @@ func TestTransportForwarding(t *testing.T) { req[0] = 1 req[1] = 3 req[2] = byte(fakeTransNumber) - 
ep2.InjectInbound(fakeNetNumber, stack.PacketBuffer{ + ep2.InjectInbound(fakeNetNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: req.ToVectorisedView(), - }) + })) aep, _, err := ep.Accept() if err != nil || aep == nil { @@ -642,11 +650,11 @@ func TestTransportForwarding(t *testing.T) { t.Fatal("Response packet not forwarded") } - hdrs := p.Pkt.Data.ToView() - if dst := hdrs[0]; dst != 3 { + nh := stack.PayloadSince(p.Pkt.NetworkHeader()) + if dst := nh[0]; dst != 3 { t.Errorf("Response packet has incorrect destination addresss: got = %d, want = 3", dst) } - if src := hdrs[1]; src != 1 { + if src := nh[1]; src != 1 { t.Errorf("Response packet has incorrect source addresss: got = %d, want = 3", src) } } diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go index b7b227328..07c85ce59 100644 --- a/pkg/tcpip/tcpip.go +++ b/pkg/tcpip/tcpip.go @@ -43,6 +43,9 @@ import ( "gvisor.dev/gvisor/pkg/waiter" ) +// Using header.IPv4AddressSize would cause an import cycle. +const ipv4AddressSize = 4 + // Error represents an error in the netstack error space. Using a special type // ensures that errors outside of this space are not accidentally introduced. // @@ -192,7 +195,7 @@ func (e ErrSaveRejection) Error() string { return "save rejected due to unsupported networking state: " + e.Err.Error() } -// A Clock provides the current time. +// A Clock provides the current time and schedules work for execution. // // Times returned by a Clock should always be used for application-visible // time. Only monotonic times should be used for netstack internal timekeeping. @@ -203,6 +206,31 @@ type Clock interface { // NowMonotonic returns a monotonic time value. NowMonotonic() int64 + + // AfterFunc waits for the duration to elapse and then calls f in its own + // goroutine. It returns a Timer that can be used to cancel the call using + // its Stop method. + AfterFunc(d time.Duration, f func()) Timer +} + +// Timer represents a single event. A Timer must be created with +// Clock.AfterFunc. +type Timer interface { + // Stop prevents the Timer from firing. It returns true if the call stops the + // timer, false if the timer has already expired or been stopped. + // + // If Stop returns false, then the timer has already expired and the function + // f of Clock.AfterFunc(d, f) has been started in its own goroutine; Stop + // does not wait for f to complete before returning. If the caller needs to + // know whether f is completed, it must coordinate with f explicitly. + Stop() bool + + // Reset changes the timer to expire after duration d. + // + // Reset should be invoked only on stopped or expired timers. If the timer is + // known to have expired, Reset can be used directly. Otherwise, the caller + // must coordinate with the function f of Clock.AfterFunc(d, f). + Reset(d time.Duration) } // Address is a byte slice cast as a string that represents the address of a @@ -295,6 +323,29 @@ func (s *Subnet) Broadcast() Address { return Address(addr) } +// IsBroadcast returns true if the address is considered a broadcast address. +func (s *Subnet) IsBroadcast(address Address) bool { + // Only IPv4 supports the notion of a broadcast address. + if len(address) != ipv4AddressSize { + return false + } + + // Normally, we would just compare address with the subnet's broadcast + // address but there is an exception where a simple comparison is not + // correct. This exception is for /31 and /32 IPv4 subnets where all + // addresses are considered valid host addresses. + // + // For /31 subnets, the case is easy. 
RFC 3021 Section 2.1 states that + // both addresses in a /31 subnet "MUST be interpreted as host addresses." + // + // For /32, the case is a bit more vague. RFC 3021 makes no mention of /32 + // subnets. However, the same reasoning applies - if an exception is not + // made, then there do not exist any host addresses in a /32 subnet. RFC + // 4632 Section 3.1 also vaguely implies this interpretation by referring + // to addresses in /32 subnets as "host routes." + return s.Prefix() <= 30 && s.Broadcast() == address +} + // Equal returns true if s equals o. // // Needed to use cmp.Equal on Subnet as its fields are unexported. @@ -316,6 +367,28 @@ const ( ShutdownWrite ) +// PacketType is used to indicate the destination of the packet. +type PacketType uint8 + +const ( + // PacketHost indicates a packet addressed to the local host. + PacketHost PacketType = iota + + // PacketOtherHost indicates an outgoing packet addressed to + // another host caught by a NIC in promiscuous mode. + PacketOtherHost + + // PacketOutgoing for a packet originating from the local host + // that is looped back to a packet socket. + PacketOutgoing + + // PacketBroadcast indicates a link layer broadcast packet. + PacketBroadcast + + // PacketMulticast indicates a link layer multicast packet. + PacketMulticast +) + // FullAddress represents a full transport node address, as required by the // Connect() and Bind() methods. // @@ -549,6 +622,28 @@ type Endpoint interface { SetOwner(owner PacketOwner) } +// LinkPacketInfo holds Link layer information for a received packet. +// +// +stateify savable +type LinkPacketInfo struct { + // Protocol is the NetworkProtocolNumber for the packet. + Protocol NetworkProtocolNumber + + // PktType is used to indicate the destination of the packet. + PktType PacketType +} + +// PacketEndpoint are additional methods that are only implemented by Packet +// endpoints. +type PacketEndpoint interface { + // ReadPacket reads a datagram/packet from the endpoint and optionally + // returns the sender and additional LinkPacketInfo. + // + // This method does not block if there is no data pending. It will also + // either return an error or data, never both. + ReadPacket(*FullAddress, *LinkPacketInfo) (buffer.View, ControlMessages, *Error) +} + // EndpointInfo is the interface implemented by each endpoint info struct. type EndpointInfo interface { // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo @@ -585,85 +680,108 @@ type WriteOptions struct { type SockOptBool int const ( - // BroadcastOption is used by SetSockOpt/GetSockOpt to specify whether - // datagram sockets are allowed to send packets to a broadcast address. + // BroadcastOption is used by SetSockOptBool/GetSockOptBool to specify + // whether datagram sockets are allowed to send packets to a broadcast + // address. BroadcastOption SockOptBool = iota - // CorkOption is used by SetSockOpt/GetSockOpt to specify if data should be - // held until segments are full by the TCP transport protocol. + // CorkOption is used by SetSockOptBool/GetSockOptBool to specify if + // data should be held until segments are full by the TCP transport + // protocol. CorkOption - // DelayOption is used by SetSockOpt/GetSockOpt to specify if data - // should be sent out immediately by the transport protocol. For TCP, - // it determines if the Nagle algorithm is on or off. + // DelayOption is used by SetSockOptBool/GetSockOptBool to specify if + // data should be sent out immediately by the transport protocol. 
For + // TCP, it determines if the Nagle algorithm is on or off. DelayOption - // KeepaliveEnabledOption is used by SetSockOpt/GetSockOpt to specify whether - // TCP keepalive is enabled for this socket. + // KeepaliveEnabledOption is used by SetSockOptBool/GetSockOptBool to + // specify whether TCP keepalive is enabled for this socket. KeepaliveEnabledOption - // MulticastLoopOption is used by SetSockOpt/GetSockOpt to specify whether - // multicast packets sent over a non-loopback interface will be looped back. + // MulticastLoopOption is used by SetSockOptBool/GetSockOptBool to + // specify whether multicast packets sent over a non-loopback interface + // will be looped back. MulticastLoopOption - // PasscredOption is used by SetSockOpt/GetSockOpt to specify whether - // SCM_CREDENTIALS socket control messages are enabled. + // NoChecksumOption is used by SetSockOptBool/GetSockOptBool to specify + // whether UDP checksum is disabled for this socket. + NoChecksumOption + + // PasscredOption is used by SetSockOptBool/GetSockOptBool to specify + // whether SCM_CREDENTIALS socket control messages are enabled. // // Only supported on Unix sockets. PasscredOption - // QuickAckOption is stubbed out in SetSockOpt/GetSockOpt. + // QuickAckOption is stubbed out in SetSockOptBool/GetSockOptBool. QuickAckOption - // ReceiveTClassOption is used by SetSockOpt/GetSockOpt to specify if the - // IPV6_TCLASS ancillary message is passed with incoming packets. + // ReceiveTClassOption is used by SetSockOptBool/GetSockOptBool to + // specify if the IPV6_TCLASS ancillary message is passed with incoming + // packets. ReceiveTClassOption - // ReceiveTOSOption is used by SetSockOpt/GetSockOpt to specify if the TOS - // ancillary message is passed with incoming packets. + // ReceiveTOSOption is used by SetSockOptBool/GetSockOptBool to specify + // if the TOS ancillary message is passed with incoming packets. ReceiveTOSOption - // ReceiveIPPacketInfoOption is used by {G,S}etSockOptBool to specify - // if more inforamtion is provided with incoming packets such - // as interface index and address. + // ReceiveIPPacketInfoOption is used by SetSockOptBool/GetSockOptBool to + // specify if more information is provided with incoming packets such as + // interface index and address. ReceiveIPPacketInfoOption - // ReuseAddressOption is used by SetSockOpt/GetSockOpt to specify whether Bind() - // should allow reuse of local address. + // ReuseAddressOption is used by SetSockOptBool/GetSockOptBool to + // specify whether Bind() should allow reuse of local address. ReuseAddressOption - // ReusePortOption is used by SetSockOpt/GetSockOpt to permit multiple sockets - // to be bound to an identical socket address. + // ReusePortOption is used by SetSockOptBool/GetSockOptBool to permit + // multiple sockets to be bound to an identical socket address. ReusePortOption - // V6OnlyOption is used by {G,S}etSockOptBool to specify whether an IPv6 - // socket is to be restricted to sending and receiving IPv6 packets only. + // V6OnlyOption is used by SetSockOptBool/GetSockOptBool to specify + // whether an IPv6 socket is to be restricted to sending and receiving + // IPv6 packets only. V6OnlyOption + + // IPHdrIncludedOption is used by SetSockOpt to indicate for a raw + // endpoint that all packets being written have an IP header and the + // endpoint should not attach an IP header. + IPHdrIncludedOption ) // SockOptInt represents socket options which values have the int type.
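For context on the new NoChecksumOption above: it lets a UDP endpoint send datagrams with the checksum field left at zero, roughly mirroring Linux's SO_NO_CHECK. A minimal illustrative use, assuming ep is a UDP endpoint created via the stack:

package example

import "gvisor.dev/gvisor/pkg/tcpip"

// disableUDPChecksum turns off UDP checksum generation on ep; outgoing
// datagrams are then sent with a zero checksum.
func disableUDPChecksum(ep tcpip.Endpoint) *tcpip.Error {
	return ep.SetSockOptBool(tcpip.NoChecksumOption, true)
}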
type SockOptInt int const ( - // KeepaliveCountOption is used by SetSockOpt/GetSockOpt to specify the number - // of un-ACKed TCP keepalives that will be sent before the connection is - // closed. + // KeepaliveCountOption is used by SetSockOptInt/GetSockOptInt to + // specify the number of un-ACKed TCP keepalives that will be sent + // before the connection is closed. KeepaliveCountOption SockOptInt = iota - // IPv4TOSOption is used by SetSockOpt/GetSockOpt to specify TOS + // IPv4TOSOption is used by SetSockOptInt/GetSockOptInt to specify TOS // for all subsequent outgoing IPv4 packets from the endpoint. IPv4TOSOption - // IPv6TrafficClassOption is used by SetSockOpt/GetSockOpt to specify TOS - // for all subsequent outgoing IPv6 packets from the endpoint. + // IPv6TrafficClassOption is used by SetSockOptInt/GetSockOptInt to + // specify TOS for all subsequent outgoing IPv6 packets from the + // endpoint. IPv6TrafficClassOption - // MaxSegOption is used by SetSockOpt/GetSockOpt to set/get the current - // Maximum Segment Size(MSS) value as specified using the TCP_MAXSEG option. + // MaxSegOption is used by SetSockOptInt/GetSockOptInt to set/get the + // current Maximum Segment Size(MSS) value as specified using the + // TCP_MAXSEG option. MaxSegOption - // MulticastTTLOption is used by SetSockOpt/GetSockOpt to control the default - // TTL value for multicast messages. The default is 1. + // MTUDiscoverOption is used to set/get the path MTU discovery setting. + // + // NOTE: Setting this option to any other value than PMTUDiscoveryDont + // is not supported and will fail as such, and getting this option will + // always return PMTUDiscoveryDont. + MTUDiscoverOption + + // MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control + // the default TTL value for multicast messages. The default is 1. MulticastTTLOption // ReceiveQueueSizeOption is used in GetSockOptInt to specify that the @@ -682,26 +800,45 @@ const ( // number of unread bytes in the output buffer should be returned. SendQueueSizeOption - // TTLOption is used by SetSockOpt/GetSockOpt to control the default TTL/hop - // limit value for unicast messages. The default is protocol specific. + // TTLOption is used by SetSockOptInt/GetSockOptInt to control the + // default TTL/hop limit value for unicast messages. The default is + // protocol specific. // // A zero value indicates the default. TTLOption - // TCPSynCountOption is used by SetSockOpt/GetSockOpt to specify the number of - // SYN retransmits that TCP should send before aborting the attempt to - // connect. It cannot exceed 255. + // TCPSynCountOption is used by SetSockOptInt/GetSockOptInt to specify + // the number of SYN retransmits that TCP should send before aborting + // the attempt to connect. It cannot exceed 255. // // NOTE: This option is currently only stubbed out and is no-op. TCPSynCountOption - // TCPWindowClampOption is used by SetSockOpt/GetSockOpt to bound the size - // of the advertised window to this value. + // TCPWindowClampOption is used by SetSockOptInt/GetSockOptInt to bound + // the size of the advertised window to this value. // // NOTE: This option is currently only stubed out and is a no-op TCPWindowClampOption ) +const ( + // PMTUDiscoveryWant is a setting of the MTUDiscoverOption to use + // per-route settings. + PMTUDiscoveryWant int = iota + + // PMTUDiscoveryDont is a setting of the MTUDiscoverOption to disable + // path MTU discovery. 
+ PMTUDiscoveryDont + + // PMTUDiscoveryDo is a setting of the MTUDiscoverOption to always do + // path MTU discovery. + PMTUDiscoveryDo + + // PMTUDiscoveryProbe is a setting of the MTUDiscoverOption to set DF + // but ignore path MTU. + PMTUDiscoveryProbe +) + // ErrorOption is used in GetSockOpt to specify that the last error reported by // the endpoint should be cleared and returned. type ErrorOption struct{} @@ -740,7 +877,7 @@ type CongestionControlOption string // control algorithms. type AvailableCongestionControlOption string -// buffer moderation. +// ModerateReceiveBufferOption is used by buffer moderation. type ModerateReceiveBufferOption bool // TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the @@ -813,7 +950,35 @@ type OutOfBandInlineOption int // a default TTL. type DefaultTTLOption uint8 -// IPPacketInfo is the message struture for IP_PKTINFO. +// SocketDetachFilterOption is used by SetSockOpt to detach a previously attached +// classic BPF filter on a given endpoint. +type SocketDetachFilterOption int + +// OriginalDestinationOption is used to get the original destination address +// and port of a redirected packet. +type OriginalDestinationOption FullAddress + +// TCPTimeWaitReuseOption is used by stack.(*Stack).TransportProtocolOption to +// specify if the stack can reuse the port bound by an endpoint in TIME-WAIT for +// new connections when it is safe from a protocol viewpoint. +type TCPTimeWaitReuseOption uint8 + +const ( + // TCPTimeWaitReuseDisabled indicates that ports bound by endpoints in TIME-WAIT cannot + // be reused for new connections. + TCPTimeWaitReuseDisabled TCPTimeWaitReuseOption = iota + + // TCPTimeWaitReuseGlobal indicates that ports bound by endpoints in TIME-WAIT can + // be reused for new connections irrespective of the src/dest addresses. + TCPTimeWaitReuseGlobal + + // TCPTimeWaitReuseLoopbackOnly indicates that a port bound by an endpoint in TIME-WAIT can + // only be reused if the connection was over loopback, i.e. src/dest addresses + // are loopback addresses. + TCPTimeWaitReuseLoopbackOnly +) + +// IPPacketInfo is the message structure for IP_PKTINFO. // // +stateify savable type IPPacketInfo struct { @@ -823,7 +988,7 @@ type IPPacketInfo struct { // LocalAddr is the local address. LocalAddr Address - // DestinationAddr is the destination address. + // DestinationAddr is the destination address found in the IP header. DestinationAddr Address } @@ -1198,6 +1363,12 @@ type UDPStats struct { // PacketSendErrors is the number of datagrams failed to be sent. PacketSendErrors *StatCounter + + // ChecksumErrors is the number of datagrams dropped due to bad checksums. + ChecksumErrors *StatCounter + + // InvalidSourceAddress is the number of invalid sourced datagrams dropped. + InvalidSourceAddress *StatCounter } // Stats holds statistics about the networking stack. @@ -1241,6 +1412,9 @@ type ReceiveErrors struct { // ClosedReceiver is the number of received packets dropped because // of receiving endpoint state being closed. ClosedReceiver StatCounter + + // ChecksumErrors is the number of packets dropped due to bad checksums.
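The TCPTimeWaitReuseOption introduced above is a stack-wide TCP protocol option rather than a per-endpoint one. A sketch of how it might be applied, assuming the TCP protocol accepts it through (*stack.Stack).SetTransportProtocolOption like its other protocol options:

package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
)

// allowLoopbackTimeWaitReuse restricts TIME-WAIT port reuse to loopback
// connections; illustrative only.
func allowLoopbackTimeWaitReuse(s *stack.Stack) *tcpip.Error {
	return s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitReuseLoopbackOnly)
}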
+ ChecksumErrors StatCounter } // SendErrors collects packet send errors within the transport layer for diff --git a/pkg/tcpip/tests/integration/BUILD b/pkg/tcpip/tests/integration/BUILD new file mode 100644 index 000000000..6d52af98a --- /dev/null +++ b/pkg/tcpip/tests/integration/BUILD @@ -0,0 +1,22 @@ +load("//tools:defs.bzl", "go_test") + +package(licenses = ["notice"]) + +go_test( + name = "integration_test", + size = "small", + srcs = ["multicast_broadcast_test.go"], + deps = [ + "//pkg/tcpip", + "//pkg/tcpip/buffer", + "//pkg/tcpip/header", + "//pkg/tcpip/link/channel", + "//pkg/tcpip/network/ipv4", + "//pkg/tcpip/network/ipv6", + "//pkg/tcpip/stack", + "//pkg/tcpip/transport/icmp", + "//pkg/tcpip/transport/udp", + "//pkg/waiter", + "@com_github_google_go_cmp//cmp:go_default_library", + ], +) diff --git a/pkg/tcpip/tests/integration/multicast_broadcast_test.go b/pkg/tcpip/tests/integration/multicast_broadcast_test.go new file mode 100644 index 000000000..9f0dd4d6d --- /dev/null +++ b/pkg/tcpip/tests/integration/multicast_broadcast_test.go @@ -0,0 +1,438 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package integration_test + +import ( + "net" + "testing" + + "github.com/google/go-cmp/cmp" + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/link/channel" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv4" + "gvisor.dev/gvisor/pkg/tcpip/network/ipv6" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/icmp" + "gvisor.dev/gvisor/pkg/tcpip/transport/udp" + "gvisor.dev/gvisor/pkg/waiter" +) + +const ( + defaultMTU = 1280 + ttl = 255 +) + +var ( + ipv4Addr = tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("192.168.1.58").To4()), + PrefixLen: 24, + } + ipv4Subnet = ipv4Addr.Subnet() + ipv4SubnetBcast = ipv4Subnet.Broadcast() + + ipv6Addr = tcpip.AddressWithPrefix{ + Address: tcpip.Address(net.ParseIP("200a::1").To16()), + PrefixLen: 64, + } + ipv6Subnet = ipv6Addr.Subnet() + ipv6SubnetBcast = ipv6Subnet.Broadcast() + + // Remote addrs. + remoteIPv4Addr = tcpip.Address(net.ParseIP("10.0.0.1").To4()) + remoteIPv6Addr = tcpip.Address(net.ParseIP("200b::1").To16()) +) + +// TestPingMulticastBroadcast tests that responding to an Echo Request destined +// to a multicast or broadcast address uses a unicast source address for the +// reply. 
+func TestPingMulticastBroadcast(t *testing.T) { + const nicID = 1 + + rxIPv4ICMP := func(e *channel.Endpoint, dst tcpip.Address) { + totalLen := header.IPv4MinimumSize + header.ICMPv4MinimumSize + hdr := buffer.NewPrependable(totalLen) + pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) + pkt.SetType(header.ICMPv4Echo) + pkt.SetCode(0) + pkt.SetChecksum(0) + pkt.SetChecksum(^header.Checksum(pkt, 0)) + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: uint16(totalLen), + Protocol: uint8(icmp.ProtocolNumber4), + TTL: ttl, + SrcAddr: remoteIPv4Addr, + DstAddr: dst, + }) + + e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } + + rxIPv6ICMP := func(e *channel.Endpoint, dst tcpip.Address) { + totalLen := header.IPv6MinimumSize + header.ICMPv6MinimumSize + hdr := buffer.NewPrependable(totalLen) + pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6MinimumSize)) + pkt.SetType(header.ICMPv6EchoRequest) + pkt.SetCode(0) + pkt.SetChecksum(0) + pkt.SetChecksum(header.ICMPv6Checksum(pkt, remoteIPv6Addr, dst, buffer.VectorisedView{})) + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: header.ICMPv6MinimumSize, + NextHeader: uint8(icmp.ProtocolNumber6), + HopLimit: ttl, + SrcAddr: remoteIPv6Addr, + DstAddr: dst, + }) + + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } + + tests := []struct { + name string + dstAddr tcpip.Address + }{ + { + name: "IPv4 unicast", + dstAddr: ipv4Addr.Address, + }, + { + name: "IPv4 directed broadcast", + dstAddr: ipv4SubnetBcast, + }, + { + name: "IPv4 broadcast", + dstAddr: header.IPv4Broadcast, + }, + { + name: "IPv4 all-systems multicast", + dstAddr: header.IPv4AllSystems, + }, + { + name: "IPv6 unicast", + dstAddr: ipv6Addr.Address, + }, + { + name: "IPv6 all-nodes multicast", + dstAddr: header.IPv6AllNodesMulticastAddress, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ipv4Proto := ipv4.NewProtocol() + ipv6Proto := ipv6.NewProtocol() + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4Proto, ipv6Proto}, + TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4(), icmp.NewProtocol6()}, + }) + // We only expect a single packet in response to our ICMP Echo Request. + e := channel.New(1, defaultMTU, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + ipv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: ipv4Addr} + if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, ipv4ProtoAddr, err) + } + ipv6ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: ipv6Addr} + if err := s.AddProtocolAddress(nicID, ipv6ProtoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, ipv6ProtoAddr, err) + } + + // Default routes for IPv4 and IPv6 so ICMP can find a route to the remote + // node when attempting to send the ICMP Echo Reply. 
+ s.SetRouteTable([]tcpip.Route{ + tcpip.Route{ + Destination: header.IPv6EmptySubnet, + NIC: nicID, + }, + tcpip.Route{ + Destination: header.IPv4EmptySubnet, + NIC: nicID, + }, + }) + + var rxICMP func(*channel.Endpoint, tcpip.Address) + var expectedSrc tcpip.Address + var expectedDst tcpip.Address + var proto stack.NetworkProtocol + switch l := len(test.dstAddr); l { + case header.IPv4AddressSize: + rxICMP = rxIPv4ICMP + expectedSrc = ipv4Addr.Address + expectedDst = remoteIPv4Addr + proto = ipv4Proto + case header.IPv6AddressSize: + rxICMP = rxIPv6ICMP + expectedSrc = ipv6Addr.Address + expectedDst = remoteIPv6Addr + proto = ipv6Proto + default: + t.Fatalf("got unexpected address length = %d bytes", l) + } + + rxICMP(e, test.dstAddr) + pkt, ok := e.Read() + if !ok { + t.Fatal("expected ICMP response") + } + + if pkt.Route.LocalAddress != expectedSrc { + t.Errorf("got pkt.Route.LocalAddress = %s, want = %s", pkt.Route.LocalAddress, expectedSrc) + } + if pkt.Route.RemoteAddress != expectedDst { + t.Errorf("got pkt.Route.RemoteAddress = %s, want = %s", pkt.Route.RemoteAddress, expectedDst) + } + + src, dst := proto.ParseAddresses(pkt.Pkt.NetworkHeader().View()) + if src != expectedSrc { + t.Errorf("got pkt source = %s, want = %s", src, expectedSrc) + } + if dst != expectedDst { + t.Errorf("got pkt destination = %s, want = %s", dst, expectedDst) + } + }) + } + +} + +// TestIncomingMulticastAndBroadcast tests receiving a packet destined to some +// multicast or broadcast address. +func TestIncomingMulticastAndBroadcast(t *testing.T) { + const ( + nicID = 1 + remotePort = 5555 + localPort = 80 + ) + + data := []byte{1, 2, 3, 4} + + rxIPv4UDP := func(e *channel.Endpoint, dst tcpip.Address) { + payloadLen := header.UDPMinimumSize + len(data) + totalLen := header.IPv4MinimumSize + payloadLen + hdr := buffer.NewPrependable(totalLen) + u := header.UDP(hdr.Prepend(payloadLen)) + u.Encode(&header.UDPFields{ + SrcPort: remotePort, + DstPort: localPort, + Length: uint16(payloadLen), + }) + copy(u.Payload(), data) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, remoteIPv4Addr, dst, uint16(payloadLen)) + sum = header.Checksum(data, sum) + u.SetChecksum(^u.CalculateChecksum(sum)) + + ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize)) + ip.Encode(&header.IPv4Fields{ + IHL: header.IPv4MinimumSize, + TotalLength: uint16(totalLen), + Protocol: uint8(udp.ProtocolNumber), + TTL: ttl, + SrcAddr: remoteIPv4Addr, + DstAddr: dst, + }) + + e.InjectInbound(header.IPv4ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } + + rxIPv6UDP := func(e *channel.Endpoint, dst tcpip.Address) { + payloadLen := header.UDPMinimumSize + len(data) + hdr := buffer.NewPrependable(header.IPv6MinimumSize + payloadLen) + u := header.UDP(hdr.Prepend(payloadLen)) + u.Encode(&header.UDPFields{ + SrcPort: remotePort, + DstPort: localPort, + Length: uint16(payloadLen), + }) + copy(u.Payload(), data) + sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, remoteIPv6Addr, dst, uint16(payloadLen)) + sum = header.Checksum(data, sum) + u.SetChecksum(^u.CalculateChecksum(sum)) + + ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize)) + ip.Encode(&header.IPv6Fields{ + PayloadLength: uint16(payloadLen), + NextHeader: uint8(udp.ProtocolNumber), + HopLimit: ttl, + SrcAddr: remoteIPv6Addr, + DstAddr: dst, + }) + + e.InjectInbound(header.IPv6ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: hdr.View().ToVectorisedView(), + })) + } + + tests := []struct { + 
name string + bindAddr tcpip.Address + dstAddr tcpip.Address + expectRx bool + }{ + { + name: "IPv4 unicast binding to unicast", + bindAddr: ipv4Addr.Address, + dstAddr: ipv4Addr.Address, + expectRx: true, + }, + { + name: "IPv4 unicast binding to broadcast", + bindAddr: header.IPv4Broadcast, + dstAddr: ipv4Addr.Address, + expectRx: false, + }, + { + name: "IPv4 unicast binding to wildcard", + dstAddr: ipv4Addr.Address, + expectRx: true, + }, + + { + name: "IPv4 directed broadcast binding to subnet broadcast", + bindAddr: ipv4SubnetBcast, + dstAddr: ipv4SubnetBcast, + expectRx: true, + }, + { + name: "IPv4 directed broadcast binding to broadcast", + bindAddr: header.IPv4Broadcast, + dstAddr: ipv4SubnetBcast, + expectRx: false, + }, + { + name: "IPv4 directed broadcast binding to wildcard", + dstAddr: ipv4SubnetBcast, + expectRx: true, + }, + + { + name: "IPv4 broadcast binding to broadcast", + bindAddr: header.IPv4Broadcast, + dstAddr: header.IPv4Broadcast, + expectRx: true, + }, + { + name: "IPv4 broadcast binding to subnet broadcast", + bindAddr: ipv4SubnetBcast, + dstAddr: header.IPv4Broadcast, + expectRx: false, + }, + { + name: "IPv4 broadcast binding to wildcard", + dstAddr: ipv4SubnetBcast, + expectRx: true, + }, + + { + name: "IPv4 all-systems multicast binding to all-systems multicast", + bindAddr: header.IPv4AllSystems, + dstAddr: header.IPv4AllSystems, + expectRx: true, + }, + { + name: "IPv4 all-systems multicast binding to wildcard", + dstAddr: header.IPv4AllSystems, + expectRx: true, + }, + { + name: "IPv4 all-systems multicast binding to unicast", + bindAddr: ipv4Addr.Address, + dstAddr: header.IPv4AllSystems, + expectRx: false, + }, + + // IPv6 has no notion of a broadcast. + { + name: "IPv6 unicast binding to wildcard", + dstAddr: ipv6Addr.Address, + expectRx: true, + }, + { + name: "IPv6 broadcast-like address binding to wildcard", + dstAddr: ipv6SubnetBcast, + expectRx: false, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + s := stack.New(stack.Options{ + NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()}, + TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}, + }) + e := channel.New(0, defaultMTU, "") + if err := s.CreateNIC(nicID, e); err != nil { + t.Fatalf("CreateNIC(%d, _): %s", nicID, err) + } + ipv4ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv4ProtocolNumber, AddressWithPrefix: ipv4Addr} + if err := s.AddProtocolAddress(nicID, ipv4ProtoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, ipv4ProtoAddr, err) + } + ipv6ProtoAddr := tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: ipv6Addr} + if err := s.AddProtocolAddress(nicID, ipv6ProtoAddr); err != nil { + t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, ipv6ProtoAddr, err) + } + + var netproto tcpip.NetworkProtocolNumber + var rxUDP func(*channel.Endpoint, tcpip.Address) + switch l := len(test.dstAddr); l { + case header.IPv4AddressSize: + netproto = header.IPv4ProtocolNumber + rxUDP = rxIPv4UDP + case header.IPv6AddressSize: + netproto = header.IPv6ProtocolNumber + rxUDP = rxIPv6UDP + default: + t.Fatalf("got unexpected address length = %d bytes", l) + } + + wq := waiter.Queue{} + ep, err := s.NewEndpoint(udp.ProtocolNumber, netproto, &wq) + if err != nil { + t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, netproto, err) + } + defer ep.Close() + + bindAddr := tcpip.FullAddress{Addr: test.bindAddr, Port: localPort} + if err := ep.Bind(bindAddr); err != nil { + 
t.Fatalf("ep.Bind(%+v): %s", bindAddr, err) + } + + rxUDP(e, test.dstAddr) + if gotPayload, _, err := ep.Read(nil); test.expectRx { + if err != nil { + t.Fatalf("Read(nil): %s", err) + } + if diff := cmp.Diff(buffer.View(data), gotPayload); diff != "" { + t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff) + } + } else { + if err != tcpip.ErrWouldBlock { + t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock) + } + } + }) + } +} diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go index 2f98a996f..f32d58091 100644 --- a/pkg/tcpip/time_unsafe.go +++ b/pkg/tcpip/time_unsafe.go @@ -13,14 +13,14 @@ // limitations under the License. // +build go1.9 -// +build !go1.15 +// +build !go1.16 // Check go:linkname function signatures when updating Go version. package tcpip import ( - _ "time" // Used with go:linkname. + "time" // Used with go:linkname. _ "unsafe" // Required for go:linkname. ) @@ -45,3 +45,31 @@ func (*StdClock) NowMonotonic() int64 { _, _, mono := now() return mono } + +// AfterFunc implements Clock.AfterFunc. +func (*StdClock) AfterFunc(d time.Duration, f func()) Timer { + return &stdTimer{ + t: time.AfterFunc(d, f), + } +} + +type stdTimer struct { + t *time.Timer +} + +var _ Timer = (*stdTimer)(nil) + +// Stop implements Timer.Stop. +func (st *stdTimer) Stop() bool { + return st.t.Stop() +} + +// Reset implements Timer.Reset. +func (st *stdTimer) Reset(d time.Duration) { + st.t.Reset(d) +} + +// NewStdTimer returns a Timer implemented with the time package. +func NewStdTimer(t *time.Timer) Timer { + return &stdTimer{t: t} +} diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go index 59f3b391f..f1dd7c310 100644 --- a/pkg/tcpip/timer.go +++ b/pkg/tcpip/timer.go @@ -15,54 +15,54 @@ package tcpip import ( - "sync" "time" + + "gvisor.dev/gvisor/pkg/sync" ) -// cancellableTimerInstance is a specific instance of CancellableTimer. +// jobInstance is a specific instance of Job. // -// Different instances are created each time CancellableTimer is Reset so each -// timer has its own earlyReturn signal. This is to address a bug when a -// CancellableTimer is stopped and reset in quick succession resulting in a -// timer instance's earlyReturn signal being affected or seen by another timer -// instance. +// Different instances are created each time Job is scheduled so each timer has +// its own earlyReturn signal. This is to address a bug when a Job is stopped +// and reset in quick succession resulting in a timer instance's earlyReturn +// signal being affected or seen by another timer instance. // // Consider the following sceneario where timer instances share a common // earlyReturn signal (T1 creates, stops and resets a Cancellable timer under a // lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second // (B), third (C), and fourth (D) instance of the timer firing, respectively): // T1: Obtain L -// T1: Create a new CancellableTimer w/ lock L (create instance A) +// T1: Create a new Job w/ lock L (create instance A) // T2: instance A fires, blocked trying to obtain L. // T1: Attempt to stop instance A (set earlyReturn = true) -// T1: Reset timer (create instance B) +// T1: Schedule timer (create instance B) // T3: instance B fires, blocked trying to obtain L. // T1: Attempt to stop instance B (set earlyReturn = true) -// T1: Reset timer (create instance C) +// T1: Schedule timer (create instance C) // T4: instance C fires, blocked trying to obtain L. 
// T1: Attempt to stop instance C (set earlyReturn = true) -// T1: Reset timer (create instance D) +// T1: Schedule timer (create instance D) // T5: instance D fires, blocked trying to obtain L. // T1: Release L // -// Now that T1 has released L, any of the 4 timer instances can take L and check -// earlyReturn. If the timers simply check earlyReturn and then do nothing -// further, then instance D will never early return even though it was not -// requested to stop. If the timers reset earlyReturn before early returning, -// then all but one of the timers will do work when only one was expected to. -// If CancellableTimer resets earlyReturn when resetting, then all the timers +// Now that T1 has released L, any of the 4 timer instances can take L and +// check earlyReturn. If the timers simply check earlyReturn and then do +// nothing further, then instance D will never early return even though it was +// not requested to stop. If the timers reset earlyReturn before early +// returning, then all but one of the timers will do work when only one was +// expected to. If Job resets earlyReturn when resetting, then all the timers // will fire (again, when only one was expected to). // // To address the above concerns the simplest solution was to give each timer // its own earlyReturn signal. -type cancellableTimerInstance struct { - timer *time.Timer +type jobInstance struct { + timer Timer // Used to inform the timer to early return when it gets stopped while the // lock the timer tries to obtain when fired is held (T1 is a goroutine that // tries to cancel the timer and T2 is the goroutine that handles the timer // firing): - // T1: Obtain the lock, then call StopLocked() + // T1: Obtain the lock, then call Cancel() // T2: timer fires, and gets blocked on obtaining the lock // T1: Releases lock // T2: Obtains lock does unintended work @@ -73,27 +73,33 @@ type cancellableTimerInstance struct { earlyReturn *bool } -// stop stops the timer instance t from firing if it hasn't fired already. If it +// stop stops the job instance j from firing if it hasn't fired already. If it // has fired and is blocked at obtaining the lock, earlyReturn will be set to // true so that it will early return when it obtains the lock. -func (t *cancellableTimerInstance) stop() { - if t.timer != nil { - t.timer.Stop() - *t.earlyReturn = true +func (j *jobInstance) stop() { + if j.timer != nil { + j.timer.Stop() + *j.earlyReturn = true } } -// CancellableTimer is a timer that does some work and can be safely cancelled -// when it fires at the same time some "related work" is being done. +// Job represents some work that can be scheduled for execution. The work can +// be safely cancelled when it fires at the same time some "related work" is +// being done. // // The term "related work" is defined as some work that needs to be done while // holding some lock that the timer must also hold while doing some work. // -// Note, it is not safe to copy a CancellableTimer as its timer instance creates -// a closure over the address of the CancellableTimer. -type CancellableTimer struct { +// Note, it is not safe to copy a Job as its timer instance creates +// a closure over the address of the Job. +type Job struct { + _ sync.NoCopy + + // The clock used to schedule the backing timer + clock Clock + // The active instance of a cancellable timer. - instance cancellableTimerInstance + instance jobInstance // locker is the lock taken by the timer immediately after it fires and must // be held when attempting to stop the timer. 
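The Job struct above carries a Clock, and its Schedule method below arms the backing timer through the Clock.AfterFunc plumbing added to time_unsafe.go, so a Clock can also be used directly for one-shot work. A minimal sketch, assuming only the StdClock, Timer.Stop and Timer.Reset signatures shown in this change (exampleAfterFunc and the fmt/time imports are illustrative, not part of the diff):

    func exampleAfterFunc() {
        var clock tcpip.StdClock
        // AfterFunc returns a tcpip.Timer backed by a *time.Timer.
        timer := clock.AfterFunc(100*time.Millisecond, func() {
            fmt.Println("fired")
        })
        // Stop reports whether it prevented the callback from running,
        // mirroring (*time.Timer).Stop.
        if !timer.Stop() {
            fmt.Println("the callback already ran")
        }
        // A stopped Timer can be re-armed.
        timer.Reset(time.Second)
    }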
@@ -110,75 +116,91 @@ type CancellableTimer struct { fn func() } -// StopLocked prevents the Timer from firing if it has not fired already. +// Cancel prevents the Job from executing if it has not executed already. // -// If the timer is blocked on obtaining the t.locker lock when StopLocked is -// called, it will early return instead of calling t.fn. +// Cancel requires appropriate locking to be in place for any resources managed +// by the Job. If the Job is blocked on obtaining the lock when Cancel is +// called, it will early return. // // Note, t will be modified. // -// t.locker MUST be locked. -func (t *CancellableTimer) StopLocked() { - t.instance.stop() +// j.locker MUST be locked. +func (j *Job) Cancel() { + j.instance.stop() // Nothing to do with the stopped instance anymore. - t.instance = cancellableTimerInstance{} + j.instance = jobInstance{} } -// Reset changes the timer to expire after duration d. +// Schedule schedules the Job for execution after duration d. This can be +// called on cancelled or completed Jobs to schedule them again. // -// Note, t will be modified. +// Schedule should be invoked only on unscheduled, cancelled, or completed +// Jobs. To be safe, callers should always call Cancel before calling Schedule. // -// Reset should only be called on stopped or expired timers. To be safe, callers -// should always call StopLocked before calling Reset. -func (t *CancellableTimer) Reset(d time.Duration) { +// Note, j will be modified. +func (j *Job) Schedule(d time.Duration) { // Create a new instance. earlyReturn := false // Capture the locker so that updating the timer does not cause a data race // when a timer fires and tries to obtain the lock (read the timer's locker). - locker := t.locker - t.instance = cancellableTimerInstance{ - timer: time.AfterFunc(d, func() { + locker := j.locker + j.instance = jobInstance{ + timer: j.clock.AfterFunc(d, func() { locker.Lock() defer locker.Unlock() if earlyReturn { // If we reach this point, it means that the timer fired while another - // goroutine called StopLocked while it had the lock. Simply return - // here and do nothing further. + // goroutine called Cancel while it had the lock. Simply return here + // and do nothing further. earlyReturn = false return } - t.fn() + j.fn() }), earlyReturn: &earlyReturn, } } -// Lock is a no-op used by the copylocks checker from go vet. -// -// See CancellableTimer for details about why it shouldn't be copied. -// -// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more -// details about the copylocks checker. -func (*CancellableTimer) Lock() {} - -// Unlock is a no-op used by the copylocks checker from go vet. -// -// See CancellableTimer for details about why it shouldn't be copied. -// -// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more -// details about the copylocks checker. -func (*CancellableTimer) Unlock() {} - -// NewCancellableTimer returns an unscheduled CancellableTimer with the given -// locker and fn. -// -// fn MUST NOT attempt to lock locker. -// -// Callers must call Reset to schedule the timer to fire. -func NewCancellableTimer(locker sync.Locker, fn func()) *CancellableTimer { - return &CancellableTimer{locker: locker, fn: fn} +// NewJob returns a new Job that can be used to schedule f to run in its own +// gorountine. l will be locked before calling f then unlocked after f returns. 
+// +// var clock tcpip.StdClock +// var mu sync.Mutex +// message := "foo" +// job := tcpip.NewJob(&clock, &mu, func() { +// fmt.Println(message) +// }) +// job.Schedule(time.Second) +// +// mu.Lock() +// message = "bar" +// mu.Unlock() +// +// // Output: bar +// +// f MUST NOT attempt to lock l. +// +// l MUST be locked prior to calling the returned job's Cancel(). +// +// var clock tcpip.StdClock +// var mu sync.Mutex +// message := "foo" +// job := tcpip.NewJob(&clock, &mu, func() { +// fmt.Println(message) +// }) +// job.Schedule(time.Second) +// +// mu.Lock() +// job.Cancel() +// mu.Unlock() +func NewJob(c Clock, l sync.Locker, f func()) *Job { + return &Job{ + clock: c, + locker: l, + fn: f, + } } diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go index b4940e397..a82384c49 100644 --- a/pkg/tcpip/timer_test.go +++ b/pkg/tcpip/timer_test.go @@ -28,8 +28,8 @@ const ( longDuration = 1 * time.Second ) -func TestCancellableTimerReassignment(t *testing.T) { - var timer tcpip.CancellableTimer +func TestJobReschedule(t *testing.T) { + var clock tcpip.StdClock var wg sync.WaitGroup var lock sync.Mutex @@ -43,26 +43,27 @@ func TestCancellableTimerReassignment(t *testing.T) { // that has an active timer (even if it has been stopped as a stopped // timer may be blocked on a lock before it can check if it has been // stopped while another goroutine holds the same lock). - timer = *tcpip.NewCancellableTimer(&lock, func() { + job := tcpip.NewJob(&clock, &lock, func() { wg.Done() }) - timer.Reset(shortDuration) + job.Schedule(shortDuration) lock.Unlock() }() } wg.Wait() } -func TestCancellableTimerFire(t *testing.T) { +func TestJobExecution(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) - timer := tcpip.NewCancellableTimer(&lock, func() { + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -82,17 +83,18 @@ func TestCancellableTimerFire(t *testing.T) { func TestCancellableTimerResetFromLongDuration(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(middleDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(middleDuration) lock.Lock() - timer.StopLocked() + job.Cancel() lock.Unlock() - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. select { @@ -109,16 +111,17 @@ func TestCancellableTimerResetFromLongDuration(t *testing.T) { } } -func TestCancellableTimerResetFromShortDuration(t *testing.T) { +func TestJobRescheduleFromShortDuration(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() // Wait for timer to fire if it wasn't correctly stopped. @@ -128,7 +131,7 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) { case <-time.After(middleDuration): } - timer.Reset(shortDuration) + job.Schedule(shortDuration) // Wait for timer to fire. 
select { @@ -145,17 +148,18 @@ func TestCancellableTimerResetFromShortDuration(t *testing.T) { } } -func TestCancellableTimerImmediatelyStop(t *testing.T) { +func TestJobImmediatelyCancel(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) for i := 0; i < 1000; i++ { lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() } @@ -167,25 +171,26 @@ func TestCancellableTimerImmediatelyStop(t *testing.T) { } } -func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) { +func TestJobCancelledRescheduleWithoutLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) - timer.StopLocked() + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) + job.Cancel() lock.Unlock() for i := 0; i < 10; i++ { - timer.Reset(middleDuration) + job.Schedule(middleDuration) lock.Lock() // Sleep until the timer fires and gets blocked trying to take the lock. time.Sleep(middleDuration * 2) - timer.StopLocked() + job.Cancel() lock.Unlock() } @@ -201,17 +206,18 @@ func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) { func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) for i := 0; i < 10; i++ { // Sleep until the timer fires and gets blocked trying to take the lock. 
time.Sleep(middleDuration) - timer.StopLocked() - timer.Reset(shortDuration) + job.Cancel() + job.Schedule(shortDuration) } lock.Unlock() @@ -230,18 +236,19 @@ func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) { } } -func TestManyCancellableTimerResetUnderLock(t *testing.T) { +func TestManyJobReschedulesUnderLock(t *testing.T) { t.Parallel() - ch := make(chan struct{}) + var clock tcpip.StdClock var lock sync.Mutex + ch := make(chan struct{}) lock.Lock() - timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} }) - timer.Reset(shortDuration) + job := tcpip.NewJob(&clock, &lock, func() { ch <- struct{}{} }) + job.Schedule(shortDuration) for i := 0; i < 10; i++ { - timer.StopLocked() - timer.Reset(shortDuration) + job.Cancel() + job.Schedule(shortDuration) } lock.Unlock() diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD index 9ce625c17..7e5c79776 100644 --- a/pkg/tcpip/transport/icmp/BUILD +++ b/pkg/tcpip/transport/icmp/BUILD @@ -31,6 +31,7 @@ go_library( "//pkg/tcpip", "//pkg/tcpip/buffer", "//pkg/tcpip/header", + "//pkg/tcpip/ports", "//pkg/tcpip/stack", "//pkg/tcpip/transport/raw", "//pkg/tcpip/transport/tcp", diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go index b1d820372..bd6f49eb8 100644 --- a/pkg/tcpip/transport/icmp/endpoint.go +++ b/pkg/tcpip/transport/icmp/endpoint.go @@ -19,6 +19,7 @@ import ( "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/ports" "gvisor.dev/gvisor/pkg/tcpip/stack" "gvisor.dev/gvisor/pkg/waiter" ) @@ -110,7 +111,7 @@ func (e *endpoint) Close() { e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite switch e.state { case stateBound, stateConnected: - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, []tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, 0 /* bindToDevice */) + e.stack.UnregisterTransportEndpoint(e.RegisterNICID, []tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, ports.Flags{}, 0 /* bindToDevice */) } // Close the receive list and drain it. @@ -140,11 +141,6 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } -// IPTables implements tcpip.Endpoint.IPTables. -func (e *endpoint) IPTables() (stack.IPTables, error) { - return e.stack.IPTables(), nil -} - // Read reads data from the endpoint. This method does not block if // there is no data pending. func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { @@ -348,6 +344,10 @@ func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) { // SetSockOpt sets a socket option. func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { + switch opt.(type) { + case tcpip.SocketDetachFilterOption: + return nil + } return nil } @@ -430,9 +430,12 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpi return tcpip.ErrInvalidEndpointState } - hdr := buffer.NewPrependable(header.ICMPv4MinimumSize + int(r.MaxHeaderLength())) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.ICMPv4MinimumSize + int(r.MaxHeaderLength()), + }) + pkt.Owner = owner - icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) + icmpv4 := header.ICMPv4(pkt.TransportHeader().Push(header.ICMPv4MinimumSize)) copy(icmpv4, data) // Set the ident to the user-specified port. Sequence number should // already be set by the user. 
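send4 here is being rewritten around *stack.PacketBuffer: the ICMP header is pushed into space reserved up front instead of being prepended to a separate Header view. A rough sketch of the new shape, modeled on the surrounding hunks (sendEcho, ident and payload are illustrative names; error handling beyond the return value is elided):

    func sendEcho(r *stack.Route, ident uint16, payload buffer.View) *tcpip.Error {
        // Reserve room for the ICMP header plus whatever lower layers need.
        pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{
            ReserveHeaderBytes: header.ICMPv4MinimumSize + int(r.MaxHeaderLength()),
        })
        icmpv4 := header.ICMPv4(pkt.TransportHeader().Push(header.ICMPv4MinimumSize))
        icmpv4.SetType(header.ICMPv4Echo)
        icmpv4.SetCode(0)
        icmpv4.SetIdent(ident)
        icmpv4.SetChecksum(0)
        icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(payload, 0)))
        pkt.Data = payload.ToVectorisedView()
        return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
            Protocol: header.ICMPv4ProtocolNumber,
            TTL:      r.DefaultTTL(),
            TOS:      stack.DefaultTOS,
        }, pkt)
    }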
@@ -447,15 +450,12 @@ func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpi icmpv4.SetChecksum(0) icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0))) + pkt.Data = data.ToVectorisedView() + if ttl == 0 { ttl = r.DefaultTTL() } - return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: data.ToVectorisedView(), - TransportHeader: buffer.View(icmpv4), - Owner: owner, - }) + return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, pkt) } func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Error { @@ -463,9 +463,11 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err return tcpip.ErrInvalidEndpointState } - hdr := buffer.NewPrependable(header.ICMPv6MinimumSize + int(r.MaxHeaderLength())) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.ICMPv6MinimumSize + int(r.MaxHeaderLength()), + }) - icmpv6 := header.ICMPv6(hdr.Prepend(header.ICMPv6MinimumSize)) + icmpv6 := header.ICMPv6(pkt.TransportHeader().Push(header.ICMPv6MinimumSize)) copy(icmpv6, data) // Set the ident. Sequence number is provided by the user. icmpv6.SetIdent(ident) @@ -477,15 +479,12 @@ func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Err dataVV := data.ToVectorisedView() icmpv6.SetChecksum(header.ICMPv6Checksum(icmpv6, r.LocalAddress, r.RemoteAddress, dataVV)) + pkt.Data = dataVV if ttl == 0 { ttl = r.DefaultTTL() } - return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: dataVV, - TransportHeader: buffer.View(icmpv6), - }) + return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, pkt) } // checkV4MappedLocked determines the effective network protocol and converts @@ -511,6 +510,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { nicID := addr.NIC localPort := uint16(0) switch e.state { + case stateInitial: case stateBound, stateConnected: localPort = e.ID.LocalPort if e.BindNICID == 0 { @@ -611,14 +611,14 @@ func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.Networ if id.LocalPort != 0 { // The endpoint already has a local port, just attempt to // register it. - err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindToDevice */) + err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindToDevice */) return id, err } // We need to find a port for the endpoint. _, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) { id.LocalPort = p - err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, false /* reuse */, 0 /* bindtodevice */) + err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindtodevice */) switch err { case nil: return true, nil @@ -743,19 +743,23 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. 
-func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { // Only accept echo replies. switch e.NetProto { case header.IPv4ProtocolNumber: - h, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize) - if !ok || header.ICMPv4(h).Type() != header.ICMPv4EchoReply { + h := header.ICMPv4(pkt.TransportHeader().View()) + // TODO(b/129292233): Determine if len(h) check is still needed after early + // parsing. + if len(h) < header.ICMPv4MinimumSize || h.Type() != header.ICMPv4EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } case header.IPv6ProtocolNumber: - h, ok := pkt.Data.PullUp(header.ICMPv6MinimumSize) - if !ok || header.ICMPv6(h).Type() != header.ICMPv6EchoReply { + h := header.ICMPv6(pkt.TransportHeader().View()) + // TODO(b/129292233): Determine if len(h) check is still needed after early + // parsing. + if len(h) < header.ICMPv6MinimumSize || h.Type() != header.ICMPv6EchoReply { e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return @@ -789,12 +793,14 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk }, } - packet.data = pkt.Data + // ICMP socket's data includes ICMP header. + packet.data = pkt.TransportHeader().View().ToVectorisedView() + packet.data.Append(pkt.Data) e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Size() - packet.timestamp = e.stack.NowNanoseconds() + packet.timestamp = e.stack.Clock().NowNanoseconds() e.rcvMu.Unlock() e.stats.PacketsReceived.Increment() @@ -805,7 +811,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk } // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. -func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) { +func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { } // State implements tcpip.Endpoint.State. The ICMP endpoint currently doesn't diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go index 3c47692b2..74ef6541e 100644 --- a/pkg/tcpip/transport/icmp/protocol.go +++ b/pkg/tcpip/transport/icmp/protocol.go @@ -104,7 +104,7 @@ func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. -func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, stack.PacketBuffer) bool { +func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool { return true } @@ -124,6 +124,16 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} +// Parse implements stack.TransportProtocol.Parse. +func (*protocol) Parse(pkt *stack.PacketBuffer) bool { + // TODO(gvisor.dev/issue/170): Implement parsing of ICMP. + // + // Right now, the Parse() method is tied to enabled protocols passed into + // stack.New. This works for UDP and TCP, but we handle ICMP traffic even + // when netstack users don't pass ICMP as a supported protocol. + return false +} + // NewProtocol4 returns an ICMPv4 transport protocol. 
func NewProtocol4() stack.TransportProtocol { return &protocol{ProtocolNumber4} diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go index 23158173d..1b03ad6bb 100644 --- a/pkg/tcpip/transport/packet/endpoint.go +++ b/pkg/tcpip/transport/packet/endpoint.go @@ -25,6 +25,8 @@ package packet import ( + "fmt" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -43,6 +45,9 @@ type packet struct { timestampNS int64 // senderAddr is the network address of the sender. senderAddr tcpip.FullAddress + // packetInfo holds additional information like the protocol + // of the packet etc. + packetInfo tcpip.LinkPacketInfo } // endpoint is the packet socket implementation of tcpip.Endpoint. It is legal @@ -71,11 +76,17 @@ type endpoint struct { rcvClosed bool // The following fields are protected by mu. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - closed bool - stats tcpip.TransportEndpointStats `state:"nosave"` - bound bool + mu sync.RWMutex `state:"nosave"` + sndBufSize int + sndBufSizeMax int + closed bool + stats tcpip.TransportEndpointStats `state:"nosave"` + bound bool + boundNIC tcpip.NICID + + // lastErrorMu protects lastError. + lastErrorMu sync.Mutex `state:"nosave"` + lastError *tcpip.Error `state:".(string)"` } // NewEndpoint returns a new packet endpoint. @@ -92,6 +103,17 @@ func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumb sndBufSize: 32 * 1024, } + // Override with stack defaults. + var ss stack.SendBufferSizeOption + if err := s.Option(&ss); err == nil { + ep.sndBufSizeMax = ss.Default + } + + var rs stack.ReceiveBufferSizeOption + if err := s.Option(&rs); err == nil { + ep.rcvBufSizeMax = rs.Default + } + if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil { return nil, err } @@ -132,13 +154,8 @@ func (ep *endpoint) Close() { // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (ep *endpoint) ModerateRecvBuf(copied int) {} -// IPTables implements tcpip.Endpoint.IPTables. -func (ep *endpoint) IPTables() (stack.IPTables, error) { - return ep.stack.IPTables(), nil -} - -// Read implements tcpip.Endpoint.Read. -func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { +// Read implements tcpip.PacketEndpoint.ReadPacket. +func (ep *endpoint) ReadPacket(addr *tcpip.FullAddress, info *tcpip.LinkPacketInfo) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { ep.rcvMu.Lock() // If there's no data to read, return that read would block or that the @@ -163,11 +180,20 @@ func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMes *addr = packet.senderAddr } + if info != nil { + *info = packet.packetInfo + } + return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil } +// Read implements tcpip.Endpoint.Read. +func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { + return ep.ReadPacket(addr, nil) +} + func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { - // TODO(b/129292371): Implement. + // TODO(gvisor.dev/issue/173): Implement. 
return 0, nil, tcpip.ErrInvalidOptionValue } @@ -220,12 +246,14 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { ep.mu.Lock() defer ep.mu.Unlock() - if ep.bound { - return tcpip.ErrAlreadyBound + if ep.bound && ep.boundNIC == addr.NIC { + // If the NIC being bound is the same then just return success. + return nil } // Unregister endpoint with all the nics. ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep) + ep.bound = false // Bind endpoint to receive packets from specific interface. if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil { @@ -233,6 +261,7 @@ func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { } ep.bound = true + ep.boundNIC = addr.NIC return nil } @@ -269,7 +298,13 @@ func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // used with SetSockOpt, and this function always returns // tcpip.ErrNotSupported. func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption + switch opt.(type) { + case tcpip.SocketDetachFilterOption: + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. @@ -279,11 +314,63 @@ func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption + switch opt { + case tcpip.SendBufferSizeOption: + // Make sure the send buffer size is within the min and max + // allowed. + var ss stack.SendBufferSizeOption + if err := ep.stack.Option(&ss); err != nil { + panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err)) + } + if v > ss.Max { + v = ss.Max + } + if v < ss.Min { + v = ss.Min + } + ep.mu.Lock() + ep.sndBufSizeMax = v + ep.mu.Unlock() + return nil + + case tcpip.ReceiveBufferSizeOption: + // Make sure the receive buffer size is within the min and max + // allowed. + var rs stack.ReceiveBufferSizeOption + if err := ep.stack.Option(&rs); err != nil { + panic(fmt.Sprintf("s.Option(%#v) = %s", rs, err)) + } + if v > rs.Max { + v = rs.Max + } + if v < rs.Min { + v = rs.Min + } + ep.rcvMu.Lock() + ep.rcvBufSizeMax = v + ep.rcvMu.Unlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } +} + +func (ep *endpoint) takeLastError() *tcpip.Error { + ep.lastErrorMu.Lock() + defer ep.lastErrorMu.Unlock() + + err := ep.lastError + ep.lastError = nil + return err } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { + switch opt.(type) { + case tcpip.ErrorOption: + return ep.takeLastError() + } return tcpip.ErrNotSupported } @@ -294,11 +381,36 @@ func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 
func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { - return 0, tcpip.ErrNotSupported + switch opt { + case tcpip.ReceiveQueueSizeOption: + v := 0 + ep.rcvMu.Lock() + if !ep.rcvList.Empty() { + p := ep.rcvList.Front() + v = p.data.Size() + } + ep.rcvMu.Unlock() + return v, nil + + case tcpip.SendBufferSizeOption: + ep.mu.Lock() + v := ep.sndBufSizeMax + ep.mu.Unlock() + return v, nil + + case tcpip.ReceiveBufferSizeOption: + ep.rcvMu.Lock() + v := ep.rcvBufSizeMax + ep.rcvMu.Unlock() + return v, nil + + default: + return -1, tcpip.ErrUnknownProtocolOption + } } // HandlePacket implements stack.PacketEndpoint.HandlePacket. -func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt stack.PacketBuffer) { +func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { ep.rcvMu.Lock() // Drop the packet if our buffer is currently full. @@ -320,48 +432,73 @@ func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, // Push new packet into receive list and increment the buffer size. var packet packet - // TODO(b/129292371): Return network protocol. - if len(pkt.LinkHeader) > 0 { + // TODO(gvisor.dev/issue/173): Return network protocol. + if !pkt.LinkHeader().View().IsEmpty() { // Get info directly from the ethernet header. - hdr := header.Ethernet(pkt.LinkHeader) + hdr := header.Ethernet(pkt.LinkHeader().View()) packet.senderAddr = tcpip.FullAddress{ NIC: nicID, Addr: tcpip.Address(hdr.SourceAddress()), } + packet.packetInfo.Protocol = netProto + packet.packetInfo.PktType = pkt.PktType } else { // Guess the would-be ethernet header. packet.senderAddr = tcpip.FullAddress{ NIC: nicID, Addr: tcpip.Address(localAddr), } + packet.packetInfo.Protocol = netProto + packet.packetInfo.PktType = pkt.PktType } if ep.cooked { // Cooked packets can simply be queued. - packet.data = pkt.Data + switch pkt.PktType { + case tcpip.PacketHost: + packet.data = pkt.Data + case tcpip.PacketOutgoing: + // Strip Link Header. + var combinedVV buffer.VectorisedView + if v := pkt.NetworkHeader().View(); !v.IsEmpty() { + combinedVV.AppendView(v) + } + if v := pkt.TransportHeader().View(); !v.IsEmpty() { + combinedVV.AppendView(v) + } + combinedVV.Append(pkt.Data) + packet.data = combinedVV + default: + panic(fmt.Sprintf("unexpected PktType in pkt: %+v", pkt)) + } + } else { // Raw packets need their ethernet headers prepended before // queueing. var linkHeader buffer.View - if len(pkt.LinkHeader) == 0 { - // We weren't provided with an actual ethernet header, - // so fake one. - ethFields := header.EthernetFields{ - SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), - DstAddr: localAddr, - Type: netProto, + if pkt.PktType != tcpip.PacketOutgoing { + if pkt.LinkHeader().View().IsEmpty() { + // We weren't provided with an actual ethernet header, + // so fake one. + ethFields := header.EthernetFields{ + SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}), + DstAddr: localAddr, + Type: netProto, + } + fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) + fakeHeader.Encode(ðFields) + linkHeader = buffer.View(fakeHeader) + } else { + linkHeader = append(buffer.View(nil), pkt.LinkHeader().View()...) 
} - fakeHeader := make(header.Ethernet, header.EthernetMinimumSize) - fakeHeader.Encode(ðFields) - linkHeader = buffer.View(fakeHeader) + combinedVV := linkHeader.ToVectorisedView() + combinedVV.Append(pkt.Data) + packet.data = combinedVV } else { - linkHeader = append(buffer.View(nil), pkt.LinkHeader...) + packet.data = buffer.NewVectorisedView(pkt.Size(), pkt.Views()) } - combinedVV := linkHeader.ToVectorisedView() - combinedVV.Append(pkt.Data) - packet.data = combinedVV } - packet.timestampNS = ep.stack.NowNanoseconds() + packet.timestampNS = ep.stack.Clock().NowNanoseconds() ep.rcvList.PushBack(&packet) ep.rcvBufSize += packet.data.Size() diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go index 9b88f17e4..e2fa96d17 100644 --- a/pkg/tcpip/transport/packet/endpoint_state.go +++ b/pkg/tcpip/transport/packet/endpoint_state.go @@ -15,6 +15,7 @@ package packet import ( + "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" "gvisor.dev/gvisor/pkg/tcpip/stack" ) @@ -70,3 +71,21 @@ func (ep *endpoint) afterLoad() { panic(*err) } } + +// saveLastError is invoked by stateify. +func (ep *endpoint) saveLastError() string { + if ep.lastError == nil { + return "" + } + + return ep.lastError.String() +} + +// loadLastError is invoked by stateify. +func (ep *endpoint) loadLastError(s string) { + if s == "" { + return + } + + ep.lastError = tcpip.StringToError(s) +} diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go index eee754a5a..edc2b5b61 100644 --- a/pkg/tcpip/transport/raw/endpoint.go +++ b/pkg/tcpip/transport/raw/endpoint.go @@ -26,6 +26,8 @@ package raw import ( + "fmt" + "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -61,21 +63,23 @@ type endpoint struct { stack *stack.Stack `state:"manual"` waiterQueue *waiter.Queue associated bool + hdrIncluded bool // The following fields are used to manage the receive queue and are // protected by rcvMu. rcvMu sync.Mutex `state:"nosave"` rcvList rawPacketList - rcvBufSizeMax int `state:".(int)"` rcvBufSize int + rcvBufSizeMax int `state:".(int)"` rcvClosed bool // The following fields are protected by mu. - mu sync.RWMutex `state:"nosave"` - sndBufSize int - closed bool - connected bool - bound bool + mu sync.RWMutex `state:"nosave"` + sndBufSize int + sndBufSizeMax int + closed bool + connected bool + bound bool // route is the route to a remote network endpoint. It is set via // Connect(), and is valid only when conneted is true. route stack.Route `state:"manual"` @@ -91,7 +95,7 @@ func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, trans } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) { - if netProto != header.IPv4ProtocolNumber { + if netProto != header.IPv4ProtocolNumber && netProto != header.IPv6ProtocolNumber { return nil, tcpip.ErrUnknownProtocol } @@ -103,8 +107,20 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProt }, waiterQueue: waiterQueue, rcvBufSizeMax: 32 * 1024, - sndBufSize: 32 * 1024, + sndBufSizeMax: 32 * 1024, associated: associated, + hdrIncluded: !associated, + } + + // Override with stack defaults. 
+ var ss stack.SendBufferSizeOption + if err := s.Option(&ss); err == nil { + e.sndBufSizeMax = ss.Default + } + + var rs stack.ReceiveBufferSizeOption + if err := s.Option(&rs); err == nil { + e.rcvBufSizeMax = rs.Default } // Unassociated endpoints are write-only and users call Write() with IP @@ -166,17 +182,8 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } -// IPTables implements tcpip.Endpoint.IPTables. -func (e *endpoint) IPTables() (stack.IPTables, error) { - return e.stack.IPTables(), nil -} - // Read implements tcpip.Endpoint.Read. func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { - if !e.associated { - return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidOptionValue - } - e.rcvMu.Lock() // If there's no data to read, return that read would block or that the @@ -206,6 +213,11 @@ func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMess // Write implements tcpip.Endpoint.Write. func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) { + // We can create, but not write to, unassociated IPv6 endpoints. + if !e.associated && e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber { + return 0, nil, tcpip.ErrInvalidOptionValue + } + n, ch, err := e.write(p, opts) switch err { case nil: @@ -249,7 +261,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c // If this is an unassociated socket and callee provided a nonzero // destination address, route using that address. - if !e.associated { + if e.hdrIncluded { ip := header.IPv4(payloadBytes) if !ip.IsValid(len(payloadBytes)) { e.mu.RUnlock() @@ -310,12 +322,6 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c return 0, nil, tcpip.ErrNoRoute } - // We don't support IPv6 yet, so this has to be an IPv4 address. - if len(opts.To.Addr) != header.IPv4AddressSize { - e.mu.RUnlock() - return 0, nil, tcpip.ErrInvalidEndpointState - } - // Find the route to the destination. If BindAddress is 0, // FindRoute will choose an appropriate source address. 
route, err := e.stack.FindRoute(nic, e.BindAddr, opts.To.Addr, e.NetProto, false) @@ -345,28 +351,26 @@ func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, } } - switch e.NetProto { - case header.IPv4ProtocolNumber: - if !e.associated { - if err := route.WriteHeaderIncludedPacket(stack.PacketBuffer{ - Data: buffer.View(payloadBytes).ToVectorisedView(), - }); err != nil { - return 0, nil, err - } - break + if e.hdrIncluded { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buffer.View(payloadBytes).ToVectorisedView(), + }) + if err := route.WriteHeaderIncludedPacket(pkt); err != nil { + return 0, nil, err } - - hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength())) - if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: buffer.View(payloadBytes).ToVectorisedView(), - Owner: e.owner, - }); err != nil { + } else { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: int(route.MaxHeaderLength()), + Data: buffer.View(payloadBytes).ToVectorisedView(), + }) + pkt.Owner = e.owner + if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{ + Protocol: e.TransProto, + TTL: route.DefaultTTL(), + TOS: stack.DefaultTOS, + }, pkt); err != nil { return 0, nil, err } - - default: - return 0, nil, tcpip.ErrUnknownProtocol } return int64(len(payloadBytes)), nil, nil @@ -391,11 +395,6 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { return tcpip.ErrInvalidEndpointState } - // We don't support IPv6 yet. - if len(addr.Addr) != header.IPv4AddressSize { - return tcpip.ErrInvalidEndpointState - } - nic := addr.NIC if e.bound { if e.BindNICID == 0 { @@ -461,14 +460,8 @@ func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error { e.mu.Lock() defer e.mu.Unlock() - // Callers must provide an IPv4 address or no network address (for - // binding to a NIC, but not an address). - if len(addr.Addr) != 0 && len(addr.Addr) != 4 { - return tcpip.ErrInvalidEndpointState - } - // If a local address was specified, verify that it's valid. - if len(addr.Addr) == header.IPv4AddressSize && e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 { + if len(addr.Addr) != 0 && e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 { return tcpip.ErrBadLocalAddress } @@ -518,17 +511,69 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // SetSockOpt implements tcpip.Endpoint.SetSockOpt. func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption + switch opt.(type) { + case tcpip.SocketDetachFilterOption: + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } // SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { + switch opt { + case tcpip.IPHdrIncludedOption: + e.mu.Lock() + e.hdrIncluded = v + e.mu.Unlock() + return nil + } return tcpip.ErrUnknownProtocolOption } // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { - return tcpip.ErrUnknownProtocolOption + switch opt { + case tcpip.SendBufferSizeOption: + // Make sure the send buffer size is within the min and max + // allowed. 
+ var ss stack.SendBufferSizeOption + if err := e.stack.Option(&ss); err != nil { + panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err)) + } + if v > ss.Max { + v = ss.Max + } + if v < ss.Min { + v = ss.Min + } + e.mu.Lock() + e.sndBufSizeMax = v + e.mu.Unlock() + return nil + + case tcpip.ReceiveBufferSizeOption: + // Make sure the receive buffer size is within the min and max + // allowed. + var rs stack.ReceiveBufferSizeOption + if err := e.stack.Option(&rs); err != nil { + panic(fmt.Sprintf("s.Option(%#v) = %s", rs, err)) + } + if v > rs.Max { + v = rs.Max + } + if v < rs.Min { + v = rs.Min + } + e.rcvMu.Lock() + e.rcvBufSizeMax = v + e.rcvMu.Unlock() + return nil + + default: + return tcpip.ErrUnknownProtocolOption + } } // GetSockOpt implements tcpip.Endpoint.GetSockOpt. @@ -548,6 +593,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { case tcpip.KeepaliveEnabledOption: return false, nil + case tcpip.IPHdrIncludedOption: + e.mu.Lock() + v := e.hdrIncluded + e.mu.Unlock() + return v, nil + default: return false, tcpip.ErrUnknownProtocolOption } @@ -568,7 +619,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { case tcpip.SendBufferSizeOption: e.mu.Lock() - v := e.sndBufSize + v := e.sndBufSizeMax e.mu.Unlock() return v, nil @@ -584,11 +635,18 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { } // HandlePacket implements stack.RawTransportEndpoint.HandlePacket. -func (e *endpoint) HandlePacket(route *stack.Route, pkt stack.PacketBuffer) { +func (e *endpoint) HandlePacket(route *stack.Route, pkt *stack.PacketBuffer) { e.rcvMu.Lock() - // Drop the packet if our buffer is currently full. - if e.rcvClosed { + // Drop the packet if our buffer is currently full or if this is an unassociated + // endpoint (i.e endpoint created w/ IPPROTO_RAW). Such endpoints are send only + // See: https://man7.org/linux/man-pages/man7/raw.7.html + // + // An IPPROTO_RAW socket is send only. If you really want to receive + // all IP packets, use a packet(7) socket with the ETH_P_IP protocol. + // Note that packet sockets don't reassemble IP fragments, unlike raw + // sockets. + if e.rcvClosed || !e.associated { e.rcvMu.Unlock() e.stack.Stats().DroppedPackets.Increment() e.stats.ReceiveErrors.ClosedReceiver.Increment() @@ -632,15 +690,26 @@ func (e *endpoint) HandlePacket(route *stack.Route, pkt stack.PacketBuffer) { }, } - networkHeader := append(buffer.View(nil), pkt.NetworkHeader...) - combinedVV := networkHeader.ToVectorisedView() + // Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not. + // We copy headers' underlying bytes because pkt.*Header may point to + // the middle of a slice, and another struct may point to the "outer" + // slice. Save/restore doesn't support overlapping slices and will fail. + var combinedVV buffer.VectorisedView + if e.TransportEndpointInfo.NetProto == header.IPv4ProtocolNumber { + network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() + headers := make(buffer.View, 0, len(network)+len(transport)) + headers = append(headers, network...) + headers = append(headers, transport...) 
+ combinedVV = headers.ToVectorisedView() + } else { + combinedVV = append(buffer.View(nil), pkt.TransportHeader().View()...).ToVectorisedView() + } combinedVV.Append(pkt.Data) packet.data = combinedVV - packet.timestampNS = e.stack.NowNanoseconds() + packet.timestampNS = e.stack.Clock().NowNanoseconds() e.rcvList.PushBack(packet) e.rcvBufSize += packet.data.Size() - e.rcvMu.Unlock() e.stats.PacketsReceived.Increment() // Notify waiters that there's data to be read. diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD index f38eb6833..234fb95ce 100644 --- a/pkg/tcpip/transport/tcp/BUILD +++ b/pkg/tcpip/transport/tcp/BUILD @@ -40,6 +40,8 @@ go_library( "endpoint_state.go", "forwarder.go", "protocol.go", + "rack.go", + "rack_state.go", "rcv.go", "rcv_state.go", "reno.go", @@ -49,6 +51,7 @@ go_library( "segment_heap.go", "segment_queue.go", "segment_state.go", + "segment_unsafe.go", "snd.go", "snd_state.go", "tcp_endpoint_list.go", @@ -76,20 +79,18 @@ go_library( ) go_test( - name = "tcp_test", + name = "tcp_x_test", size = "medium", srcs = [ "dual_stack_test.go", "sack_scoreboard_test.go", "tcp_noracedetector_test.go", + "tcp_rack_test.go", "tcp_sack_test.go", "tcp_test.go", "tcp_timestamp_test.go", ], - # FIXME(b/68809571) - tags = [ - "flaky", - ], + shard_count = 10, deps = [ ":tcp", "//pkg/sync", @@ -119,3 +120,11 @@ go_test( "//pkg/tcpip/seqnum", ], ) + +go_test( + name = "tcp_test", + size = "small", + srcs = ["timer_test.go"], + library = ":tcp", + deps = ["//pkg/sleep"], +) diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go index e6a23c978..913ea6535 100644 --- a/pkg/tcpip/transport/tcp/accept.go +++ b/pkg/tcpip/transport/tcp/accept.go @@ -198,9 +198,8 @@ func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnu } // createConnectingEndpoint creates a new endpoint in a connecting state, with -// the connection parameters given by the arguments. The endpoint is returned -// with n.mu held. -func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) (*endpoint, *tcpip.Error) { +// the connection parameters given by the arguments. +func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) *endpoint { // Create a new endpoint. netProto := l.netProto if netProto == 0 { @@ -221,32 +220,12 @@ func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, i n.initGSO() - // Create sender and receiver. - // - // The receiver at least temporarily has a zero receive window scale, - // but the caller may change it (before starting the protocol loop). - n.snd = newSender(n, iss, irs, s.window, rcvdSynOpts.MSS, rcvdSynOpts.WS) - n.rcv = newReceiver(n, irs, seqnum.Size(n.initialReceiveWindow()), 0, seqnum.Size(n.receiveBufferSize())) // Bootstrap the auto tuning algorithm. Starting at zero will result in // a large step function on the first window adjustment causing the // window to grow to a really large value. n.rcvAutoParams.prevCopied = n.initialReceiveWindow() - // Lock the endpoint before registering to ensure that no out of - // band changes are possible due to incoming packets etc till - // the endpoint is done initializing. - n.mu.Lock() - - // Register new endpoint so that packets are routed to it. 
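Condensed sketch of the raw-socket delivery assembly above: the network and transport header views may alias the packet's backing slice, and save/restore cannot handle overlapping slices, so they are copied into fresh memory before being combined with the payload (hypothetical package and function names; types from pkg/tcpip/buffer):

package rawsketch

import "gvisor.dev/gvisor/pkg/tcpip/buffer"

// combineForRawIPv4 condenses the copy performed above for IPv4 raw
// endpoints; for other protocols only the transport header is prepended.
func combineForRawIPv4(network, transport buffer.View, payload buffer.VectorisedView) buffer.VectorisedView {
	hdrs := make(buffer.View, 0, len(network)+len(transport))
	hdrs = append(hdrs, network...)
	hdrs = append(hdrs, transport...)
	vv := hdrs.ToVectorisedView()
	vv.Append(payload)
	return vv
}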
- if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.reusePort, n.boundBindToDevice); err != nil { - n.mu.Unlock() - n.Close() - return nil, err - } - - n.isRegistered = true - - return n, nil + return n } // createEndpointAndPerformHandshake creates a new endpoint in connected state @@ -257,10 +236,12 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head // Create new endpoint. irs := s.sequenceNumber isn := generateSecureISN(s.id, l.stack.Seed()) - ep, err := l.createConnectingEndpoint(s, isn, irs, opts, queue) - if err != nil { - return nil, err - } + ep := l.createConnectingEndpoint(s, isn, irs, opts, queue) + + // Lock the endpoint before registering to ensure that no out of + // band changes are possible due to incoming packets etc till + // the endpoint is done initializing. + ep.mu.Lock() ep.owner = owner // listenEP is nil when listenContext is used by tcp.Forwarder. @@ -268,18 +249,13 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head if l.listenEP != nil { l.listenEP.mu.Lock() if l.listenEP.EndpointState() != StateListen { + l.listenEP.mu.Unlock() // Ensure we release any registrations done by the newly // created endpoint. ep.mu.Unlock() ep.Close() - // Wake up any waiters. This is strictly not required normally - // as a socket that was never accepted can't really have any - // registered waiters except when stack.Wait() is called which - // waits for all registered endpoints to stop and expects an - // EventHUp. - ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) return nil, tcpip.ErrConnectionAborted } l.addPendingEndpoint(ep) @@ -288,21 +264,44 @@ func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *head // to the newly created endpoint. l.listenEP.propagateInheritableOptionsLocked(ep) + if !ep.reserveTupleLocked() { + ep.mu.Unlock() + ep.Close() + + if l.listenEP != nil { + l.removePendingEndpoint(ep) + l.listenEP.mu.Unlock() + } + + return nil, tcpip.ErrConnectionAborted + } + deferAccept = l.listenEP.deferAccept l.listenEP.mu.Unlock() } + // Register new endpoint so that packets are routed to it. + if err := ep.stack.RegisterTransportEndpoint(ep.boundNICID, ep.effectiveNetProtos, ProtocolNumber, ep.ID, ep, ep.boundPortFlags, ep.boundBindToDevice); err != nil { + ep.mu.Unlock() + ep.Close() + + if l.listenEP != nil { + l.removePendingEndpoint(ep) + } + + ep.drainClosingSegmentQueue() + + return nil, err + } + + ep.isRegistered = true + // Perform the 3-way handshake. - h := newPassiveHandshake(ep, ep.rcv.rcvWnd, isn, irs, opts, deferAccept) + h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept) if err := h.execute(); err != nil { ep.mu.Unlock() ep.Close() - // Wake up any waiters. This is strictly not required normally - // as a socket that was never accepted can't really have any - // registered waiters except when stack.Wait() is called which - // waits for all registered endpoints to stop and expects an - // EventHUp. - ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) + ep.notifyAborted() if l.listenEP != nil { l.removePendingEndpoint(ep) @@ -378,6 +377,43 @@ func (e *endpoint) deliverAccepted(n *endpoint) { // Precondition: e.mu and n.mu must be held. 
func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) { n.userTimeout = e.userTimeout + n.portFlags = e.portFlags + n.boundBindToDevice = e.boundBindToDevice + n.boundPortFlags = e.boundPortFlags +} + +// reserveTupleLocked reserves an accepted endpoint's tuple. +// +// Preconditions: +// * propagateInheritableOptionsLocked has been called. +// * e.mu is held. +func (e *endpoint) reserveTupleLocked() bool { + dest := tcpip.FullAddress{Addr: e.ID.RemoteAddress, Port: e.ID.RemotePort} + if !e.stack.ReserveTuple( + e.effectiveNetProtos, + ProtocolNumber, + e.ID.LocalAddress, + e.ID.LocalPort, + e.boundPortFlags, + e.boundBindToDevice, + dest, + ) { + return false + } + + e.isPortReserved = true + e.boundDest = dest + return true +} + +// notifyAborted wakes up any waiters on registered, but not accepted +// endpoints. +// +// This is strictly not required normally as a socket that was never accepted +// can't really have any registered waiters except when stack.Wait() is called +// which waits for all registered endpoints to stop and expects an EventHUp. +func (e *endpoint) notifyAborted() { + e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) } // handleSynSegment is called in its own goroutine once the listening endpoint @@ -485,7 +521,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { synOpts := header.TCPSynOptions{ WS: -1, TS: opts.TS, - TSVal: tcpTimeStamp(timeStampOffset()), + TSVal: tcpTimeStamp(time.Now(), timeStampOffset()), TSEcr: opts.TSVal, MSS: mssForRoute(&s.route), } @@ -534,6 +570,9 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { return } + iss := s.ackNumber - 1 + irs := s.sequenceNumber - 1 + // Since SYN cookies are in use this is potentially an ACK to a // SYN-ACK we sent but don't have a half open connection state // as cookies are being used to protect against a potential SYN @@ -544,7 +583,7 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { // when under a potential syn flood attack. // // Validate the cookie. - data, ok := ctx.isCookieValid(s.id, s.ackNumber-1, s.sequenceNumber-1) + data, ok := ctx.isCookieValid(s.id, iss, irs) if !ok || int(data) >= len(mssTable) { e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment() e.stack.Stats().DroppedPackets.Increment() @@ -569,16 +608,34 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr } - n, err := ctx.createConnectingEndpoint(s, s.ackNumber-1, s.sequenceNumber-1, rcvdSynOptions, &waiter.Queue{}) - if err != nil { + n := ctx.createConnectingEndpoint(s, iss, irs, rcvdSynOptions, &waiter.Queue{}) + + n.mu.Lock() + + // Propagate any inheritable options from the listening endpoint + // to the newly created endpoint. + e.propagateInheritableOptionsLocked(n) + + if !n.reserveTupleLocked() { + n.mu.Unlock() + n.Close() + e.stack.Stats().TCP.FailedConnectionAttempts.Increment() e.stats.FailedConnectionAttempts.Increment() return } - // Propagate any inheritable options from the listening endpoint - // to the newly created endpoint. - e.propagateInheritableOptionsLocked(n) + // Register new endpoint so that packets are routed to it. 
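Recap of the SYN-cookie path above: when the final ACK arrives the listener holds no half-open state, so the handshake sequence numbers are recovered arithmetically from the segment before the cookie is validated. A self-contained sketch (plain uint32 stands in for seqnum.Value):

// recoverISNs sketches how iss/irs are derived above: the listener's SYN-ACK
// and the peer's SYN each consumed exactly one sequence number.
func recoverISNs(segSeq, segAck uint32) (iss, irs uint32) {
	iss = segAck - 1 // the ISN we encoded into the cookie
	irs = segSeq - 1 // the peer's initial sequence number
	return iss, irs
}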
+ if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.boundPortFlags, n.boundBindToDevice); err != nil { + n.mu.Unlock() + n.Close() + + e.stack.Stats().TCP.FailedConnectionAttempts.Increment() + e.stats.FailedConnectionAttempts.Increment() + return + } + + n.isRegistered = true // clear the tsOffset for the newly created // endpoint as the Timestamp was already @@ -587,10 +644,17 @@ func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) { n.tsOffset = 0 // Switch state to connected. - // We do not use transitionToStateEstablishedLocked here as there is - // no handshake state available when doing a SYN cookie based accept. n.isConnectNotified = true - n.setEndpointState(StateEstablished) + n.transitionToStateEstablishedLocked(&handshake{ + ep: n, + iss: iss, + ackNum: irs + 1, + rcvWnd: seqnum.Size(n.initialReceiveWindow()), + sndWnd: s.window, + rcvWndScale: e.rcvWndScaleForHandshake(), + sndWndScale: rcvdSynOptions.WS, + mss: rcvdSynOptions.MSS, + }) // Do the delivery in a separate goroutine so // that we don't block the listen loop in case diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go index a7e088d4e..290172ac9 100644 --- a/pkg/tcpip/transport/tcp/connect.go +++ b/pkg/tcpip/transport/tcp/connect.go @@ -490,6 +490,9 @@ func (h *handshake) resolveRoute() *tcpip.Error { <-h.ep.undrain h.ep.mu.Lock() } + if n¬ifyError != 0 { + return h.ep.takeLastError() + } } // Wait for notification. @@ -509,9 +512,7 @@ func (h *handshake) execute() *tcpip.Error { // Initialize the resend timer. resendWaker := sleep.Waker{} timeOut := time.Duration(time.Second) - rt := time.AfterFunc(timeOut, func() { - resendWaker.Assert() - }) + rt := time.AfterFunc(timeOut, resendWaker.Assert) defer rt.Stop() // Set up the wakers. @@ -618,6 +619,9 @@ func (h *handshake) execute() *tcpip.Error { <-h.ep.undrain h.ep.mu.Lock() } + if n¬ifyError != 0 { + return h.ep.takeLastError() + } case wakerForNewSegment: if err := h.processSegments(); err != nil { @@ -742,11 +746,7 @@ func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedV func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) { optLen := len(tf.opts) - hdr := &pkt.Header - packetSize := pkt.Data.Size() - // Initialize the header. - tcp := header.TCP(hdr.Prepend(header.TCPMinimumSize + optLen)) - pkt.TransportHeader = buffer.View(tcp) + tcp := header.TCP(pkt.TransportHeader().Push(header.TCPMinimumSize + optLen)) tcp.Encode(&header.TCPFields{ SrcPort: tf.id.LocalPort, DstPort: tf.id.RemotePort, @@ -758,8 +758,7 @@ func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *sta }) copy(tcp[header.TCPMinimumSize:], tf.opts) - length := uint16(hdr.UsedLength() + packetSize) - xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) + xsum := r.PseudoHeaderChecksum(ProtocolNumber, uint16(pkt.Size())) // Only calculate the checksum if offloading isn't supported. if gso != nil && gso.NeedsCsum { // This is called CHECKSUM_PARTIAL in the Linux kernel. 
We @@ -797,17 +796,18 @@ func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso packetSize = size } size -= packetSize - var pkt stack.PacketBuffer - pkt.Header = buffer.NewPrependable(hdrSize) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: hdrSize, + }) pkt.Hash = tf.txHash pkt.Owner = owner pkt.EgressRoute = r pkt.GSOOptions = gso pkt.NetworkProtocolNumber = r.NetworkProtocolNumber() data.ReadToVV(&pkt.Data, packetSize) - buildTCPHdr(r, tf, &pkt, gso) + buildTCPHdr(r, tf, pkt, gso) tf.seq = tf.seq.Add(seqnum.Size(packetSize)) - pkts.PushBack(&pkt) + pkts.PushBack(pkt) } if tf.ttl == 0 { @@ -833,13 +833,13 @@ func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stac return sendTCPBatch(r, tf, data, gso, owner) } - pkt := stack.PacketBuffer{ - Header: buffer.NewPrependable(header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen), - Data: data, - Hash: tf.txHash, - Owner: owner, - } - buildTCPHdr(r, tf, &pkt, gso) + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen, + Data: data, + }) + pkt.Hash = tf.txHash + pkt.Owner = owner + buildTCPHdr(r, tf, pkt, gso) if tf.ttl == 0 { tf.ttl = r.DefaultTTL() @@ -995,24 +995,22 @@ func (e *endpoint) completeWorkerLocked() { // transitionToStateEstablisedLocked transitions a given endpoint // to an established state using the handshake parameters provided. -// It also initializes sender/receiver if required. +// It also initializes sender/receiver. func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) { - if e.snd == nil { - // Transfer handshake state to TCP connection. We disable - // receive window scaling if the peer doesn't support it - // (indicated by a negative send window scale). - e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) - } - if e.rcv == nil { - rcvBufSize := seqnum.Size(e.receiveBufferSize()) - e.rcvListMu.Lock() - e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize) - // Bootstrap the auto tuning algorithm. Starting at zero will - // result in a really large receive window after the first auto - // tuning adjustment. - e.rcvAutoParams.prevCopied = int(h.rcvWnd) - e.rcvListMu.Unlock() - } + // Transfer handshake state to TCP connection. We disable + // receive window scaling if the peer doesn't support it + // (indicated by a negative send window scale). + e.snd = newSender(e, h.iss, h.ackNum-1, h.sndWnd, h.mss, h.sndWndScale) + + rcvBufSize := seqnum.Size(e.receiveBufferSize()) + e.rcvListMu.Lock() + e.rcv = newReceiver(e, h.ackNum-1, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize) + // Bootstrap the auto tuning algorithm. Starting at zero will + // result in a really large receive window after the first auto + // tuning adjustment. + e.rcvAutoParams.prevCopied = int(h.rcvWnd) + e.rcvListMu.Unlock() + e.setEndpointState(StateEstablished) } @@ -1022,14 +1020,19 @@ func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) { // delivered to this endpoint from the demuxer when the endpoint // is transitioned to StateClose. func (e *endpoint) transitionToStateCloseLocked() { - if e.EndpointState() == StateClose { + s := e.EndpointState() + if s == StateClose { return } + + if s.connected() { + e.stack.Stats().TCP.CurrentConnected.Decrement() + e.stack.Stats().TCP.EstablishedClosed.Increment() + } + // Mark the endpoint as fully closed for reads/writes. 
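The recurring change in this file is the move from a hand-built stack.PacketBuffer with a Prependable header to stack.NewPacketBuffer with header space reserved up front. A minimal sketch of the new construction pattern (hypothetical helper name; types from pkg/tcpip):

package tcpsketch

import (
	"gvisor.dev/gvisor/pkg/tcpip/buffer"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// newOutboundPacket reserves room for every layer's headers up front; the
// TCP header is later pushed into that space by buildTCPHdr.
func newOutboundPacket(r *stack.Route, data buffer.VectorisedView, optLen int) *stack.PacketBuffer {
	return stack.NewPacketBuffer(stack.PacketBufferOptions{
		ReserveHeaderBytes: header.TCPMinimumSize + int(r.MaxHeaderLength()) + optLen,
		Data:               data,
	})
}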
e.cleanupLocked() e.setEndpointState(StateClose) - e.stack.Stats().TCP.CurrentConnected.Decrement() - e.stack.Stats().TCP.EstablishedClosed.Increment() } // tryDeliverSegmentFromClosedEndpoint attempts to deliver the parsed @@ -1052,8 +1055,8 @@ func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) { panic("current endpoint not removed from demuxer, enqueing segments to itself") } - if ep.(*endpoint).enqueueSegment(s) { - ep.(*endpoint).newSegmentWaker.Assert() + if ep := ep.(*endpoint); ep.enqueueSegment(s) { + ep.newSegmentWaker.Assert() } } @@ -1122,7 +1125,7 @@ func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) { func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error { checkRequeue := true for i := 0; i < maxSegmentsPerWake; i++ { - if e.EndpointState() == StateClose || e.EndpointState() == StateError { + if e.EndpointState().closed() { return nil } s := e.segmentQueue.dequeue() @@ -1159,13 +1162,18 @@ func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error { return nil } -// handleSegment handles a given segment and notifies the worker goroutine if -// if the connection should be terminated. -func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) { - // Invoke the tcp probe if installed. +func (e *endpoint) probeSegment() { if e.probe != nil { e.probe(e.completeState()) } +} + +// handleSegment handles a given segment and notifies the worker goroutine if +// if the connection should be terminated. +func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) { + // Invoke the tcp probe if installed. The tcp probe function will update + // the TCPEndpointState after the segment is processed. + defer e.probeSegment() if s.flagIsSet(header.TCPFlagRst) { if ok, err := e.handleReset(s); !ok { @@ -1347,6 +1355,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ e.setEndpointState(StateError) e.HardError = err + e.workerCleanup = true // Lock released below. epilogue() return err @@ -1441,9 +1450,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ if e.EndpointState() == StateFinWait2 && e.closed { // The socket has been closed and we are in FIN_WAIT2 // so start the FIN_WAIT2 timer. - closeTimer = time.AfterFunc(e.tcpLingerTimeout, func() { - closeWaker.Assert() - }) + closeTimer = time.AfterFunc(e.tcpLingerTimeout, closeWaker.Assert) e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) } } @@ -1461,7 +1468,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ return err } } - if e.EndpointState() != StateClose && e.EndpointState() != StateError { + if !e.EndpointState().closed() { // Only block the worker if the endpoint // is not in closed state or error state. close(e.drainDone) @@ -1517,6 +1524,7 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ // Main loop. Handle segments until both send and receive ends of the // connection have completed. 
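A small side change above: timers that only assert a sleep.Waker now pass the Assert method value directly to time.AfterFunc instead of an equivalent one-line closure. Sketch, assuming pkg/sleep and the standard time package:

// Same behavior as time.AfterFunc(timeout, func() { resendWaker.Assert() }).
var resendWaker sleep.Waker
rt := time.AfterFunc(time.Second, resendWaker.Assert)
defer rt.Stop()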
cleanupOnError := func(err *tcpip.Error) { + e.stack.Stats().TCP.CurrentConnected.Decrement() e.workerCleanup = true if err != nil { e.resetConnectionLocked(err) @@ -1526,7 +1534,12 @@ func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{ } loop: - for e.EndpointState() != StateTimeWait && e.EndpointState() != StateClose && e.EndpointState() != StateError { + for { + switch e.EndpointState() { + case StateTimeWait, StateClose, StateError: + break loop + } + e.mu.Unlock() v, _ := s.Fetch(true) e.mu.Lock() @@ -1569,11 +1582,14 @@ loop: reuseTW = e.doTimeWait() } - // Mark endpoint as closed. - if e.EndpointState() != StateError { - e.transitionToStateCloseLocked() + // Handle any StateError transition from StateTimeWait. + if e.EndpointState() == StateError { + cleanupOnError(nil) + return nil } + e.transitionToStateCloseLocked() + // Lock released below. epilogue() @@ -1686,7 +1702,7 @@ func (e *endpoint) doTimeWait() (twReuse func()) { } case notification: n := e.fetchNotifications() - if n¬ifyClose != 0 || n¬ifyAbort != 0 { + if n¬ifyAbort != 0 { return nil } if n¬ifyDrain != 0 { diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go index 6062ca916..98aecab9e 100644 --- a/pkg/tcpip/transport/tcp/dispatcher.go +++ b/pkg/tcpip/transport/tcp/dispatcher.go @@ -15,6 +15,8 @@ package tcp import ( + "encoding/binary" + "gvisor.dev/gvisor/pkg/rand" "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" @@ -66,89 +68,68 @@ func (q *epQueue) empty() bool { // processor is responsible for processing packets queued to a tcp endpoint. type processor struct { epQ epQueue + sleeper sleep.Sleeper newEndpointWaker sleep.Waker closeWaker sleep.Waker - id int - wg sync.WaitGroup -} - -func newProcessor(id int) *processor { - p := &processor{ - id: id, - } - p.wg.Add(1) - go p.handleSegments() - return p } func (p *processor) close() { p.closeWaker.Assert() } -func (p *processor) wait() { - p.wg.Wait() -} - func (p *processor) queueEndpoint(ep *endpoint) { // Queue an endpoint for processing by the processor goroutine. p.epQ.enqueue(ep) p.newEndpointWaker.Assert() } -func (p *processor) handleSegments() { - const newEndpointWaker = 1 - const closeWaker = 2 - s := sleep.Sleeper{} - s.AddWaker(&p.newEndpointWaker, newEndpointWaker) - s.AddWaker(&p.closeWaker, closeWaker) - defer s.Done() +const ( + newEndpointWaker = 1 + closeWaker = 2 +) + +func (p *processor) start(wg *sync.WaitGroup) { + defer wg.Done() + defer p.sleeper.Done() + for { - id, ok := s.Fetch(true) - if ok && id == closeWaker { - p.wg.Done() - return + if id, _ := p.sleeper.Fetch(true); id == closeWaker { + break } - for ep := p.epQ.dequeue(); ep != nil; ep = p.epQ.dequeue() { + for { + ep := p.epQ.dequeue() + if ep == nil { + break + } if ep.segmentQueue.empty() { continue } - // If socket has transitioned out of connected state - // then just let the worker handle the packet. + // If socket has transitioned out of connected state then just let the + // worker handle the packet. // - // NOTE: We read this outside of e.mu lock which means - // that by the time we get to handleSegments the - // endpoint may not be in ESTABLISHED. But this should - // be fine as all normal shutdown states are handled by - // handleSegments and if the endpoint moves to a - // CLOSED/ERROR state then handleSegments is a noop. 
- if ep.EndpointState() != StateEstablished { - ep.newSegmentWaker.Assert() - continue - } - - if !ep.mu.TryLock() { - ep.newSegmentWaker.Assert() - continue - } - // If the endpoint is in a connected state then we do - // direct delivery to ensure low latency and avoid - // scheduler interactions. - if err := ep.handleSegments(true /* fastPath */); err != nil || ep.EndpointState() == StateClose { - // Send any active resets if required. - if err != nil { + // NOTE: We read this outside of e.mu lock which means that by the time + // we get to handleSegments the endpoint may not be in ESTABLISHED. But + // this should be fine as all normal shutdown states are handled by + // handleSegments and if the endpoint moves to a CLOSED/ERROR state + // then handleSegments is a noop. + if ep.EndpointState() == StateEstablished && ep.mu.TryLock() { + // If the endpoint is in a connected state then we do direct delivery + // to ensure low latency and avoid scheduler interactions. + switch err := ep.handleSegments(true /* fastPath */); { + case err != nil: + // Send any active resets if required. ep.resetConnectionLocked(err) + fallthrough + case ep.EndpointState() == StateClose: + ep.notifyProtocolGoroutine(notifyTickleWorker) + case !ep.segmentQueue.empty(): + p.epQ.enqueue(ep) } - ep.notifyProtocolGoroutine(notifyTickleWorker) ep.mu.Unlock() - continue - } - - if !ep.segmentQueue.empty() { - p.epQ.enqueue(ep) + } else { + ep.newSegmentWaker.Assert() } - - ep.mu.Unlock() } } } @@ -159,34 +140,39 @@ func (p *processor) handleSegments() { // hash of the endpoint id to ensure that delivery for the same endpoint happens // in-order. type dispatcher struct { - processors []*processor + processors []processor seed uint32 -} - -func newDispatcher(nProcessors int) *dispatcher { - processors := []*processor{} - for i := 0; i < nProcessors; i++ { - processors = append(processors, newProcessor(i)) - } - return &dispatcher{ - processors: processors, - seed: generateRandUint32(), + wg sync.WaitGroup +} + +func (d *dispatcher) init(nProcessors int) { + d.close() + d.wait() + d.processors = make([]processor, nProcessors) + d.seed = generateRandUint32() + for i := range d.processors { + p := &d.processors[i] + p.sleeper.AddWaker(&p.newEndpointWaker, newEndpointWaker) + p.sleeper.AddWaker(&p.closeWaker, closeWaker) + d.wg.Add(1) + // NB: sleeper-waker registration must happen synchronously to avoid races + // with `close`. It's possible to pull all this logic into `start`, but + // that results in a heap-allocated function literal. 
+ go p.start(&d.wg) } } func (d *dispatcher) close() { - for _, p := range d.processors { - p.close() + for i := range d.processors { + d.processors[i].close() } } func (d *dispatcher) wait() { - for _, p := range d.processors { - p.wait() - } + d.wg.Wait() } -func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) { +func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { ep := stackEP.(*endpoint) s := newSegment(r, id, pkt) if !s.parse() { @@ -231,20 +217,18 @@ func generateRandUint32() uint32 { if _, err := rand.Read(b); err != nil { panic(err) } - return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 + return binary.LittleEndian.Uint32(b) } func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor { - payload := []byte{ - byte(id.LocalPort), - byte(id.LocalPort >> 8), - byte(id.RemotePort), - byte(id.RemotePort >> 8)} + var payload [4]byte + binary.LittleEndian.PutUint16(payload[0:], id.LocalPort) + binary.LittleEndian.PutUint16(payload[2:], id.RemotePort) h := jenkins.Sum32(d.seed) - h.Write(payload) + h.Write(payload[:]) h.Write([]byte(id.LocalAddress)) h.Write([]byte(id.RemoteAddress)) - return d.processors[h.Sum32()%uint32(len(d.processors))] + return &d.processors[h.Sum32()%uint32(len(d.processors))] } diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go index b5ba972f1..d08cfe0ff 100644 --- a/pkg/tcpip/transport/tcp/endpoint.go +++ b/pkg/tcpip/transport/tcp/endpoint.go @@ -63,7 +63,8 @@ const ( StateClosing ) -// connected is the set of states where an endpoint is connected to a peer. +// connected returns true when s is one of the states representing an +// endpoint connected to a peer. func (s EndpointState) connected() bool { switch s { case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: @@ -73,6 +74,40 @@ func (s EndpointState) connected() bool { } } +// connecting returns true when s is one of the states representing a +// connection in progress, but not yet fully established. +func (s EndpointState) connecting() bool { + switch s { + case StateConnecting, StateSynSent, StateSynRecv: + return true + default: + return false + } +} + +// handshake returns true when s is one of the states representing an endpoint +// in the middle of a TCP handshake. +func (s EndpointState) handshake() bool { + switch s { + case StateSynSent, StateSynRecv: + return true + default: + return false + } +} + +// closed returns true when s is one of the states an endpoint transitions to +// when closed or when it encounters an error. This is distinct from a newly +// initialized endpoint that was never connected. +func (s EndpointState) closed() bool { + switch s { + case StateClose, StateError: + return true + default: + return false + } +} + // String implements fmt.Stringer.String. func (s EndpointState) String() string { switch s { @@ -361,7 +396,8 @@ type endpoint struct { mu sync.Mutex `state:"nosave"` ownedByUser uint32 - // state must be read/set using the EndpointState()/setEndpointState() methods. + // state must be read/set using the EndpointState()/setEndpointState() + // methods. 
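For reference, the dispatcher's flow affinity comes from hashing the connection 4-tuple with a seeded Jenkins hash; the sketch below mirrors selectProcessor with the new fixed-size, binary.LittleEndian port encoding (hypothetical standalone function and package name):

package dispatchsketch

import (
	"encoding/binary"

	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
)

// pickProcessor maps an endpoint ID onto one of n processors so that all
// segments for a given connection are handled in order by the same goroutine.
func pickProcessor(seed uint32, id stack.TransportEndpointID, n int) int {
	var ports [4]byte
	binary.LittleEndian.PutUint16(ports[0:], id.LocalPort)
	binary.LittleEndian.PutUint16(ports[2:], id.RemotePort)

	h := jenkins.Sum32(seed)
	h.Write(ports[:])
	h.Write([]byte(id.LocalAddress))
	h.Write([]byte(id.RemoteAddress))
	return int(h.Sum32() % uint32(n))
}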
state EndpointState `state:".(EndpointState)"` // origEndpointState is only used during a restore phase to save the @@ -370,8 +406,8 @@ type endpoint struct { origEndpointState EndpointState `state:"nosave"` isPortReserved bool `state:"manual"` - isRegistered bool - boundNICID tcpip.NICID `state:"manual"` + isRegistered bool `state:"manual"` + boundNICID tcpip.NICID route stack.Route `state:"manual"` ttl uint8 v6only bool @@ -380,10 +416,14 @@ type endpoint struct { // disabling SO_BROADCAST, albeit as a NOOP. broadcast bool + // portFlags stores the current values of port related flags. + portFlags ports.Flags + // Values used to reserve a port or register a transport endpoint // (which ever happens first). boundBindToDevice tcpip.NICID boundPortFlags ports.Flags + boundDest tcpip.FullAddress // effectiveNetProtos contains the network protocols actually in use. In // most cases it will only contain "netProto", but in cases like IPv6 @@ -391,7 +431,7 @@ type endpoint struct { // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped // address). - effectiveNetProtos []tcpip.NetworkProtocolNumber `state:"manual"` + effectiveNetProtos []tcpip.NetworkProtocolNumber // workerRunning specifies if a worker goroutine is running. workerRunning bool @@ -409,10 +449,11 @@ type endpoint struct { // recentTS is the timestamp that should be sent in the TSEcr field of // the timestamp for future segments sent by the endpoint. This field is // updated if required when a new segment is received by this endpoint. - // - // recentTS must be read/written atomically. recentTS uint32 + // recentTSTime is the unix time when we updated recentTS last. + recentTSTime time.Time `state:".(unixTime)"` + // tsOffset is a randomized offset added to the value of the // TSVal field in the timestamp option. tsOffset uint32 @@ -427,9 +468,6 @@ type endpoint struct { // sack holds TCP SACK related information for this endpoint. sack SACKInfo - // reusePort is set to true if SO_REUSEPORT is enabled. - reusePort bool - // bindToDevice is set to the NIC on which to bind or disabled if 0. bindToDevice tcpip.NICID @@ -449,7 +487,6 @@ type endpoint struct { // The options below aren't implemented, but we remember the user // settings because applications expect to be able to set/query these // options. - reuseAddr bool // slowAck holds the negated state of quick ack. It is stubbed out and // does nothing. @@ -759,15 +796,15 @@ func (e *endpoint) EndpointState() EndpointState { return EndpointState(atomic.LoadUint32((*uint32)(&e.state))) } -// setRecentTimestamp atomically sets the recentTS field to the -// provided value. +// setRecentTimestamp sets the recentTS field to the provided value. func (e *endpoint) setRecentTimestamp(recentTS uint32) { - atomic.StoreUint32(&e.recentTS, recentTS) + e.recentTS = recentTS + e.recentTSTime = time.Now() } -// recentTimestamp atomically reads and returns the value of the recentTS field. +// recentTimestamp returns the value of the recentTS field. func (e *endpoint) recentTimestamp() uint32 { - return atomic.LoadUint32(&e.recentTS) + return e.recentTS } // keepalive is a synchronization wrapper used to appease stateify. See the @@ -799,7 +836,6 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue rcvBufSize: DefaultReceiveBufferSize, sndBufSize: DefaultSendBufferSize, sndMTU: int(math.MaxInt32), - reuseAddr: true, keepalive: keepalive{ // Linux defaults. 
idle: 2 * time.Hour, @@ -867,7 +903,7 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv: // Ready for nothing. - case StateClose, StateError: + case StateClose, StateError, StateTimeWait: // Ready for anything. result = mask @@ -986,14 +1022,15 @@ func (e *endpoint) closeNoShutdownLocked() { // in Listen() when trying to register. if e.EndpointState() == StateListen && e.isPortReserved { if e.isRegistered { - e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice) + e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, e.boundDest) e.isPortReserved = false e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} + e.boundDest = tcpip.FullAddress{} } // Mark endpoint as closed. @@ -1051,16 +1088,17 @@ func (e *endpoint) cleanupLocked() { e.workerCleanup = false if e.isRegistered { - e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice) + e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) e.isRegistered = false } if e.isPortReserved { - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, e.boundDest) e.isPortReserved = false } e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} + e.boundDest = tcpip.FullAddress{} e.route.Release() e.stack.CompleteTransportEndpointCleanup(e) @@ -1172,14 +1210,27 @@ func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner } -// IPTables implements tcpip.Endpoint.IPTables. -func (e *endpoint) IPTables() (stack.IPTables, error) { - return e.stack.IPTables(), nil +func (e *endpoint) takeLastError() *tcpip.Error { + e.lastErrorMu.Lock() + defer e.lastErrorMu.Unlock() + err := e.lastError + e.lastError = nil + return err } // Read reads data from the endpoint. func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { e.LockUser() + defer e.UnlockUser() + + // When in SYN-SENT state, let the caller block on the receive. + // An application can initiate a non-blocking connect and then block + // on a receive. It can expect to read any data after the handshake + // is complete. RFC793, section 3.9, p58. + if e.EndpointState() == StateSynSent { + return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrWouldBlock + } + // The endpoint can be read if it's connected, or if it's already closed // but has some pending unread data. 
Also note that a RST being received // would cause the state to become StateError so we should allow the @@ -1189,7 +1240,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { e.rcvListMu.Unlock() he := e.HardError - e.UnlockUser() if s == StateError { return buffer.View{}, tcpip.ControlMessages{}, he } @@ -1199,7 +1249,6 @@ func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, v, err := e.readLocked() e.rcvListMu.Unlock() - e.UnlockUser() if err == tcpip.ErrClosedForReceive { e.stats.ReadErrors.ReadClosed.Increment() @@ -1486,12 +1535,12 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { case tcpip.ReuseAddressOption: e.LockUser() - e.reuseAddr = v + e.portFlags.TupleOnly = v e.UnlockUser() case tcpip.ReusePortOption: e.LockUser() - e.reusePort = v + e.portFlags.LoadBalanced = v e.UnlockUser() case tcpip.V6OnlyOption: @@ -1549,6 +1598,13 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { e.UnlockUser() e.notifyProtocolGoroutine(notifyMSSChanged) + case tcpip.MTUDiscoverOption: + // Return not supported if attempting to set this option to + // anything other than path MTU discovery disabled. + if v != tcpip.PMTUDiscoveryDont { + return tcpip.ErrNotSupported + } + case tcpip.ReceiveBufferSizeOption: // Make sure the receive buffer size is within the min and max // allowed. @@ -1722,15 +1778,8 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { // Same as effectively disabling TCPLinger timeout. v = 0 } - var stkTCPLingerTimeout tcpip.TCPLingerTimeoutOption - if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &stkTCPLingerTimeout); err != nil { - // We were unable to retrieve a stack config, just use - // the DefaultTCPLingerTimeout. - if v > tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) { - stkTCPLingerTimeout = tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout) - } - } - // Cap it to the stack wide TCPLinger timeout. + // Cap it to MaxTCPLingerTimeout. + stkTCPLingerTimeout := tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) if v > stkTCPLingerTimeout { v = stkTCPLingerTimeout } @@ -1745,6 +1794,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.deferAccept = time.Duration(v) e.UnlockUser() + case tcpip.SocketDetachFilterOption: + return nil + default: return nil } @@ -1795,14 +1847,14 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { case tcpip.ReuseAddressOption: e.LockUser() - v := e.reuseAddr + v := e.portFlags.TupleOnly e.UnlockUser() return v, nil case tcpip.ReusePortOption: e.LockUser() - v := e.reusePort + v := e.portFlags.LoadBalanced e.UnlockUser() return v, nil @@ -1819,6 +1871,9 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { return v, nil + case tcpip.MulticastLoopOption: + return true, nil + default: return false, tcpip.ErrUnknownProtocolOption } @@ -1853,6 +1908,11 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { v := header.TCPDefaultMSS return v, nil + case tcpip.MTUDiscoverOption: + // Always return the path MTU discovery disabled setting since + // it's the only one supported. 
+ return tcpip.PMTUDiscoveryDont, nil + case tcpip.ReceiveQueueSizeOption: return e.readyReceiveSize() @@ -1886,6 +1946,9 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.UnlockUser() return v, nil + case tcpip.MulticastTTLOption: + return 1, nil + default: return -1, tcpip.ErrUnknownProtocolOption } @@ -1895,11 +1958,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { switch o := opt.(type) { case tcpip.ErrorOption: - e.lastErrorMu.Lock() - err := e.lastError - e.lastError = nil - e.lastErrorMu.Unlock() - return err + return e.takeLastError() case *tcpip.BindToDeviceOption: e.LockUser() @@ -1952,6 +2011,17 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { *o = tcpip.TCPDeferAcceptOption(e.deferAccept) e.UnlockUser() + case *tcpip.OriginalDestinationOption: + ipt := e.stack.IPTables() + addr, port, err := ipt.OriginalDst(e.ID) + if err != nil { + return err + } + *o = tcpip.OriginalDestinationOption{ + Addr: addr, + Port: port, + } + default: return tcpip.ErrUnknownProtocolOption } @@ -2049,8 +2119,6 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc } defer r.Release() - origID := e.ID - netProtos := []tcpip.NetworkProtocolNumber{netProto} e.ID.LocalAddress = r.LocalAddress e.ID.RemoteAddress = r.RemoteAddress @@ -2058,7 +2126,7 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc if e.ID.LocalPort != 0 { // The endpoint is bound to a port, attempt to register it. - err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.boundBindToDevice) + err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) if err != nil { return err } @@ -2081,43 +2149,91 @@ func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tc h.Write(portBuf) portOffset := h.Sum32() + var twReuse tcpip.TCPTimeWaitReuseOption + if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil { + panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err)) + } + + reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal + if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly { + switch netProto { + case header.IPv4ProtocolNumber: + reuse = header.IsV4LoopbackAddress(e.ID.LocalAddress) && header.IsV4LoopbackAddress(e.ID.RemoteAddress) + case header.IPv6ProtocolNumber: + reuse = e.ID.LocalAddress == header.IPv6Loopback && e.ID.RemoteAddress == header.IPv6Loopback + } + } + if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) { if sameAddr && p == e.ID.RemotePort { return false, nil } - // reusePort is false below because connect cannot reuse a port even if - // reusePort was set. - if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, ports.Flags{LoadBalanced: false}, e.bindToDevice) { - return false, nil + if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil { + if err != tcpip.ErrPortInUse || !reuse { + return false, nil + } + transEPID := e.ID + transEPID.LocalPort = p + // Check if an endpoint is registered with demuxer in TIME-WAIT and if + // we can reuse it. 
If we can't find a transport endpoint then we just + // skip using this port as it's possible that either an endpoint has + // bound the port but not registered with demuxer yet (no listen/connect + // done yet) or the reservation was freed between the check above and + // the FindTransportEndpoint below. But rather than retry the same port + // we just skip it and move on. + transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, &r) + if transEP == nil { + // ReservePort failed but there is no registered endpoint with + // demuxer. Which indicates there is at least some endpoint that has + // bound the port. + return false, nil + } + + tcpEP := transEP.(*endpoint) + tcpEP.LockUser() + // If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but + // less than 1 second has elapsed since its recentTS was updated then + // we cannot reuse the port. + if tcpEP.EndpointState() != StateTimeWait || time.Since(tcpEP.recentTSTime) < 1*time.Second { + tcpEP.UnlockUser() + return false, nil + } + // Since the endpoint is in TIME-WAIT it should be safe to acquire its + // Lock while holding the lock for this endpoint as endpoints in + // TIME-WAIT do not acquire locks on other endpoints. + tcpEP.workerCleanup = false + tcpEP.cleanupLocked() + tcpEP.notifyProtocolGoroutine(notifyAbort) + tcpEP.UnlockUser() + // Now try and Reserve again if it fails then we skip. + if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil { + return false, nil + } } id := e.ID id.LocalPort = p - switch e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) { - case nil: - // Port picking successful. Save the details of - // the selected port. - e.ID = id - e.boundBindToDevice = e.bindToDevice - return true, nil - case tcpip.ErrPortInUse: - return false, nil - default: + if err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.portFlags, e.bindToDevice); err != nil { + e.stack.ReleasePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr) + if err == tcpip.ErrPortInUse { + return false, nil + } return false, err } + + // Port picking successful. Save the details of + // the selected port. + e.ID = id + e.isPortReserved = true + e.boundBindToDevice = e.bindToDevice + e.boundPortFlags = e.portFlags + e.boundDest = addr + return true, nil }); err != nil { return err } } - // Remove the port reservation. This can happen when Bind is called - // before Connect: in such a case we don't want to hold on to - // reservations anymore. - if e.isPortReserved { - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.boundPortFlags, e.boundBindToDevice) - e.isPortReserved = false - } - e.isRegistered = true e.setEndpointState(StateConnecting) e.route = r.Clone() @@ -2296,7 +2412,7 @@ func (e *endpoint) listen(backlog int) *tcpip.Error { } // Register the endpoint. 
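Summary of the reuse rule applied above: a conflicting reservation is stolen only when the owning endpoint is in TIME-WAIT and its timestamp state has been idle for at least one second. As a standalone predicate (a sketch inside package tcp; lastTSUpdate corresponds to recentTSTime):

// canStealTimeWait condenses the check above.
func canStealTimeWait(state EndpointState, lastTSUpdate time.Time) bool {
	return state == StateTimeWait && time.Since(lastTSUpdate) >= time.Second
}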
- if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.boundBindToDevice); err != nil { + if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil { return err } @@ -2388,16 +2504,13 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { } } - flags := ports.Flags{ - LoadBalanced: e.reusePort, - } - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, flags, e.bindToDevice) + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{}) if err != nil { return err } e.boundBindToDevice = e.bindToDevice - e.boundPortFlags = flags + e.boundPortFlags = e.portFlags e.isPortReserved = true e.effectiveNetProtos = netProtos e.ID.LocalPort = port @@ -2405,7 +2518,7 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { // Any failures beyond this point must remove the port registration. defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) { if err != nil { - e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice) + e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice, tcpip.FullAddress{}) e.isPortReserved = false e.effectiveNetProtos = nil e.ID.LocalPort = 0 @@ -2428,6 +2541,10 @@ func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) { e.ID.LocalAddress = addr.Addr } + if err := e.stack.CheckRegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e.boundPortFlags, e.boundBindToDevice); err != nil { + return err + } + // Mark endpoint as bound. e.setEndpointState(StateBound) @@ -2462,7 +2579,7 @@ func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { }, nil } -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { // TCP HandlePacket is not required anymore as inbound packets first // land at the Dispatcher which then can either delivery using the // worker go routine or directly do the invoke the tcp processing inline @@ -2481,7 +2598,7 @@ func (e *endpoint) enqueueSegment(s *segment) bool { } // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. 
-func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) { +func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { switch typ { case stack.ControlPacketTooBig: e.sndBufMu.Lock() @@ -2492,6 +2609,18 @@ func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.C e.sndBufMu.Unlock() e.notifyProtocolGoroutine(notifyMTUChanged) + + case stack.ControlNoRoute: + e.lastErrorMu.Lock() + e.lastError = tcpip.ErrNoRoute + e.lastErrorMu.Unlock() + e.notifyProtocolGoroutine(notifyError) + + case stack.ControlNetworkUnreachable: + e.lastErrorMu.Lock() + e.lastError = tcpip.ErrNetworkUnreachable + e.lastErrorMu.Unlock() + e.notifyProtocolGoroutine(notifyError) } } @@ -2611,15 +2740,14 @@ func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) { // timestamp returns the timestamp value to be used in the TSVal field of the // timestamp option for outgoing TCP segments for a given endpoint. func (e *endpoint) timestamp() uint32 { - return tcpTimeStamp(e.tsOffset) + return tcpTimeStamp(time.Now(), e.tsOffset) } // tcpTimeStamp returns a timestamp offset by the provided offset. This is // not inlined above as it's used when SYN cookies are in use and endpoint // is not created at the time when the SYN cookie is sent. -func tcpTimeStamp(offset uint32) uint32 { - now := time.Now() - return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset +func tcpTimeStamp(curTime time.Time, offset uint32) uint32 { + return uint32(curTime.Unix()*1000+int64(curTime.Nanosecond()/1e6)) + offset } // timeStampOffset returns a randomized timestamp offset to be used when sending @@ -2762,6 +2890,14 @@ func (e *endpoint) completeState() stack.TCPEndpointState { WEst: cubic.wEst, } } + + rc := e.snd.rc + s.Sender.RACKState = stack.TCPRACKState{ + XmitTime: rc.xmitTime, + EndSequence: rc.endSequence, + FACK: rc.fack, + RTT: rc.rtt, + } return s } diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go index fc43c11e2..723e47ddc 100644 --- a/pkg/tcpip/transport/tcp/endpoint_state.go +++ b/pkg/tcpip/transport/tcp/endpoint_state.go @@ -49,11 +49,10 @@ func (e *endpoint) beforeSave() { e.mu.Lock() defer e.mu.Unlock() - switch e.EndpointState() { - case StateInitial, StateBound: - // TODO(b/138137272): this enumeration duplicates - // EndpointState.connected. remove it. - case StateEstablished, StateSynSent, StateSynRecv, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: + epState := e.EndpointState() + switch { + case epState == StateInitial || epState == StateBound: + case epState.connected() || epState.handshake(): if e.route.Capabilities()&stack.CapabilitySaveRestore == 0 { if e.route.Capabilities()&stack.CapabilityDisconnectOk == 0 { panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%d, remote %v:%d", e.ID.LocalAddress, e.ID.LocalPort, e.ID.RemoteAddress, e.ID.RemotePort)}) @@ -69,15 +68,16 @@ func (e *endpoint) beforeSave() { break } fallthrough - case StateListen, StateConnecting: + case epState == StateListen || epState == StateConnecting: e.drainSegmentLocked() - if e.EndpointState() != StateClose && e.EndpointState() != StateError { + // Refresh epState, since drainSegmentLocked may have changed it. 
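The timestamp refactor above only threads the current time through as a parameter; the TSVal value itself is unchanged: milliseconds since the Unix epoch plus the per-endpoint random offset. Condensed sketch:

// tsVal reproduces the computation in tcpTimeStamp above.
func tsVal(now time.Time, offset uint32) uint32 {
	return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset
}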
+ epState = e.EndpointState() + if !epState.closed() { if !e.workerRunning { panic("endpoint has no worker running in listen, connecting, or connected state") } - break } - case StateError, StateClose: + case epState.closed(): for e.workerRunning { e.mu.Unlock() time.Sleep(100 * time.Millisecond) @@ -93,10 +93,6 @@ func (e *endpoint) beforeSave() { if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() { panic("endpoint still has waiters upon save") } - - if e.EndpointState() != StateClose && !((e.EndpointState() == StateBound || e.EndpointState() == StateListen) == e.isPortReserved) { - panic("endpoints which are not in the closed state must have a reserved port IFF they are in bound or listen state") - } } // saveAcceptedChan is invoked by stateify. @@ -148,23 +144,23 @@ var connectingLoading sync.WaitGroup // Bound endpoint loading happens last. // loadState is invoked by stateify. -func (e *endpoint) loadState(state EndpointState) { +func (e *endpoint) loadState(epState EndpointState) { // This is to ensure that the loading wait groups include all applicable // endpoints before any asynchronous calls to the Wait() methods. // For restore purposes we treat TimeWait like a connected endpoint. - if state.connected() || state == StateTimeWait { + if epState.connected() || epState == StateTimeWait { connectedLoading.Add(1) } - switch state { - case StateListen: + switch { + case epState == StateListen: listenLoading.Add(1) - case StateConnecting, StateSynSent, StateSynRecv: + case epState.connecting(): connectingLoading.Add(1) } // Directly update the state here rather than using e.setEndpointState // as the endpoint is still being loaded and the stack reference is not // yet initialized. - atomic.StoreUint32((*uint32)(&e.state), uint32(state)) + atomic.StoreUint32((*uint32)(&e.state), uint32(epState)) } // afterLoad is invoked by stateify. 
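The save/restore code above now leans on the EndpointState predicates instead of enumerating each state by hand. An illustrative sketch of the classification it performs (package tcp assumed):

// classify shows how the predicates partition states for save/restore.
func classify(s EndpointState) string {
	switch {
	case s == StateInitial || s == StateBound:
		return "not yet connected"
	case s.connected() || s.handshake():
		return "connected or mid-handshake"
	case s == StateListen || s == StateConnecting:
		return "listening or connecting"
	case s.closed():
		return "closed or errored"
	default:
		return "other"
	}
}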
@@ -183,33 +179,40 @@ func (e *endpoint) afterLoad() { func (e *endpoint) Resume(s *stack.Stack) { e.stack = s e.segmentQueue.setLimit(MaxUnprocessedSegments) - state := e.origEndpointState - switch state { + epState := e.origEndpointState + switch epState { case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished: var ss SendBufferSizeOption if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil { if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max { panic(fmt.Sprintf("endpoint.sndBufSize %d is outside the min and max allowed [%d, %d]", e.sndBufSize, ss.Min, ss.Max)) } - if e.rcvBufSize < ss.Min || e.rcvBufSize > ss.Max { - panic(fmt.Sprintf("endpoint.rcvBufSize %d is outside the min and max allowed [%d, %d]", e.rcvBufSize, ss.Min, ss.Max)) + } + + var rs ReceiveBufferSizeOption + if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { + if e.rcvBufSize < rs.Min || e.rcvBufSize > rs.Max { + panic(fmt.Sprintf("endpoint.rcvBufSize %d is outside the min and max allowed [%d, %d]", e.rcvBufSize, rs.Min, rs.Max)) } } } bind := func() { - if len(e.BindAddr) == 0 { - e.BindAddr = e.ID.LocalAddress + addr, _, err := e.checkV4MappedLocked(tcpip.FullAddress{Addr: e.BindAddr, Port: e.ID.LocalPort}) + if err != nil { + panic("unable to parse BindAddr: " + err.String()) } - addr := e.BindAddr - port := e.ID.LocalPort - if err := e.Bind(tcpip.FullAddress{Addr: addr, Port: port}); err != nil { - panic(fmt.Sprintf("endpoint binding [%v]:%d failed: %v", addr, port, err)) + if ok := e.stack.ReserveTuple(e.effectiveNetProtos, ProtocolNumber, addr.Addr, addr.Port, e.boundPortFlags, e.boundBindToDevice, e.boundDest); !ok { + panic(fmt.Sprintf("unable to re-reserve tuple (%v, %q, %d, %+v, %d, %v)", e.effectiveNetProtos, addr.Addr, addr.Port, e.boundPortFlags, e.boundBindToDevice, e.boundDest)) } + e.isPortReserved = true + + // Mark endpoint as bound. + e.setEndpointState(StateBound) } - switch state { - case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: + switch { + case epState.connected(): bind() if len(e.connectingAddress) == 0 { e.connectingAddress = e.ID.RemoteAddress @@ -232,13 +235,13 @@ func (e *endpoint) Resume(s *stack.Stack) { closed := e.closed e.mu.Unlock() e.notifyProtocolGoroutine(notifyTickleWorker) - if state == StateFinWait2 && closed { + if epState == StateFinWait2 && closed { // If the endpoint has been closed then make sure we notify so // that the FIN_WAIT2 timer is started after a restore. 
e.notifyProtocolGoroutine(notifyClose) } connectedLoading.Done() - case StateListen: + case epState == StateListen: tcpip.AsyncLoading.Add(1) go func() { connectedLoading.Wait() @@ -255,7 +258,7 @@ func (e *endpoint) Resume(s *stack.Stack) { listenLoading.Done() tcpip.AsyncLoading.Done() }() - case StateConnecting, StateSynSent, StateSynRecv: + case epState.connecting(): tcpip.AsyncLoading.Add(1) go func() { connectedLoading.Wait() @@ -267,7 +270,7 @@ func (e *endpoint) Resume(s *stack.Stack) { connectingLoading.Done() tcpip.AsyncLoading.Done() }() - case StateBound: + case epState == StateBound: tcpip.AsyncLoading.Add(1) go func() { connectedLoading.Wait() @@ -276,27 +279,16 @@ func (e *endpoint) Resume(s *stack.Stack) { bind() tcpip.AsyncLoading.Done() }() - case StateClose: - if e.isPortReserved { - tcpip.AsyncLoading.Add(1) - go func() { - connectedLoading.Wait() - listenLoading.Wait() - connectingLoading.Wait() - bind() - e.setEndpointState(StateClose) - tcpip.AsyncLoading.Done() - }() - } + case epState == StateClose: + e.isPortReserved = false e.state = StateClose e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) - case StateError: + case epState == StateError: e.state = StateError e.stack.CompleteTransportEndpointCleanup(e) tcpip.DeleteDanglingEndpoint(e) } - } // saveLastError is invoked by stateify. @@ -317,6 +309,16 @@ func (e *endpoint) loadLastError(s string) { e.lastError = tcpip.StringToError(s) } +// saveRecentTSTime is invoked by stateify. +func (e *endpoint) saveRecentTSTime() unixTime { + return unixTime{e.recentTSTime.Unix(), e.recentTSTime.UnixNano()} +} + +// loadRecentTSTime is invoked by stateify. +func (e *endpoint) loadRecentTSTime(unix unixTime) { + e.recentTSTime = time.Unix(unix.second, unix.nano) +} + // saveHardError is invoked by stateify. func (e *EndpointInfo) saveHardError() string { if e.HardError == nil { diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go index 704d01c64..070b634b4 100644 --- a/pkg/tcpip/transport/tcp/forwarder.go +++ b/pkg/tcpip/transport/tcp/forwarder.go @@ -61,7 +61,7 @@ func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*Forward // // This function is expected to be passed as an argument to the // stack.SetTransportProtocolHandler function. -func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool { +func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { s := newSegment(r, id, pkt) defer s.decRef() diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go index 2a2a7ddeb..c5afa2680 100644 --- a/pkg/tcpip/transport/tcp/protocol.go +++ b/pkg/tcpip/transport/tcp/protocol.go @@ -61,6 +61,10 @@ const ( // FIN_WAIT_2 state before being marked closed. DefaultTCPLingerTimeout = 60 * time.Second + // MaxTCPLingerTimeout is the maximum amount of time that sockets + // linger in FIN_WAIT_2 state before being marked closed. + MaxTCPLingerTimeout = 120 * time.Second + // DefaultTCPTimeWaitTimeout is the amount of time that sockets linger // in TIME_WAIT state before being marked closed. DefaultTCPTimeWaitTimeout = 60 * time.Second @@ -70,34 +74,55 @@ const ( DefaultSynRetries = 6 ) -// SACKEnabled option can be used to enable SACK support in the TCP -// protocol. See: https://tools.ietf.org/html/rfc2018. 
+const ( + ccReno = "reno" + ccCubic = "cubic" +) + +// SACKEnabled is used by stack.(*Stack).TransportProtocolOption to +// enable/disable SACK support in TCP. See: https://tools.ietf.org/html/rfc2018. type SACKEnabled bool -// DelayEnabled option can be used to enable Nagle's algorithm in the TCP protocol. +// Recovery is used by stack.(*Stack).TransportProtocolOption to +// set loss detection algorithm in TCP. +type Recovery int32 + +const ( + // RACKLossDetection indicates RACK is used for loss detection and + // recovery. + RACKLossDetection Recovery = 1 << iota + + // RACKStaticReoWnd indicates the reordering window should not be + // adjusted when DSACK is received. + RACKStaticReoWnd + + // RACKNoDupTh indicates RACK should not consider the classic three + // duplicate acknowledgements rule to mark the segments as lost. This + // is used when reordering is not detected. + RACKNoDupTh +) + +// DelayEnabled is used by stack.(Stack*).TransportProtocolOption to +// enable/disable Nagle's algorithm in TCP. type DelayEnabled bool -// SendBufferSizeOption allows the default, min and max send buffer sizes for -// TCP endpoints to be queried or configured. +// SendBufferSizeOption is used by stack.(Stack*).TransportProtocolOption +// to get/set the default, min and max TCP send buffer sizes. type SendBufferSizeOption struct { Min int Default int Max int } -// ReceiveBufferSizeOption allows the default, min and max receive buffer size -// for TCP endpoints to be queried or configured. +// ReceiveBufferSizeOption is used by +// stack.(Stack*).TransportProtocolOption to get/set the default, min and max +// TCP receive buffer sizes. type ReceiveBufferSizeOption struct { Min int Default int Max int } -const ( - ccReno = "reno" - ccCubic = "cubic" -) - // syncRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The // value is protected by a mutex so that we can increment only when it's // guaranteed not to go above a threshold. @@ -158,20 +183,22 @@ func (s *synRcvdCounter) Threshold() uint64 { type protocol struct { mu sync.RWMutex sackEnabled bool + recovery Recovery delayEnabled bool sendBufferSize SendBufferSizeOption recvBufferSize ReceiveBufferSizeOption congestionControl string availableCongestionControl []string moderateReceiveBuffer bool - tcpLingerTimeout time.Duration - tcpTimeWaitTimeout time.Duration + lingerTimeout time.Duration + timeWaitTimeout time.Duration + timeWaitReuse tcpip.TCPTimeWaitReuseOption minRTO time.Duration maxRTO time.Duration maxRetries uint32 synRcvdCount synRcvdCounter synRetries uint8 - dispatcher *dispatcher + dispatcher dispatcher } // Number returns the tcp protocol number. @@ -206,7 +233,7 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { // to a specific processing queue. Each queue is serviced by its own processor // goroutine which is responsible for dequeuing and doing full TCP dispatch of // the packet. -func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt stack.PacketBuffer) { +func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { p.dispatcher.queuePacket(r, ep, id, pkt) } @@ -217,7 +244,7 @@ func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id st // a reset is sent in response to any incoming segment except another reset. In // particular, SYNs addressed to a non-existent connection are rejected by this // means." 
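Recovery is a bit-flag option (each value is 1 << iota), so several RACK behaviours can be OR'ed into a single setting, and like SACKEnabled it is applied stack-wide through SetTransportProtocolOption. A sketch under the assumption that the stack is built with this patched TCP protocol:

package main

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/stack"
	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
)

// enableRACK turns on SACK (which RACK requires) and selects RACK loss
// detection with a static reordering window.
func enableRACK(s *stack.Stack) *tcpip.Error {
	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
		return err
	}
	return s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.RACKLossDetection|tcp.RACKStaticReoWnd)
}

func main() {
	s := stack.New(stack.Options{
		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
	})
	if err := enableRACK(s); err != nil {
		panic(err)
	}
}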
-func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool { +func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { s := newSegment(r, id, pkt) defer s.decRef() @@ -277,6 +304,12 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { p.mu.Unlock() return nil + case Recovery: + p.mu.Lock() + p.recovery = Recovery(v) + p.mu.Unlock() + return nil + case DelayEnabled: p.mu.Lock() p.delayEnabled = bool(v) @@ -325,7 +358,7 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { v = 0 } p.mu.Lock() - p.tcpLingerTimeout = time.Duration(v) + p.lingerTimeout = time.Duration(v) p.mu.Unlock() return nil @@ -334,7 +367,16 @@ func (p *protocol) SetOption(option interface{}) *tcpip.Error { v = 0 } p.mu.Lock() - p.tcpTimeWaitTimeout = time.Duration(v) + p.timeWaitTimeout = time.Duration(v) + p.mu.Unlock() + return nil + + case tcpip.TCPTimeWaitReuseOption: + if v < tcpip.TCPTimeWaitReuseDisabled || v > tcpip.TCPTimeWaitReuseLoopbackOnly { + return tcpip.ErrInvalidOptionValue + } + p.mu.Lock() + p.timeWaitReuse = v p.mu.Unlock() return nil @@ -391,6 +433,12 @@ func (p *protocol) Option(option interface{}) *tcpip.Error { p.mu.RUnlock() return nil + case *Recovery: + p.mu.RLock() + *v = Recovery(p.recovery) + p.mu.RUnlock() + return nil + case *DelayEnabled: p.mu.RLock() *v = DelayEnabled(p.delayEnabled) @@ -429,13 +477,19 @@ func (p *protocol) Option(option interface{}) *tcpip.Error { case *tcpip.TCPLingerTimeoutOption: p.mu.RLock() - *v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout) + *v = tcpip.TCPLingerTimeoutOption(p.lingerTimeout) p.mu.RUnlock() return nil case *tcpip.TCPTimeWaitTimeoutOption: p.mu.RLock() - *v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout) + *v = tcpip.TCPTimeWaitTimeoutOption(p.timeWaitTimeout) + p.mu.RUnlock() + return nil + + case *tcpip.TCPTimeWaitReuseOption: + p.mu.RLock() + *v = tcpip.TCPTimeWaitReuseOption(p.timeWaitReuse) p.mu.RUnlock() return nil @@ -490,20 +544,51 @@ func (p *protocol) SynRcvdCounter() *synRcvdCounter { return &p.synRcvdCount } +// Parse implements stack.TransportProtocol.Parse. +func (*protocol) Parse(pkt *stack.PacketBuffer) bool { + // TCP header is variable length, peek at it first. + hdrLen := header.TCPMinimumSize + hdr, ok := pkt.Data.PullUp(hdrLen) + if !ok { + return false + } + + // If the header has options, pull those up as well. + if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() { + // TODO(gvisor.dev/issue/2404): Figure out whether to reject this kind of + // packets. + hdrLen = offset + } + + _, ok = pkt.TransportHeader().Consume(hdrLen) + return ok +} + // NewProtocol returns a TCP transport protocol. 
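Parse above only needs to locate the end of the TCP header: it peeks at the fixed 20-byte header, reads the data-offset field to learn how many option bytes follow, and then consumes that many bytes into the packet's transport header. A standalone sketch of the same offset calculation on a raw byte slice (the helper name is illustrative, not part of the patch):

package main

import (
	"encoding/binary"
	"fmt"
)

// tcpHeaderLen returns the full TCP header length (including options) for a
// buffer that starts at the TCP header, or false if the buffer is too short
// or the data offset is malformed.
func tcpHeaderLen(b []byte) (int, bool) {
	const tcpMinimumSize = 20
	if len(b) < tcpMinimumSize {
		return 0, false
	}
	// The data offset lives in the high 4 bits of byte 12, counted in
	// 32-bit words.
	off := int(b[12]>>4) * 4
	if off < tcpMinimumSize || off > len(b) {
		return 0, false
	}
	return off, true
}

func main() {
	// A 20-byte header with data offset = 5 (no options), ports 1234 -> 80.
	hdr := make([]byte, 20)
	binary.BigEndian.PutUint16(hdr[0:], 1234)
	binary.BigEndian.PutUint16(hdr[2:], 80)
	hdr[12] = 5 << 4
	fmt.Println(tcpHeaderLen(hdr)) // 20 true
}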
func NewProtocol() stack.TransportProtocol { - return &protocol{ - sendBufferSize: SendBufferSizeOption{MinBufferSize, DefaultSendBufferSize, MaxBufferSize}, - recvBufferSize: ReceiveBufferSizeOption{MinBufferSize, DefaultReceiveBufferSize, MaxBufferSize}, + p := protocol{ + sendBufferSize: SendBufferSizeOption{ + Min: MinBufferSize, + Default: DefaultSendBufferSize, + Max: MaxBufferSize, + }, + recvBufferSize: ReceiveBufferSizeOption{ + Min: MinBufferSize, + Default: DefaultReceiveBufferSize, + Max: MaxBufferSize, + }, congestionControl: ccReno, availableCongestionControl: []string{ccReno, ccCubic}, - tcpLingerTimeout: DefaultTCPLingerTimeout, - tcpTimeWaitTimeout: DefaultTCPTimeWaitTimeout, + lingerTimeout: DefaultTCPLingerTimeout, + timeWaitTimeout: DefaultTCPTimeWaitTimeout, + timeWaitReuse: tcpip.TCPTimeWaitReuseLoopbackOnly, synRcvdCount: synRcvdCounter{threshold: SynRcvdCountThreshold}, - dispatcher: newDispatcher(runtime.GOMAXPROCS(0)), synRetries: DefaultSynRetries, minRTO: MinRTO, maxRTO: MaxRTO, maxRetries: MaxRetries, + recovery: RACKLossDetection, } + p.dispatcher.init(runtime.GOMAXPROCS(0)) + return &p } diff --git a/pkg/tcpip/transport/tcp/rack.go b/pkg/tcpip/transport/tcp/rack.go new file mode 100644 index 000000000..d969ca23a --- /dev/null +++ b/pkg/tcpip/transport/tcp/rack.go @@ -0,0 +1,82 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "time" + + "gvisor.dev/gvisor/pkg/tcpip/seqnum" +) + +// RACK is a loss detection algorithm used in TCP to detect packet loss and +// reordering using transmission timestamp of the packets instead of packet or +// sequence counts. To use RACK, SACK should be enabled on the connection. + +// rackControl stores the rack related fields. +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-6.1 +// +// +stateify savable +type rackControl struct { + // xmitTime is the latest transmission timestamp of rackControl.seg. + xmitTime time.Time `state:".(unixTime)"` + + // endSequence is the ending TCP sequence number of rackControl.seg. + endSequence seqnum.Value + + // fack is the highest selectively or cumulatively acknowledged + // sequence. + fack seqnum.Value + + // rtt is the RTT of the most recently delivered packet on the + // connection (either cumulatively acknowledged or selectively + // acknowledged) that was not marked invalid as a possible spurious + // retransmission. + rtt time.Duration +} + +// Update will update the RACK related fields when an ACK has been received. +// See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 +func (rc *rackControl) Update(seg *segment, ackSeg *segment, srtt time.Duration, offset uint32) { + rtt := time.Now().Sub(seg.xmitTime) + + // If the ACK is for a retransmitted packet, do not update if it is a + // spurious inference which is determined by below checks: + // 1. When Timestamping option is available, if the TSVal is less than the + // transmit time of the most recent retransmitted packet. 
+ // 2. When RTT calculated for the packet is less than the smoothed RTT + // for the connection. + // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 + // step 2 + if seg.xmitCount > 1 { + if ackSeg.parsedOptions.TS && ackSeg.parsedOptions.TSEcr != 0 { + if ackSeg.parsedOptions.TSEcr < tcpTimeStamp(seg.xmitTime, offset) { + return + } + } + if rtt < srtt { + return + } + } + + rc.rtt = rtt + // Update rc.xmitTime and rc.endSequence to the transmit time and + // ending sequence number of the packet which has been acknowledged + // most recently. + endSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) + if rc.xmitTime.Before(seg.xmitTime) || (seg.xmitTime.Equal(rc.xmitTime) && rc.endSequence.LessThan(endSeq)) { + rc.xmitTime = seg.xmitTime + rc.endSequence = endSeq + } +} diff --git a/pkg/tcpip/transport/tcp/rack_state.go b/pkg/tcpip/transport/tcp/rack_state.go new file mode 100644 index 000000000..c9dc7e773 --- /dev/null +++ b/pkg/tcpip/transport/tcp/rack_state.go @@ -0,0 +1,29 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "time" +) + +// saveXmitTime is invoked by stateify. +func (rc *rackControl) saveXmitTime() unixTime { + return unixTime{rc.xmitTime.Unix(), rc.xmitTime.UnixNano()} +} + +// loadXmitTime is invoked by stateify. +func (rc *rackControl) loadXmitTime(unix unixTime) { + rc.xmitTime = time.Unix(unix.second, unix.nano) +} diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go index dd89a292a..5e0bfe585 100644 --- a/pkg/tcpip/transport/tcp/rcv.go +++ b/pkg/tcpip/transport/tcp/rcv.go @@ -372,7 +372,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { // We only store the segment if it's within our buffer // size limit. if r.pendingBufUsed < r.pendingBufSize { - r.pendingBufUsed += s.logicalLen() + r.pendingBufUsed += seqnum.Size(s.segMemSize()) s.incRef() heap.Push(&r.pendingRcvdSegments, s) UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt) @@ -406,7 +406,7 @@ func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) { } heap.Pop(&r.pendingRcvdSegments) - r.pendingBufUsed -= s.logicalLen() + r.pendingBufUsed -= seqnum.Size(s.segMemSize()) s.decRef() } return false, nil diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go index 074edded6..94307d31a 100644 --- a/pkg/tcpip/transport/tcp/segment.go +++ b/pkg/tcpip/transport/tcp/segment.go @@ -35,6 +35,7 @@ type segment struct { id stack.TransportEndpointID `state:"manual"` route stack.Route `state:"manual"` data buffer.VectorisedView `state:".(buffer.VectorisedView)"` + hdr header.TCP // views is used as buffer for data when its length is large // enough to store a VectorisedView. 
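The Update method above advances the RACK state only when the acknowledged segment was transmitted later than the one currently recorded, using the ending sequence number to break ties between segments sent at the same time. A minimal sketch of that ordering rule, using a plain uint32 (wraparound ignored) instead of the seqnum package:

package main

import (
	"fmt"
	"time"
)

// rackState records the transmit time and ending sequence number of the most
// recently sent segment known to have been delivered.
type rackState struct {
	xmitTime    time.Time
	endSequence uint32
}

// advance updates the state for an acknowledged segment sent at xmit and
// ending at endSeq. Later transmit times win; equal times fall back to the
// higher ending sequence number, matching the comparison in Update.
func (r *rackState) advance(xmit time.Time, endSeq uint32) {
	if r.xmitTime.Before(xmit) || (xmit.Equal(r.xmitTime) && r.endSequence < endSeq) {
		r.xmitTime = xmit
		r.endSequence = endSeq
	}
}

func main() {
	var r rackState
	t0 := time.Now()
	r.advance(t0, 1000)
	r.advance(t0, 500)                       // same time, lower end: ignored
	r.advance(t0.Add(time.Millisecond), 700) // later transmit: wins
	fmt.Println(r.endSequence)               // 700
}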
views [8]buffer.View `state:"nosave"` @@ -60,13 +61,14 @@ type segment struct { xmitCount uint32 } -func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) *segment { +func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment { s := &segment{ refCnt: 1, id: id, route: r.Clone(), } s.data = pkt.Data.Clone(s.views[:]) + s.hdr = header.TCP(pkt.TransportHeader().View()) s.rcvdTime = time.Now() return s } @@ -136,6 +138,12 @@ func (s *segment) logicalLen() seqnum.Size { return l } +// segMemSize is the amount of memory used to hold the segment data and +// the associated metadata. +func (s *segment) segMemSize() int { + return segSize + s.data.Size() +} + // parse populates the sequence & ack numbers, flags, and window fields of the // segment from the TCP header stored in the data. It then updates the view to // skip the header. @@ -146,12 +154,6 @@ func (s *segment) logicalLen() seqnum.Size { // TCP checksum and stores the checksum and result of checksum verification in // the csum and csumValid fields of the segment. func (s *segment) parse() bool { - h, ok := s.data.PullUp(header.TCPMinimumSize) - if !ok { - return false - } - hdr := header.TCP(h) - // h is the header followed by the payload. We check that the offset to // the data respects the following constraints: // 1. That it's at least the minimum header size; if we don't do this @@ -162,16 +164,12 @@ func (s *segment) parse() bool { // N.B. The segment has already been validated as having at least the // minimum TCP size before reaching here, so it's safe to read the // fields. - offset := int(hdr.DataOffset()) - if offset < header.TCPMinimumSize { - return false - } - hdrWithOpts, ok := s.data.PullUp(offset) - if !ok { + offset := int(s.hdr.DataOffset()) + if offset < header.TCPMinimumSize || offset > len(s.hdr) { return false } - s.options = []byte(hdrWithOpts[header.TCPMinimumSize:]) + s.options = []byte(s.hdr[header.TCPMinimumSize:]) s.parsedOptions = header.ParseTCPOptions(s.options) // Query the link capabilities to decide if checksum validation is @@ -180,22 +178,19 @@ func (s *segment) parse() bool { if s.route.Capabilities()&stack.CapabilityRXChecksumOffload != 0 { s.csumValid = true verifyChecksum = false - s.data.TrimFront(offset) } if verifyChecksum { - hdr = header.TCP(hdrWithOpts) - s.csum = hdr.Checksum() - xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size())) - xsum = hdr.CalculateChecksum(xsum) - s.data.TrimFront(offset) + s.csum = s.hdr.Checksum() + xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()+len(s.hdr))) + xsum = s.hdr.CalculateChecksum(xsum) xsum = header.ChecksumVV(s.data, xsum) s.csumValid = xsum == 0xffff } - s.sequenceNumber = seqnum.Value(hdr.SequenceNumber()) - s.ackNumber = seqnum.Value(hdr.AckNumber()) - s.flags = hdr.Flags() - s.window = seqnum.Size(hdr.WindowSize()) + s.sequenceNumber = seqnum.Value(s.hdr.SequenceNumber()) + s.ackNumber = seqnum.Value(s.hdr.AckNumber()) + s.flags = s.hdr.Flags() + s.window = seqnum.Size(s.hdr.WindowSize()) return true } diff --git a/pkg/tcpip/transport/tcp/segment_unsafe.go b/pkg/tcpip/transport/tcp/segment_unsafe.go new file mode 100644 index 000000000..0ab7b8f56 --- /dev/null +++ b/pkg/tcpip/transport/tcp/segment_unsafe.go @@ -0,0 +1,23 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "unsafe" +) + +const ( + segSize = int(unsafe.Sizeof(segment{})) +) diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go index 06dc9b7d7..c55589c45 100644 --- a/pkg/tcpip/transport/tcp/snd.go +++ b/pkg/tcpip/transport/tcp/snd.go @@ -191,6 +191,10 @@ type sender struct { // cc is the congestion control algorithm in use for this sender. cc congestionControl + + // rc has the fields needed for implementing RACK loss detection + // algorithm. + rc rackControl } // rtt is a synchronization wrapper used to appease stateify. See the comment @@ -618,6 +622,20 @@ func (s *sender) splitSeg(seg *segment, size int) { nSeg.data.TrimFront(size) nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) s.writeList.InsertAfter(seg, nSeg) + + // The segment being split does not carry PUSH flag because it is + // followed by the newly split segment. + // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered + // segment (i.e., when there is no more queued data to be sent). + // Linux removes PSH flag only when the segment is being split over MSS + // and retains it when we are splitting the segment over lack of sender + // window space. + // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() + // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() + if seg.data.Size() > s.maxPayloadSize { + seg.flags ^= header.TCPFlagPsh + } + seg.data.CapLength(size) } @@ -739,7 +757,7 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se if !s.isAssignedSequenceNumber(seg) { // Merge segments if allowed. if seg.data.Size() != 0 { - available := int(seg.sequenceNumber.Size(end)) + available := int(s.sndNxt.Size(end)) if available > limit { available = limit } @@ -782,8 +800,11 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se // sent all at once. return false } - if atomic.LoadUint32(&s.ep.cork) != 0 { - // Hold back the segment until full. + // With TCP_CORK, hold back until minimum of the available + // send space and MSS. + // TODO(gvisor.dev/issue/2833): Drain the held segments after a + // timeout. + if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 { return false } } @@ -824,10 +845,52 @@ func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (se if available == 0 { return false } + + // If the whole segment or at least 1MSS sized segment cannot + // be accomodated in the receiver advertized window, skip + // splitting and sending of the segment. ref: + // net/ipv4/tcp_output.c::tcp_snd_wnd_test() + // + // Linux checks this for all segment transmits not triggered by + // a probe timer. On this condition, it defers the segment split + // and transmit to a short probe timer. + // + // ref: include/net/tcp.h::tcp_check_probe_timer() + // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() + // + // Instead of defining a new transmit timer, we attempt to split + // the segment right here if there are no pending segments. 
If + // there are pending segments, segment transmits are deferred to + // the retransmit timer handler. + if s.sndUna != s.sndNxt { + switch { + case available >= seg.data.Size(): + // OK to send, the whole segments fits in the + // receiver's advertised window. + case available >= s.maxPayloadSize: + // OK to send, at least 1 MSS sized segment fits + // in the receiver's advertised window. + default: + return false + } + } + + // The segment size limit is computed as a function of sender + // congestion window and MSS. When sender congestion window is > + // 1, this limit can be larger than MSS. Ensure that the + // currently available send space is not greater than minimum of + // this limit and MSS. if available > limit { available = limit } + // If GSO is not in use then cap available to + // maxPayloadSize. When GSO is in use the gVisor GSO logic or + // the host GSO logic will cap the segment to the correct size. + if s.ep.gso == nil && available > s.maxPayloadSize { + available = s.maxPayloadSize + } + if seg.data.Size() > available { s.splitSeg(seg, available) } @@ -1213,21 +1276,21 @@ func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) { // handleRcvdSegment is called when a segment is received; it is responsible for // updating the send-related state. -func (s *sender) handleRcvdSegment(seg *segment) { +func (s *sender) handleRcvdSegment(rcvdSeg *segment) { // Check if we can extract an RTT measurement from this ack. - if !seg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(seg.ackNumber) { + if !rcvdSeg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { s.updateRTO(time.Now().Sub(s.rttMeasureTime)) s.rttMeasureSeqNum = s.sndNxt } // Update Timestamp if required. See RFC7323, section-4.3. - if s.ep.sendTSOk && seg.parsedOptions.TS { - s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber) + if s.ep.sendTSOk && rcvdSeg.parsedOptions.TS { + s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.maxSentAck, rcvdSeg.sequenceNumber) } // Insert SACKBlock information into our scoreboard. if s.ep.sackPermitted { - for _, sb := range seg.parsedOptions.SACKBlocks { + for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { // Only insert the SACK block if the following holds // true: // * SACK block acks data after the ack number in the @@ -1240,27 +1303,27 @@ func (s *sender) handleRcvdSegment(seg *segment) { // NOTE: This check specifically excludes DSACK blocks // which have start/end before sndUna and are used to // indicate spurious retransmissions. - if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { + if rcvdSeg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { s.ep.scoreboard.Insert(sb) - seg.hasNewSACKInfo = true + rcvdSeg.hasNewSACKInfo = true } } s.SetPipe() } // Count the duplicates and do the fast retransmit if needed. - rtx := s.checkDuplicateAck(seg) + rtx := s.checkDuplicateAck(rcvdSeg) // Stash away the current window size. - s.sndWnd = seg.window + s.sndWnd = rcvdSeg.window - ack := seg.ackNumber + ack := rcvdSeg.ackNumber // Disable zero window probing if remote advertizes a non-zero receive // window. This can be with an ACK to the zero window probe (where the // acknumber refers to the already acknowledged byte) OR to any previously // unacknowledged segment. 
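The transmit path above adopts Linux's receiver-window admission test: once there is unacknowledged data in flight, a segment is split and sent only when the advertised window has room for either the whole segment or at least one MSS, and anything that fails the test is left to the retransmit timer. A small sketch of just that predicate (the helper is hypothetical, not part of the patch):

package main

import "fmt"

// canTransmitNow reports whether a segment of size segSize should be split
// and sent given the space left in the receiver's advertised window. With
// nothing outstanding the sender always tries, so a zero-window probe or a
// split can still happen.
func canTransmitNow(available, segSize, mss int, outstanding bool) bool {
	if !outstanding {
		return true
	}
	return available >= segSize || available >= mss
}

func main() {
	const mss = 1460
	fmt.Println(canTransmitNow(4000, 3000, mss, true)) // whole segment fits: true
	fmt.Println(canTransmitNow(1500, 3000, mss, true)) // at least one MSS fits: true
	fmt.Println(canTransmitNow(800, 3000, mss, true))  // defer to the timer: false
}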
- if s.zeroWindowProbing && seg.window > 0 && + if s.zeroWindowProbing && rcvdSeg.window > 0 && (ack == s.sndUna || (ack-1).InRange(s.sndUna, s.sndNxt)) { s.disableZeroWindowProbing() } @@ -1285,10 +1348,10 @@ func (s *sender) handleRcvdSegment(seg *segment) { // averaged RTT measurement only if the segment acknowledges // some new data, i.e., only if it advances the left edge of // the send window. - if s.ep.sendTSOk && seg.parsedOptions.TSEcr != 0 { + if s.ep.sendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { // TSVal/Ecr values sent by Netstack are at a millisecond // granularity. - elapsed := time.Duration(s.ep.timestamp()-seg.parsedOptions.TSEcr) * time.Millisecond + elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond s.updateRTO(elapsed) } @@ -1302,6 +1365,9 @@ func (s *sender) handleRcvdSegment(seg *segment) { ackLeft := acked originalOutstanding := s.outstanding + s.rtt.Lock() + srtt := s.rtt.srtt + s.rtt.Unlock() for ackLeft > 0 { // We use logicalLen here because we can have FIN // segments (which are always at the end of list) that @@ -1321,6 +1387,11 @@ func (s *sender) handleRcvdSegment(seg *segment) { s.writeNext = seg.Next() } + // Update the RACK fields if SACK is enabled. + if s.ep.sackPermitted { + s.rc.Update(seg, rcvdSeg, srtt, s.ep.tsOffset) + } + s.writeList.Remove(seg) // if SACK is enabled then Only reduce outstanding if @@ -1376,7 +1447,7 @@ func (s *sender) handleRcvdSegment(seg *segment) { // that the window opened up, or the congestion window was inflated due // to a duplicate ack during fast recovery. This will also re-enable // the retransmit timer if needed. - if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || seg.hasNewSACKInfo { + if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || rcvdSeg.hasNewSACKInfo { s.sendData() } } diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go index 5fe23113b..b9993ce1a 100644 --- a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go +++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go @@ -50,7 +50,7 @@ func TestFastRecovery(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Do slow start for a few iterations. @@ -90,14 +90,14 @@ func TestFastRecovery(t *testing.T) { // Wait before checking metrics. metricPollFn := func() error { if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want { - return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %d, want = %d", got, want) } if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want { - return fmt.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.Retransmit.Value = %d, want = %d", got, want) } if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want { - return fmt.Errorf("got stats.TCP.FastRecovery.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.FastRecovery.Value = %d, want = %d", got, want) } return nil } @@ -128,10 +128,10 @@ func TestFastRecovery(t *testing.T) { // Wait before checking metrics. 
metricPollFn = func() error { if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want { - return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %d, want = %d", got, want) } if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want { - return fmt.Errorf("got stats.TCP.Retransmit.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.Retransmit.Value = %d, want = %d", got, want) } return nil } @@ -215,7 +215,7 @@ func TestExponentialIncreaseDuringSlowStart(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } expected := tcp.InitialCwnd @@ -257,7 +257,7 @@ func TestCongestionAvoidance(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Do slow start for a few iterations. @@ -362,7 +362,7 @@ func TestCubicCongestionAvoidance(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Do slow start for a few iterations. @@ -471,11 +471,11 @@ func TestRetransmit(t *testing.T) { // MTU size though. half := data[:len(data)/2] if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } half = data[len(data)/2:] if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Do slow start for a few iterations. 
@@ -508,23 +508,23 @@ func TestRetransmit(t *testing.T) { metricPollFn := func() error { if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want { - return fmt.Errorf("got stats.TCP.Timeouts.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.Timeouts.Value = %d, want = %d", got, want) } if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want { - return fmt.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.Retransmits.Value = %d, want = %d", got, want) } if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Timeouts.Value(), uint64(1); got != want { - return fmt.Errorf("got EP SendErrors.Timeouts.Value = %v, want = %v", got, want) + return fmt.Errorf("got EP SendErrors.Timeouts.Value = %d, want = %d", got, want) } if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(1); got != want { - return fmt.Errorf("got EP stats SendErrors.Retransmits.Value = %v, want = %v", got, want) + return fmt.Errorf("got EP stats SendErrors.Retransmits.Value = %d, want = %d", got, want) } if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want { - return fmt.Errorf("got stats.TCP.SlowStartRetransmits.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.SlowStartRetransmits.Value = %d, want = %d", got, want) } return nil diff --git a/pkg/tcpip/transport/tcp/tcp_rack_test.go b/pkg/tcpip/transport/tcp/tcp_rack_test.go new file mode 100644 index 000000000..e03f101e8 --- /dev/null +++ b/pkg/tcpip/transport/tcp/tcp_rack_test.go @@ -0,0 +1,74 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp_test + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/tcpip" + "gvisor.dev/gvisor/pkg/tcpip/buffer" + "gvisor.dev/gvisor/pkg/tcpip/header" + "gvisor.dev/gvisor/pkg/tcpip/stack" + "gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context" +) + +// TestRACKUpdate tests the RACK related fields are updated when an ACK is +// received on a SACK enabled connection. +func TestRACKUpdate(t *testing.T) { + const maxPayload = 10 + const tsOptionSize = 12 + const maxTCPOptionSize = 40 + + c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxTCPOptionSize+maxPayload)) + defer c.Cleanup() + + var xmitTime time.Time + c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) { + // Validate that the endpoint Sender.RACKState is what we expect. 
+ if state.Sender.RACKState.XmitTime.Before(xmitTime) { + t.Fatalf("RACK transmit time failed to update when an ACK is received") + } + + gotSeq := state.Sender.RACKState.EndSequence + wantSeq := state.Sender.SndNxt + if !gotSeq.LessThanEq(wantSeq) || gotSeq.LessThan(wantSeq) { + t.Fatalf("RACK sequence number failed to update, got: %v, but want: %v", gotSeq, wantSeq) + } + + if state.Sender.RACKState.RTT == 0 { + t.Fatalf("RACK RTT failed to update when an ACK is received") + } + }) + setStackSACKPermitted(t, c, true) + createConnectedWithSACKAndTS(c) + + data := buffer.NewView(maxPayload) + for i := range data { + data[i] = byte(i) + } + + // Write the data. + xmitTime = time.Now() + if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + + bytesRead := 0 + c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize) + bytesRead += maxPayload + c.SendAck(790, bytesRead) + time.Sleep(200 * time.Millisecond) +} diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go index ace79b7b2..99521f0c1 100644 --- a/pkg/tcpip/transport/tcp/tcp_sack_test.go +++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go @@ -47,7 +47,7 @@ func createConnectedWithSACKAndTS(c *context.Context) *context.RawEndpoint { func setStackSACKPermitted(t *testing.T, c *context.Context, enable bool) { t.Helper() if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enable)); err != nil { - t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, SACKEnabled(%v) = %v", enable, err) + t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, SACKEnabled(%t) = %s", enable, err) } } @@ -400,7 +400,7 @@ func TestSACKRecovery(t *testing.T) { // Write all the data in one shot. Packets will only be written at the // MTU size though. if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Do slow start for a few iterations. @@ -454,7 +454,7 @@ func TestSACKRecovery(t *testing.T) { } for _, s := range stats { if got, want := s.stat.Value(), s.want; got != want { - return fmt.Errorf("got %s.Value() = %v, want = %v", s.name, got, want) + return fmt.Errorf("got %s.Value() = %d, want = %d", s.name, got, want) } } return nil @@ -529,19 +529,19 @@ func TestSACKRecovery(t *testing.T) { // In SACK recovery only the first segment is fast retransmitted when // entering recovery. 
if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want { - return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %d, want = %d", got, want) } if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.FastRetransmit.Value(), uint64(1); got != want { - return fmt.Errorf("got EP stats SendErrors.FastRetransmit = %v, want = %v", got, want) + return fmt.Errorf("got EP stats SendErrors.FastRetransmit = %d, want = %d", got, want) } if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(4); got != want { - return fmt.Errorf("got stats.TCP.Retransmits.Value = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.Retransmits.Value = %d, want = %d", got, want) } if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(4); got != want { - return fmt.Errorf("got EP stats Stats.SendErrors.Retransmits = %v, want = %v", got, want) + return fmt.Errorf("got EP stats Stats.SendErrors.Retransmits = %d, want = %d", got, want) } return nil } diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go index 6ef32a1b3..0f7e958e4 100644 --- a/pkg/tcpip/transport/tcp/tcp_test.go +++ b/pkg/tcpip/transport/tcp/tcp_test.go @@ -57,7 +57,7 @@ func TestGiveUpConnect(t *testing.T) { var wq waiter.Queue ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } // Register for notification, then start connection attempt. @@ -66,7 +66,7 @@ func TestGiveUpConnect(t *testing.T) { defer wq.EventUnregister(&waitEntry) if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrConnectStarted) + t.Fatalf("got ep.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted) } // Close the connection, wait for completion. @@ -75,21 +75,21 @@ func TestGiveUpConnect(t *testing.T) { // Wait for ep to become writable. <-notifyCh if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted { - t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %v, want = %v", err, tcpip.ErrAborted) + t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %s, want = %s", err, tcpip.ErrAborted) } // Call Connect again to retreive the handshake failure status // and stats updates. if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAborted { - t.Fatalf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrAborted) + t.Fatalf("got ep.Connect(...) 
= %s, want = %s", err, tcpip.ErrAborted) } if got := c.Stack().Stats().TCP.FailedConnectionAttempts.Value(); got != 1 { - t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = 1", got) + t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %d, want = 1", got) } if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) } } @@ -102,7 +102,7 @@ func TestConnectIncrementActiveConnection(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) if got := stats.TCP.ActiveConnectionOpenings.Value(); got != want { - t.Errorf("got stats.TCP.ActtiveConnectionOpenings.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.ActtiveConnectionOpenings.Value() = %d, want = %d", got, want) } } @@ -115,10 +115,10 @@ func TestConnectDoesNotIncrementFailedConnectionAttempts(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) if got := stats.TCP.FailedConnectionAttempts.Value(); got != want { - t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %d, want = %d", got, want) } if got := c.EP.Stats().(*tcp.Stats).FailedConnectionAttempts.Value(); got != want { - t.Errorf("got EP stats.FailedConnectionAttempts = %v, want = %v", got, want) + t.Errorf("got EP stats.FailedConnectionAttempts = %d, want = %d", got, want) } } @@ -129,20 +129,38 @@ func TestActiveFailedConnectionAttemptIncrement(t *testing.T) { stats := c.Stack().Stats() ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } c.EP = ep want := stats.TCP.FailedConnectionAttempts.Value() + 1 if err := c.EP.Connect(tcpip.FullAddress{NIC: 2, Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrNoRoute { - t.Errorf("got c.EP.Connect(...) = %v, want = %v", err, tcpip.ErrNoRoute) + t.Errorf("got c.EP.Connect(...) = %s, want = %s", err, tcpip.ErrNoRoute) } if got := stats.TCP.FailedConnectionAttempts.Value(); got != want { - t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %d, want = %d", got, want) } if got := c.EP.Stats().(*tcp.Stats).FailedConnectionAttempts.Value(); got != want { - t.Errorf("got EP stats FailedConnectionAttempts = %v, want = %v", got, want) + t.Errorf("got EP stats FailedConnectionAttempts = %d, want = %d", got, want) + } +} + +func TestCloseWithoutConnect(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + // Create TCP endpoint. 
+ var err *tcpip.Error + c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) + if err != nil { + t.Fatalf("NewEndpoint failed: %s", err) + } + + c.EP.Close() + + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) } } @@ -156,10 +174,10 @@ func TestTCPSegmentsSentIncrement(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) if got := stats.TCP.SegmentsSent.Value(); got != want { - t.Errorf("got stats.TCP.SegmentsSent.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.SegmentsSent.Value() = %d, want = %d", got, want) } if got := c.EP.Stats().(*tcp.Stats).SegmentsSent.Value(); got != want { - t.Errorf("got EP stats SegmentsSent.Value() = %v, want = %v", got, want) + t.Errorf("got EP stats SegmentsSent.Value() = %d, want = %d", got, want) } } @@ -170,16 +188,16 @@ func TestTCPResetsSentIncrement(t *testing.T) { wq := &waiter.Queue{} ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } want := stats.TCP.SegmentsSent.Value() + 1 if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := ep.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } // Send a SYN request. @@ -213,7 +231,7 @@ func TestTCPResetsSentIncrement(t *testing.T) { metricPollFn := func() error { if got := stats.TCP.ResetsSent.Value(); got != want { - return fmt.Errorf("got stats.TCP.ResetsSent.Value() = %v, want = %v", got, want) + return fmt.Errorf("got stats.TCP.ResetsSent.Value() = %d, want = %d", got, want) } return nil } @@ -292,7 +310,7 @@ func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) { // are released instantly on Close. tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond) if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil { - t.Fatalf("e.stack.SetTransportProtocolOption(%d, %v) = %v", tcp.ProtocolNumber, tcpTW, err) + t.Fatalf("e.stack.SetTransportProtocolOption(%d, %#v) = %s", tcp.ProtocolNumber, tcpTW, err) } c.EP.Close() @@ -355,7 +373,7 @@ func TestTCPResetsReceivedIncrement(t *testing.T) { }) if got := stats.TCP.ResetsReceived.Value(); got != want { - t.Errorf("got stats.TCP.ResetsReceived.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.ResetsReceived.Value() = %d, want = %d", got, want) } } @@ -379,7 +397,7 @@ func TestTCPResetsDoNotGenerateResets(t *testing.T) { }) if got := stats.TCP.ResetsReceived.Value(); got != want { - t.Errorf("got stats.TCP.ResetsReceived.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.ResetsReceived.Value() = %d, want = %d", got, want) } c.CheckNoPacketTimeout("got an unexpected packet", 100*time.Millisecond) } @@ -403,7 +421,7 @@ func TestNonBlockingClose(t *testing.T) { t0 := time.Now() ep.Close() if diff := time.Now().Sub(t0); diff > 3*time.Second { - t.Fatalf("Took too long to close: %v", diff) + t.Fatalf("Took too long to close: %s", diff) } } @@ -415,7 +433,7 @@ func TestConnectResetAfterClose(t *testing.T) { // after 3 second in FIN_WAIT2 state. 
tcpLingerTimeout := 3 * time.Second if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)); err != nil { - t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%d) failed: %s", tcpLingerTimeout, err) + t.Fatalf("c.stack.SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%s) failed: %s", tcpLingerTimeout, err) } c.CreateConnected(789, 30000, -1 /* epRcvBuf */) @@ -497,11 +515,11 @@ func TestCurrentConnectedIncrement(t *testing.T) { c.EP = nil if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 1 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 1", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 1", got) } gotConnected := c.Stack().Stats().TCP.CurrentConnected.Value() if gotConnected != 1 { - t.Errorf("got stats.TCP.CurrentConnected.Value() = %v, want = 1", gotConnected) + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 1", gotConnected) } ep.Close() @@ -524,10 +542,10 @@ func TestCurrentConnectedIncrement(t *testing.T) { }) if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) } if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != gotConnected { - t.Errorf("got stats.TCP.CurrentConnected.Value() = %v, want = %v", got, gotConnected) + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = %d", got, gotConnected) } // Ack and send FIN as well. @@ -556,10 +574,10 @@ func TestCurrentConnectedIncrement(t *testing.T) { time.Sleep(1200 * time.Millisecond) if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) } if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentConnected.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) } } @@ -575,7 +593,7 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { c.EP = nil if got, want := tcp.EndpointState(ep.State()), tcp.StateEstablished; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %d, got %d", want, got) } // Send a FIN for ESTABLISHED --> CLOSED-WAIT @@ -603,7 +621,7 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { time.Sleep(10 * time.Millisecond) if got, want := tcp.EndpointState(ep.State()), tcp.StateCloseWait; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %d, got %d", want, got) } // Close the application endpoint for CLOSE_WAIT --> LAST_ACK @@ -620,7 +638,7 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { ) if got, want := tcp.EndpointState(ep.State()), tcp.StateLastAck; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } // Pause the endpoint`s protocolMainLoop. @@ -657,15 +675,15 @@ func TestClosingWithEnqueuedSegments(t *testing.T) { // Expect the endpoint to be closed. 
if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != 1 { - t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = 1", got) + t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %d, want = 1", got) } if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) } // Check if the endpoint was moved to CLOSED and netstack a reset in @@ -691,7 +709,7 @@ func TestSimpleReceive(t *testing.T) { defer c.WQ.EventUnregister(&we) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } data := []byte{1, 2, 3} @@ -714,7 +732,7 @@ func TestSimpleReceive(t *testing.T) { // Receive data. v, _, err := c.EP.Read(nil) if err != nil { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } if !bytes.Equal(data, v) { @@ -781,7 +799,7 @@ func TestUserSuppliedMSSOnConnectV4(t *testing.T) { // Start connection attempt to IPv4 address. if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("Unexpected return value from Connect: %v", err) + t.Fatalf("unexpected return value from Connect: %s", err) } // Receive SYN packet with our user supplied MSS. @@ -842,7 +860,7 @@ func TestUserSuppliedMSSOnConnectV6(t *testing.T) { // Start connection attempt to IPv6 address. if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestV6Addr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("Unexpected return value from Connect: %v", err) + t.Fatalf("unexpected return value from Connect: %s", err) } // Receive SYN packet with our user supplied MSS. @@ -1239,7 +1257,7 @@ func TestConnectBindToDevice(t *testing.T) { defer c.WQ.EventUnregister(&waitEntry) if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("Unexpected return value from Connect: %v", err) + t.Fatalf("unexpected return value from Connect: %s", err) } // Receive SYN packet. 
@@ -1251,7 +1269,7 @@ func TestConnectBindToDevice(t *testing.T) { ), ) if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want { - t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got) + t.Fatalf("unexpected endpoint state: want %s, got %s", want, got) } tcpHdr := header.TCP(header.IPv4(b).Payload()) c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) @@ -1270,74 +1288,97 @@ func TestConnectBindToDevice(t *testing.T) { c.GetPacket() if got, want := tcp.EndpointState(c.EP.State()), test.want; got != want { - t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got) + t.Fatalf("unexpected endpoint state: want %s, got %s", want, got) } }) } } -func TestRstOnSynSent(t *testing.T) { - c := context.New(t, defaultMTU) - defer c.Cleanup() +func TestSynSent(t *testing.T) { + for _, test := range []struct { + name string + reset bool + }{ + {"RstOnSynSent", true}, + {"CloseOnSynSent", false}, + } { + t.Run(test.name, func(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() - // Create an endpoint, don't handshake because we want to interfere with the - // handshake process. - c.Create(-1) + // Create an endpoint, don't handshake because we want to interfere with the + // handshake process. + c.Create(-1) - // Start connection attempt. - waitEntry, ch := waiter.NewChannelEntry(nil) - c.WQ.EventRegister(&waitEntry, waiter.EventOut) - defer c.WQ.EventUnregister(&waitEntry) + // Start connection attempt. + waitEntry, ch := waiter.NewChannelEntry(nil) + c.WQ.EventRegister(&waitEntry, waiter.EventOut) + defer c.WQ.EventUnregister(&waitEntry) - addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort} - if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted { - t.Fatalf("got Connect(%+v) = %v, want %s", addr, err, tcpip.ErrConnectStarted) - } + addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort} + if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted { + t.Fatalf("got Connect(%+v) = %s, want %s", addr, err, tcpip.ErrConnectStarted) + } - // Receive SYN packet. - b := c.GetPacket() - checker.IPv4(t, b, - checker.TCP( - checker.DstPort(context.TestPort), - checker.TCPFlags(header.TCPFlagSyn), - ), - ) + // Receive SYN packet. + b := c.GetPacket() + checker.IPv4(t, b, + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlags(header.TCPFlagSyn), + ), + ) - // Ensure that we've reached SynSent state - if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want { - t.Fatalf("got State() = %s, want %s", got, want) - } - tcpHdr := header.TCP(header.IPv4(b).Payload()) - c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) + if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want { + t.Fatalf("got State() = %s, want %s", got, want) + } + tcpHdr := header.TCP(header.IPv4(b).Payload()) + c.IRS = seqnum.Value(tcpHdr.SequenceNumber()) - // Send a packet with a proper ACK and a RST flag to cause the socket - // to Error and close out - iss := seqnum.Value(789) - rcvWnd := seqnum.Size(30000) - c.SendPacket(nil, &context.Headers{ - SrcPort: tcpHdr.DestinationPort(), - DstPort: tcpHdr.SourcePort(), - Flags: header.TCPFlagRst | header.TCPFlagAck, - SeqNum: iss, - AckNum: c.IRS.Add(1), - RcvWnd: rcvWnd, - TCPOpts: nil, - }) + if test.reset { + // Send a packet with a proper ACK and a RST flag to cause the socket + // to error and close out. 
+ iss := seqnum.Value(789) + rcvWnd := seqnum.Size(30000) + c.SendPacket(nil, &context.Headers{ + SrcPort: tcpHdr.DestinationPort(), + DstPort: tcpHdr.SourcePort(), + Flags: header.TCPFlagRst | header.TCPFlagAck, + SeqNum: iss, + AckNum: c.IRS.Add(1), + RcvWnd: rcvWnd, + TCPOpts: nil, + }) + } else { + c.EP.Close() + } - // Wait for receive to be notified. - select { - case <-ch: - case <-time.After(3 * time.Second): - t.Fatal("timed out waiting for packet to arrive") - } + // Wait for receive to be notified. + select { + case <-ch: + case <-time.After(3 * time.Second): + t.Fatal("timed out waiting for packet to arrive") + } - if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused { - t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrConnectionRefused) - } + if test.reset { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused { + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionRefused) + } + } else { + if _, _, err := c.EP.Read(nil); err != tcpip.ErrAborted { + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrAborted) + } + } - // Due to the RST the endpoint should be in an error state. - if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { - t.Fatalf("got State() = %s, want %s", got, want) + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) + } + + // Due to the RST the endpoint should be in an error state. + if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { + t.Fatalf("got State() = %s, want %s", got, want) + } + }) } } @@ -1352,7 +1393,7 @@ func TestOutOfOrderReceive(t *testing.T) { defer c.WQ.EventUnregister(&we) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } // Send second half of data first, with seqnum 3 ahead of expected. @@ -1379,7 +1420,7 @@ func TestOutOfOrderReceive(t *testing.T) { // Wait 200ms and check that no data has been received. time.Sleep(200 * time.Millisecond) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } // Send the first 3 bytes now. @@ -1406,7 +1447,7 @@ func TestOutOfOrderReceive(t *testing.T) { } continue } - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } read = append(read, v...) @@ -1436,7 +1477,7 @@ func TestOutOfOrderFlood(t *testing.T) { c.CreateConnected(789, 30000, 10) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } // Send 100 packets before the actual one that is expected. @@ -1513,7 +1554,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) { defer c.WQ.EventUnregister(&we) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } data := []byte{1, 2, 3} @@ -1556,7 +1597,7 @@ func TestRstOnCloseWithUnreadData(t *testing.T) { )) // The RST puts the endpoint into an error state. 
if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } // This final ACK should be ignored because an ACK on a reset doesn't mean @@ -1582,7 +1623,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) { defer c.WQ.EventUnregister(&we) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } data := []byte{1, 2, 3} @@ -1624,7 +1665,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) { )) if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } // Cause a RST to be generated by closing the read end now since we have @@ -1643,7 +1684,7 @@ func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) { )) // The RST puts the endpoint into an error state. if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } // The ACK to the FIN should now be rejected since the connection has been @@ -1665,19 +1706,19 @@ func TestShutdownRead(t *testing.T) { c.CreateConnected(789, 30000, -1 /* epRcvBuf */) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } if err := c.EP.Shutdown(tcpip.ShutdownRead); err != nil { - t.Fatalf("Shutdown failed: %v", err) + t.Fatalf("Shutdown failed: %s", err) } if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrClosedForReceive) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrClosedForReceive) } var want uint64 = 1 if got := c.EP.Stats().(*tcp.Stats).ReadErrors.ReadClosed.Value(); got != want { - t.Fatalf("got EP stats Stats.ReadErrors.ReadClosed got %v want %v", got, want) + t.Fatalf("got EP stats Stats.ReadErrors.ReadClosed got %d want %d", got, want) } } @@ -1693,7 +1734,7 @@ func TestFullWindowReceive(t *testing.T) { _, _, err := c.EP.Read(nil) if err != tcpip.ErrWouldBlock { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } // Fill up the window. @@ -1728,7 +1769,7 @@ func TestFullWindowReceive(t *testing.T) { // Receive data and check it. v, _, err := c.EP.Read(nil) if err != nil { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } if !bytes.Equal(data, v) { @@ -1737,7 +1778,7 @@ func TestFullWindowReceive(t *testing.T) { var want uint64 = 1 if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.ZeroRcvWindowState.Value(); got != want { - t.Fatalf("got EP stats ReceiveErrors.ZeroRcvWindowState got %v want %v", got, want) + t.Fatalf("got EP stats ReceiveErrors.ZeroRcvWindowState got %d want %d", got, want) } // Check that we get an ACK for the newly non-zero window. 
@@ -1760,7 +1801,7 @@ func TestNoWindowShrinking(t *testing.T) { c.CreateConnected(789, 30000, 10) if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil { - t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %v", err) + t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %s", err) } we, ch := waiter.NewChannelEntry(nil) @@ -1768,7 +1809,7 @@ func TestNoWindowShrinking(t *testing.T) { defer c.WQ.EventUnregister(&we) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } // Send 3 bytes, check that the peer acknowledges them. @@ -1832,7 +1873,7 @@ func TestNoWindowShrinking(t *testing.T) { for len(read) < len(data) { v, _, err := c.EP.Read(nil) if err != nil { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } read = append(read, v...) @@ -1866,7 +1907,7 @@ func TestSimpleSend(t *testing.T) { copy(view, data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check that data is received. @@ -1908,7 +1949,7 @@ func TestZeroWindowSend(t *testing.T) { _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}) if err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check if we got a zero-window probe. @@ -1976,7 +2017,7 @@ func TestScaledWindowConnect(t *testing.T) { copy(view, data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check that data is received, and that advertised window is 0xbfff, @@ -2008,7 +2049,7 @@ func TestNonScaledWindowConnect(t *testing.T) { copy(view, data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check that data is received, and that advertised window is 0xffff, @@ -2036,21 +2077,21 @@ func TestScaledWindowAccept(t *testing.T) { wq := &waiter.Queue{} ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } defer ep.Close() // Set the window size greater than the maximum non-scaled window. if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil { - t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %v", err) + t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %s", err) } if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := ep.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } // Do 3-way handshake. 
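The scaled-window tests above set the receive buffer to 65535*3 and expect an advertised window of 0xbfff, which is only representable with a TCP window-scale shift (RFC 7323). A rough stdlib-only illustration of how such a shift is chosen; this is a sketch, not gvisor's tcp.FindWndScale:

    package main

    import "fmt"

    // windowScale returns the smallest shift that lets rcvBuf fit in the
    // 16-bit TCP window field, capped at the RFC 7323 maximum of 14.
    func windowScale(rcvBuf int) int {
        shift := 0
        for rcvBuf > 0xffff && shift < 14 {
            rcvBuf >>= 1
            shift++
        }
        return shift
    }

    func main() {
        fmt.Println(windowScale(65535 * 3)) // 2, and 65535*3 >> 2 == 0xbfff
    }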
@@ -2068,7 +2109,7 @@ func TestScaledWindowAccept(t *testing.T) { case <-ch: c.EP, _, err = ep.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -2081,7 +2122,7 @@ func TestScaledWindowAccept(t *testing.T) { copy(view, data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check that data is received, and that advertised window is 0xbfff, @@ -2109,21 +2150,21 @@ func TestNonScaledWindowAccept(t *testing.T) { wq := &waiter.Queue{} ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } defer ep.Close() // Set the window size greater than the maximum non-scaled window. if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil { - t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %v", err) + t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed failed: %s", err) } if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := ep.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } // Do 3-way handshake w/ window scaling disabled. The SYN-ACK to the SYN @@ -2142,7 +2183,7 @@ func TestNonScaledWindowAccept(t *testing.T) { case <-ch: c.EP, _, err = ep.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -2155,7 +2196,7 @@ func TestNonScaledWindowAccept(t *testing.T) { copy(view, data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check that data is received, and that advertised window is 0xffff, @@ -2244,7 +2285,7 @@ func TestZeroScaledWindowReceive(t *testing.T) { for sz < defaultMTU { v, _, err := c.EP.Read(nil) if err != nil { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } sz += len(v) } @@ -2311,7 +2352,7 @@ func TestSegmentMerging(t *testing.T) { allData = append(allData, data...) view := buffer.NewViewFromBytes(data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write #%d failed: %v", i+1, err) + t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2381,7 +2422,7 @@ func TestDelay(t *testing.T) { allData = append(allData, data...) 
view := buffer.NewViewFromBytes(data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write #%d failed: %v", i+1, err) + t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2428,7 +2469,7 @@ func TestUndelay(t *testing.T) { for i, data := range allData { view := buffer.NewViewFromBytes(data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write #%d failed: %v", i+1, err) + t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2512,7 +2553,7 @@ func TestMSSNotDelayed(t *testing.T) { for i, data := range allData { view := buffer.NewViewFromBytes(data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write #%d failed: %v", i+1, err) + t.Fatalf("Write #%d failed: %s", i+1, err) } } @@ -2563,7 +2604,7 @@ func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) { copy(view, data) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check that data is received in chunks. @@ -2631,7 +2672,7 @@ func TestSetTTL(t *testing.T) { var err *tcpip.Error c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } if err := c.EP.SetSockOptInt(tcpip.TTLOption, int(wantTTL)); err != nil { @@ -2639,7 +2680,7 @@ func TestSetTTL(t *testing.T) { } if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("Unexpected return value from Connect: %s", err) + t.Fatalf("unexpected return value from Connect: %s", err) } // Receive SYN packet. @@ -2671,7 +2712,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) { wq := &waiter.Queue{} ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } defer ep.Close() @@ -2683,11 +2724,11 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) { } if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := ep.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } // Do 3-way handshake. 
@@ -2705,7 +2746,7 @@ func TestPassiveSendMSSLessThanMTU(t *testing.T) { case <-ch: c.EP, _, err = ep.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -2794,7 +2835,7 @@ func TestForwarderSendMSSLessThanMTU(t *testing.T) { select { case err := <-ch: if err != nil { - t.Fatalf("Error creating endpoint: %v", err) + t.Fatalf("Error creating endpoint: %s", err) } case <-time.After(2 * time.Second): t.Fatalf("Timed out waiting for connection") @@ -2813,7 +2854,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { var err *tcpip.Error c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } // Set the buffer size to a deterministic size so that we can check the @@ -2830,7 +2871,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { defer c.WQ.EventUnregister(&we) if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("got c.EP.Connect(...) = %v, want = %v", err, tcpip.ErrConnectStarted) + t.Fatalf("got c.EP.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted) } // Receive SYN packet. @@ -2884,7 +2925,7 @@ func TestSynOptionsOnActiveConnect(t *testing.T) { select { case <-ch: if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil { - t.Fatalf("GetSockOpt failed: %v", err) + t.Fatalf("GetSockOpt failed: %s", err) } case <-time.After(1 * time.Second): t.Fatalf("Timed out waiting for connection") @@ -2899,22 +2940,22 @@ func TestCloseListener(t *testing.T) { var wq waiter.Queue ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } if err := ep.Bind(tcpip.FullAddress{}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := ep.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } // Close the listener and measure how long it takes. t0 := time.Now() ep.Close() if diff := time.Now().Sub(t0); diff > 3*time.Second { - t.Fatalf("Took too long to close: %v", diff) + t.Fatalf("Took too long to close: %s", diff) } } @@ -2950,22 +2991,25 @@ loop: case tcpip.ErrConnectionReset: break loop default: - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrConnectionReset) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) } } // Expect the state to be StateError and subsequent Reads to fail with HardError. 
if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrConnectionReset) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset) } if tcp.EndpointState(c.EP.State()) != tcp.StateError { t.Fatalf("got EP state is not StateError") } if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 { - t.Errorf("got stats.TCP.EstablishedResets.Value() = %v, want = 1", got) + t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got) } if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) + } + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) } } @@ -2990,7 +3034,7 @@ func TestSendOnResetConnection(t *testing.T) { // Try to write. view := buffer.NewView(10) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != tcpip.ErrConnectionReset { - t.Fatalf("got c.EP.Write(...) = %v, want = %v", err, tcpip.ErrConnectionReset) + t.Fatalf("got c.EP.Write(...) = %s, want = %s", err, tcpip.ErrConnectionReset) } } @@ -3013,7 +3057,7 @@ func TestMaxRetransmitsTimeout(t *testing.T) { _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) if err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Expect first transmit and MaxRetries retransmits. @@ -3048,7 +3092,10 @@ func TestMaxRetransmitsTimeout(t *testing.T) { ) if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 { - t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got) + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %d, want = 1", got) + } + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) } } @@ -3066,7 +3113,7 @@ func TestMaxRTO(t *testing.T) { _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{}) if err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } checker.IPv4(t, c.GetPacket(), checker.TCP( @@ -3089,6 +3136,63 @@ func TestMaxRTO(t *testing.T) { } } +// TestRetransmitIPv4IDUniqueness tests that the IPv4 Identification field is +// unique on retransmits. +func TestRetransmitIPv4IDUniqueness(t *testing.T) { + for _, tc := range []struct { + name string + size int + }{ + {"1Byte", 1}, + {"512Bytes", 512}, + } { + t.Run(tc.name, func(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */) + + // Disabling PMTU discovery causes all packets sent from this socket to + // have DF=0. This needs to be done because the IPv4 ID uniqueness + // applies only to non-atomic IPv4 datagrams as defined in RFC 6864 + // Section 4, and datagrams with DF=0 are non-atomic. 
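The comment above is the rationale for the new TestRetransmitIPv4IDUniqueness: with DF=0 the datagrams are non-atomic, so RFC 6864 requires distinct IPv4 Identification values across retransmits. The test below tracks IDs in a map; the same uniqueness check as a stdlib-only, hypothetical helper (not part of the diff):

    package main

    import "fmt"

    // allUnique reports whether every collected IPv4 Identification value
    // (one per transmitted copy of the segment) is distinct.
    func allUnique(ids []uint16) bool {
        seen := make(map[uint16]struct{}, len(ids))
        for _, id := range ids {
            if _, ok := seen[id]; ok {
                return false
            }
            seen[id] = struct{}{}
        }
        return true
    }

    func main() {
        fmt.Println(allUnique([]uint16{41, 42, 43})) // true
        fmt.Println(allUnique([]uint16{7, 7}))       // false
    }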
+ if err := c.EP.SetSockOptInt(tcpip.MTUDiscoverOption, tcpip.PMTUDiscoveryDont); err != nil { + t.Fatalf("disabling PMTU discovery via sockopt to force DF=0 failed: %s", err) + } + + if _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(tc.size)), tcpip.WriteOptions{}); err != nil { + t.Fatalf("Write failed: %s", err) + } + pkt := c.GetPacket() + checker.IPv4(t, pkt, + checker.FragmentFlags(0), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + idSet := map[uint16]struct{}{header.IPv4(pkt).ID(): struct{}{}} + // Expect two retransmitted packets, and that all packets received have + // unique IPv4 ID values. + for i := 0; i <= 2; i++ { + pkt := c.GetPacket() + checker.IPv4(t, pkt, + checker.FragmentFlags(0), + checker.TCP( + checker.DstPort(context.TestPort), + checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)), + ), + ) + id := header.IPv4(pkt).ID() + if _, exists := idSet[id]; exists { + t.Fatalf("duplicate IPv4 ID=%d found in retransmitted packet", id) + } + idSet[id] = struct{}{} + } + }) + } +} + func TestFinImmediately(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() @@ -3097,7 +3201,7 @@ func TestFinImmediately(t *testing.T) { // Shutdown immediately, check that we get a FIN. if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil { - t.Fatalf("Shutdown failed: %v", err) + t.Fatalf("Shutdown failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3140,7 +3244,7 @@ func TestFinRetransmit(t *testing.T) { // Shutdown immediately, check that we get a FIN. if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil { - t.Fatalf("Shutdown failed: %v", err) + t.Fatalf("Shutdown failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3195,7 +3299,7 @@ func TestFinWithNoPendingData(t *testing.T) { // Write something out, and have it acknowledged. view := buffer.NewView(10) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } next := uint32(c.IRS) + 1 @@ -3221,7 +3325,7 @@ func TestFinWithNoPendingData(t *testing.T) { // Shutdown, check that we get a FIN. if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil { - t.Fatalf("Shutdown failed: %v", err) + t.Fatalf("Shutdown failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3268,7 +3372,7 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { view := buffer.NewView(10) for i := tcp.InitialCwnd; i > 0; i-- { if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } } @@ -3290,7 +3394,7 @@ func TestFinWithPendingDataCwndFull(t *testing.T) { // because the congestion window doesn't allow it. Wait until a // retransmit is received. if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil { - t.Fatalf("Shutdown failed: %v", err) + t.Fatalf("Shutdown failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3354,7 +3458,7 @@ func TestFinWithPendingData(t *testing.T) { // Write something out, and acknowledge it to get cwnd to 2. view := buffer.NewView(10) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } next := uint32(c.IRS) + 1 @@ -3380,7 +3484,7 @@ func TestFinWithPendingData(t *testing.T) { // Write new data, but don't acknowledge it. 
if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3396,7 +3500,7 @@ func TestFinWithPendingData(t *testing.T) { // Shutdown the connection, check that we do get a FIN. if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil { - t.Fatalf("Shutdown failed: %v", err) + t.Fatalf("Shutdown failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3441,7 +3545,7 @@ func TestFinWithPartialAck(t *testing.T) { // FIN from the test side. view := buffer.NewView(10) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } next := uint32(c.IRS) + 1 @@ -3478,7 +3582,7 @@ func TestFinWithPartialAck(t *testing.T) { // Write new data, but don't acknowledge it. if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3494,7 +3598,7 @@ func TestFinWithPartialAck(t *testing.T) { // Shutdown the connection, check that we do get a FIN. if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil { - t.Fatalf("Shutdown failed: %v", err) + t.Fatalf("Shutdown failed: %s", err) } checker.IPv4(t, c.GetPacket(), @@ -3540,20 +3644,20 @@ func TestUpdateListenBacklog(t *testing.T) { var wq waiter.Queue ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } if err := ep.Bind(tcpip.FullAddress{}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := ep.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } // Update the backlog with another Listen() on the same endpoint. if err := ep.Listen(20); err != nil { - t.Fatalf("Listen failed to update backlog: %v", err) + t.Fatalf("Listen failed to update backlog: %s", err) } ep.Close() @@ -3585,7 +3689,7 @@ func scaledSendWindow(t *testing.T, scale uint8) { // Send some data. Check that it's capped by the window size. view := buffer.NewView(65535) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Check that only data that fits in the scaled window is sent. @@ -3631,18 +3735,18 @@ func TestReceivedValidSegmentCountIncrement(t *testing.T) { }) if got := stats.TCP.ValidSegmentsReceived.Value(); got != want { - t.Errorf("got stats.TCP.ValidSegmentsReceived.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.ValidSegmentsReceived.Value() = %d, want = %d", got, want) } if got := c.EP.Stats().(*tcp.Stats).SegmentsReceived.Value(); got != want { - t.Errorf("got EP stats Stats.SegmentsReceived = %v, want = %v", got, want) + t.Errorf("got EP stats Stats.SegmentsReceived = %d, want = %d", got, want) } // Ensure there were no errors during handshake. If these stats have // incremented, then the connection should not have been established. 
if got := c.EP.Stats().(*tcp.Stats).SendErrors.NoRoute.Value(); got != 0 { - t.Errorf("got EP stats Stats.SendErrors.NoRoute = %v, want = %v", got, 0) + t.Errorf("got EP stats Stats.SendErrors.NoRoute = %d, want = %d", got, 0) } if got := c.EP.Stats().(*tcp.Stats).SendErrors.NoLinkAddr.Value(); got != 0 { - t.Errorf("got EP stats Stats.SendErrors.NoLinkAddr = %v, want = %v", got, 0) + t.Errorf("got EP stats Stats.SendErrors.NoLinkAddr = %d, want = %d", got, 0) } } @@ -3666,10 +3770,10 @@ func TestReceivedInvalidSegmentCountIncrement(t *testing.T) { c.SendSegment(vv) if got := stats.TCP.InvalidSegmentsReceived.Value(); got != want { - t.Errorf("got stats.TCP.InvalidSegmentsReceived.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.InvalidSegmentsReceived.Value() = %d, want = %d", got, want) } if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.MalformedPacketsReceived.Value(); got != want { - t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %v, want = %v", got, want) + t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", got, want) } } @@ -3770,7 +3874,7 @@ func TestReadAfterClosedState(t *testing.T) { defer c.WQ.EventUnregister(&we) if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } // Shutdown immediately for write, check that we get a FIN. @@ -3789,7 +3893,7 @@ func TestReadAfterClosedState(t *testing.T) { ) if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } // Send some data and acknowledge the FIN. @@ -3818,7 +3922,7 @@ func TestReadAfterClosedState(t *testing.T) { time.Sleep(tcpTimeWaitTimeout * 2) if got, want := tcp.EndpointState(c.EP.State()), tcp.StateClose; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } // Wait for receive to be notified. @@ -3853,11 +3957,11 @@ func TestReadAfterClosedState(t *testing.T) { // Now that we drained the queue, check that functions fail with the // right error code. if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive { - t.Fatalf("got c.EP.Read(nil) = %v, want = %s", err, tcpip.ErrClosedForReceive) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrClosedForReceive) } if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive { - t.Fatalf("got c.EP.Peek(...) = %v, want = %s", err, tcpip.ErrClosedForReceive) + t.Fatalf("got c.EP.Peek(...) 
= %s, want = %s", err, tcpip.ErrClosedForReceive) } } @@ -3871,66 +3975,84 @@ func TestReusePort(t *testing.T) { var err *tcpip.Error c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { + t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) } if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } c.EP.Close() c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { + t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) } if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } c.EP.Close() // Second case, an endpoint that was bound and is connecting.. c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { + t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) } if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("got c.EP.Connect(...) = %v, want = %v", err, tcpip.ErrConnectStarted) + t.Fatalf("got c.EP.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted) } c.EP.Close() c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { + t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) } if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } c.EP.Close() // Third case, an endpoint that was bound and is listening. 
c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { + t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) } if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := c.EP.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } c.EP.Close() c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) + } + if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil { + t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err) } if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := c.EP.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } } @@ -3939,11 +4061,11 @@ func checkRecvBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { s, err := ep.GetSockOptInt(tcpip.ReceiveBufferSizeOption) if err != nil { - t.Fatalf("GetSockOpt failed: %v", err) + t.Fatalf("GetSockOpt failed: %s", err) } if int(s) != v { - t.Fatalf("got receive buffer size = %v, want = %v", s, v) + t.Fatalf("got receive buffer size = %d, want = %d", s, v) } } @@ -3952,11 +4074,11 @@ func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) { s, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption) if err != nil { - t.Fatalf("GetSockOpt failed: %v", err) + t.Fatalf("GetSockOpt failed: %s", err) } if int(s) != v { - t.Fatalf("got send buffer size = %v, want = %v", s, v) + t.Fatalf("got send buffer size = %d, want = %d", s, v) } } @@ -3969,7 +4091,7 @@ func TestDefaultBufferSizes(t *testing.T) { // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) } defer func() { if ep != nil { @@ -3981,28 +4103,34 @@ func TestDefaultBufferSizes(t *testing.T) { checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize) // Change the default send buffer size. - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{1, tcp.DefaultSendBufferSize * 2, tcp.DefaultSendBufferSize * 20}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{ + Min: 1, + Default: tcp.DefaultSendBufferSize * 2, + Max: tcp.DefaultSendBufferSize * 20}); err != nil { + t.Fatalf("SetTransportProtocolOption failed: %s", err) } ep.Close() ep, err = s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) } checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*2) checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize) // Change the default receive buffer size. 
- if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, tcp.DefaultReceiveBufferSize * 3, tcp.DefaultReceiveBufferSize * 30}); err != nil { + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{ + Min: 1, + Default: tcp.DefaultReceiveBufferSize * 3, + Max: tcp.DefaultReceiveBufferSize * 30}); err != nil { t.Fatalf("SetTransportProtocolOption failed: %v", err) } ep.Close() ep, err = s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) } checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*2) @@ -4018,17 +4146,17 @@ func TestMinMaxBufferSizes(t *testing.T) { // Check the default values. ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) } defer ep.Close() // Change the min/max values for send/receive - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{200, tcp.DefaultReceiveBufferSize * 2, tcp.DefaultReceiveBufferSize * 20}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 200, Default: tcp.DefaultReceiveBufferSize * 2, Max: tcp.DefaultReceiveBufferSize * 20}); err != nil { + t.Fatalf("SetTransportProtocolOption failed: %s", err) } - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{300, tcp.DefaultSendBufferSize * 3, tcp.DefaultSendBufferSize * 30}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 300, Default: tcp.DefaultSendBufferSize * 3, Max: tcp.DefaultSendBufferSize * 30}); err != nil { + t.Fatalf("SetTransportProtocolOption failed: %s", err) } // Set values below the min. 
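The buffer-size hunks above replace positional composite literals such as tcp.SendBufferSizeOption{1, x, y} with keyed Min/Default/Max fields, which survive field reordering and read unambiguously. A hedged sketch of the keyed form; the 4 KiB / 64 KiB / 1 MiB bounds are illustrative values, not gvisor defaults:

    import (
        "gvisor.dev/gvisor/pkg/tcpip"
        "gvisor.dev/gvisor/pkg/tcpip/stack"
        "gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
    )

    // setTCPSendBufferLimits applies keyed send-buffer bounds to the stack.
    func setTCPSendBufferLimits(s *stack.Stack) *tcpip.Error {
        return s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{
            Min:     4 << 10,
            Default: 64 << 10,
            Max:     1 << 20,
        })
    }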
@@ -4065,12 +4193,12 @@ func TestBindToDeviceOption(t *testing.T) { ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) } defer ep.Close() if err := s.CreateNIC(321, loopback.New()); err != nil { - t.Errorf("CreateNIC failed: %v", err) + t.Errorf("CreateNIC failed: %s", err) } // nicIDPtr is used instead of taking the address of NICID literals, which is @@ -4095,12 +4223,12 @@ func TestBindToDeviceOption(t *testing.T) { if testAction.setBindToDevice != nil { bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice) if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr { - t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr) + t.Errorf("SetSockOpt(%#v) got %v, want %v", bindToDevice, gotErr, wantErr) } } bindToDevice := tcpip.BindToDeviceOption(88888) if err := ep.GetSockOpt(&bindToDevice); err != nil { - t.Errorf("GetSockOpt got %v, want %v", err, nil) + t.Errorf("GetSockOpt got %s, want %v", err, nil) } if got, want := bindToDevice, testAction.getBindToDevice; got != want { t.Errorf("bindToDevice got %d, want %d", got, want) @@ -4166,12 +4294,12 @@ func TestSelfConnect(t *testing.T) { var wq waiter.Queue ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } defer ep.Close() if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } // Register for notification, then start connection attempt. @@ -4180,12 +4308,12 @@ func TestSelfConnect(t *testing.T) { defer wq.EventUnregister(&waitEntry) if err := ep.Connect(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != tcpip.ErrConnectStarted { - t.Fatalf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrConnectStarted) + t.Fatalf("got ep.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted) } <-notifyCh if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != nil { - t.Fatalf("Connect failed: %v", err) + t.Fatalf("Connect failed: %s", err) } // Write something. @@ -4193,7 +4321,7 @@ func TestSelfConnect(t *testing.T) { view := buffer.NewView(len(data)) copy(view, data) if _, _, err := ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } // Read back what was written. 
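TestBindToDeviceOption above round-trips a numeric NIC ID through SetSockOpt and GetSockOpt. A small sketch of that round trip outside the test harness, under the same assumptions the diff shows (numeric BindToDeviceOption, *tcpip.Error return values):

    import "gvisor.dev/gvisor/pkg/tcpip"

    // bindEndpointToNIC pins ep to a NIC and reads the value back, mirroring
    // the SetSockOpt/GetSockOpt pairing in TestBindToDeviceOption.
    func bindEndpointToNIC(ep tcpip.Endpoint, nic tcpip.NICID) (tcpip.BindToDeviceOption, *tcpip.Error) {
        opt := tcpip.BindToDeviceOption(nic)
        if err := ep.SetSockOpt(opt); err != nil {
            return 0, err
        }
        if err := ep.GetSockOpt(&opt); err != nil {
            return 0, err
        }
        return opt, nil
    }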
@@ -4202,12 +4330,12 @@ func TestSelfConnect(t *testing.T) { rd, _, err := ep.Read(nil) if err != nil { if err != tcpip.ErrWouldBlock { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } <-notifyCh rd, _, err = ep.Read(nil) if err != nil { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } } @@ -4291,7 +4419,7 @@ func TestConnectAvoidsBoundPorts(t *testing.T) { } ep, err := s.NewEndpoint(tcp.ProtocolNumber, networkProtocolNumber, &wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } eps = append(eps, ep) switch network { @@ -4342,7 +4470,7 @@ func TestConnectAvoidsBoundPorts(t *testing.T) { for i := ports.FirstEphemeral; i <= math.MaxUint16; i++ { if makeEP(exhaustedNetwork).Bind(tcpip.FullAddress{Addr: address(t, exhaustedAddressType, isAny), Port: uint16(i)}); err != nil { - t.Fatalf("Bind(%d) failed: %v", i, err) + t.Fatalf("Bind(%d) failed: %s", i, err) } } want := tcpip.ErrConnectStarted @@ -4350,7 +4478,7 @@ func TestConnectAvoidsBoundPorts(t *testing.T) { want = tcpip.ErrNoPortAvailable } if err := makeEP(candidateNetwork).Connect(tcpip.FullAddress{Addr: address(t, candidateAddressType, false), Port: 31337}); err != want { - t.Fatalf("got ep.Connect(..) = %v, want = %v", err, want) + t.Fatalf("got ep.Connect(..) = %s, want = %s", err, want) } }) } @@ -4384,7 +4512,7 @@ func TestPathMTUDiscovery(t *testing.T) { } if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } receivePackets := func(c *context.Context, sizes []int, which int, seqNum uint32) []byte { @@ -4487,7 +4615,7 @@ func TestStackSetCongestionControl(t *testing.T) { var oldCC tcpip.CongestionControlOption if err := s.TransportProtocolOption(tcp.ProtocolNumber, &oldCC); err != nil { - t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &oldCC, err) + t.Fatalf("s.TransportProtocolOption(%v, %v) = %s", tcp.ProtocolNumber, &oldCC, err) } if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != tc.err { @@ -4574,12 +4702,12 @@ func TestEndpointSetCongestionControl(t *testing.T) { var err *tcpip.Error c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } var oldCC tcpip.CongestionControlOption if err := c.EP.GetSockOpt(&oldCC); err != nil { - t.Fatalf("c.EP.SockOpt(%v) = %v", &oldCC, err) + t.Fatalf("c.EP.SockOpt(%v) = %s", &oldCC, err) } if connected { @@ -4587,12 +4715,12 @@ func TestEndpointSetCongestionControl(t *testing.T) { } if err := c.EP.SetSockOpt(tc.cc); err != tc.err { - t.Fatalf("c.EP.SetSockOpt(%v) = %v, want %v", tc.cc, err, tc.err) + t.Fatalf("c.EP.SetSockOpt(%v) = %s, want %s", tc.cc, err, tc.err) } var cc tcpip.CongestionControlOption if err := c.EP.GetSockOpt(&cc); err != nil { - t.Fatalf("c.EP.SockOpt(%v) = %v", &cc, err) + t.Fatalf("c.EP.SockOpt(%v) = %s", &cc, err) } got, want := cc, oldCC @@ -4615,7 +4743,7 @@ func enableCUBIC(t *testing.T, c *context.Context) { t.Helper() opt := tcpip.CongestionControlOption("cubic") if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil { - t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, %v = %v", opt, err) + t.Fatalf("c.s.SetTransportProtocolOption(tcp.ProtocolNumber, %s = %s", opt, err) } } @@ -4657,14 +4785,14 @@ func TestKeepalive(t *testing.T) { // Check that 
the connection is still alive. if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } // Send some data and wait before ACKing it. Keepalives should be disabled // during this period. view := buffer.NewView(3) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } next := uint32(c.IRS) + 1 @@ -4744,15 +4872,18 @@ func TestKeepalive(t *testing.T) { ) if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 { - t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %v, want = 1", got) + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %d, want = 1", got) } if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrTimeout) } if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) + } + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) } } @@ -4854,19 +4985,19 @@ func TestListenBacklogFull(t *testing.T) { var err *tcpip.Error c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } // Bind to wildcard. if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } // Test acceptance. // Start listening. listenBacklog := 2 if err := c.EP.Listen(listenBacklog); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } for i := 0; i < listenBacklog; i++ { @@ -4899,7 +5030,7 @@ func TestListenBacklogFull(t *testing.T) { case <-ch: _, _, err = c.EP.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -4928,7 +5059,7 @@ func TestListenBacklogFull(t *testing.T) { case <-ch: newEP, _, err = c.EP.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -4942,7 +5073,7 @@ func TestListenBacklogFull(t *testing.T) { b := c.GetPacket() tcp := header.TCP(header.IPv4(b).Payload()) if string(tcp.Payload()) != data { - t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data) + t.Fatalf("unexpected data: got %s, want %s", string(tcp.Payload()), data) } } @@ -5162,19 +5293,19 @@ func TestListenSynRcvdQueueFull(t *testing.T) { var err *tcpip.Error c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } // Bind to wildcard. if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } // Test acceptance. // Start listening. 
listenBacklog := 1 if err := c.EP.Listen(listenBacklog); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } // Send two SYN's the first one should get a SYN-ACK, the @@ -5240,7 +5371,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) { case <-ch: newEP, _, err = c.EP.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -5254,7 +5385,7 @@ func TestListenSynRcvdQueueFull(t *testing.T) { pkt := c.GetPacket() tcp = header.TCP(header.IPv4(pkt).Payload()) if string(tcp.Payload()) != data { - t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data) + t.Fatalf("unexpected data: got %s, want %s", string(tcp.Payload()), data) } } @@ -5316,7 +5447,7 @@ func TestListenBacklogFullSynCookieInUse(t *testing.T) { case <-ch: _, _, err = c.EP.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -5450,7 +5581,7 @@ func TestSynRcvdBadSeqNumber(t *testing.T) { pkt := c.GetPacket() tcpHdr = header.TCP(header.IPv4(pkt).Payload()) if string(tcpHdr.Payload()) != data { - t.Fatalf("Unexpected data: got %s, want %s", string(tcpHdr.Payload()), data) + t.Fatalf("unexpected data: got %s, want %s", string(tcpHdr.Payload()), data) } } @@ -5460,20 +5591,20 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) { ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } c.EP = ep if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } if err := c.EP.Listen(1); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } if got, want := tcp.EndpointState(c.EP.State()), tcp.StateListen; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } stats := c.Stack().Stats() @@ -5494,7 +5625,7 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) { case <-ch: _, _, err = c.EP.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -5503,7 +5634,7 @@ func TestPassiveConnectionAttemptIncrement(t *testing.T) { } if got := stats.TCP.PassiveConnectionOpenings.Value(); got != want { - t.Errorf("got stats.TCP.PassiveConnectionOpenings.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.PassiveConnectionOpenings.Value() = %d, want = %d", got, want) } } @@ -5514,14 +5645,14 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { stats := c.Stack().Stats() ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } c.EP = ep if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if err := c.EP.Listen(1); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } 
srcPort := uint16(context.TestPort) @@ -5546,10 +5677,10 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { time.Sleep(50 * time.Millisecond) if got := stats.TCP.ListenOverflowSynDrop.Value(); got != want { - t.Errorf("got stats.TCP.ListenOverflowSynDrop.Value() = %v, want = %v", got, want) + t.Errorf("got stats.TCP.ListenOverflowSynDrop.Value() = %d, want = %d", got, want) } if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.ListenOverflowSynDrop.Value(); got != want { - t.Errorf("got EP stats Stats.ReceiveErrors.ListenOverflowSynDrop = %v, want = %v", got, want) + t.Errorf("got EP stats Stats.ReceiveErrors.ListenOverflowSynDrop = %d, want = %d", got, want) } we, ch := waiter.NewChannelEntry(nil) @@ -5564,7 +5695,7 @@ func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) { case <-ch: _, _, err = c.EP.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -5579,28 +5710,28 @@ func TestEndpointBindListenAcceptState(t *testing.T) { wq := &waiter.Queue{} ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil { - t.Fatalf("Bind failed: %v", err) + t.Fatalf("Bind failed: %s", err) } if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected { - t.Errorf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrNotConnected) + t.Errorf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrNotConnected) } if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 { - t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %v want %v", got, 1) + t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %d want %d", got, 1) } if err := ep.Listen(10); err != nil { - t.Fatalf("Listen failed: %v", err) + t.Fatalf("Listen failed: %s", err) } if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } c.PassiveConnectWithOptions(100, 5, header.TCPSynOptions{MSS: defaultIPv4MSS}) @@ -5617,7 +5748,7 @@ func TestEndpointBindListenAcceptState(t *testing.T) { case <-ch: aep, _, err = ep.Accept() if err != nil { - t.Fatalf("Accept failed: %v", err) + t.Fatalf("Accept failed: %s", err) } case <-time.After(1 * time.Second): @@ -5625,25 +5756,25 @@ func TestEndpointBindListenAcceptState(t *testing.T) { } } if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } if err := aep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAlreadyConnected { - t.Errorf("Unexpected error attempting to call connect on an established endpoint, got: %v, want: %v", err, tcpip.ErrAlreadyConnected) + t.Errorf("unexpected error attempting to call connect on an established endpoint, got: %s, want: %s", err, tcpip.ErrAlreadyConnected) } // Listening endpoint remains in listen state. 
if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } ep.Close() // Give worker goroutines time to receive the close notification. time.Sleep(1 * time.Second) if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } // Accepted endpoint remains open when the listen endpoint is closed. if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want { - t.Errorf("Unexpected endpoint state: want %v, got %v", want, got) + t.Errorf("unexpected endpoint state: want %s, got %s", want, got) } } @@ -5663,13 +5794,13 @@ func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) { // the segment queue holding unprocessed packets is limited to 500. const receiveBufferSize = 80 << 10 // 80KB. const maxReceiveBufferSize = receiveBufferSize * 10 - if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, receiveBufferSize, maxReceiveBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil { + t.Fatalf("SetTransportProtocolOption failed: %s", err) } // Enable auto-tuning. if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + t.Fatalf("SetTransportProtocolOption failed: %s", err) } // Change the expected window scale to match the value needed for the // maximum buffer size defined above. @@ -5784,13 +5915,13 @@ func TestReceiveBufferAutoTuning(t *testing.T) { // the segment queue holding unprocessed packets is limited to 300. const receiveBufferSize = 80 << 10 // 80KB. const maxReceiveBufferSize = receiveBufferSize * 10 - if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, receiveBufferSize, maxReceiveBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil { + t.Fatalf("SetTransportProtocolOption failed: %s", err) } // Enable auto-tuning. if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + t.Fatalf("SetTransportProtocolOption failed: %s", err) } // Change the expected window scale to match the value needed for the // maximum buffer size used by stack. 
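The receive-buffer auto-tuning tests above configure the stack in two steps: set the buffer bounds, then enable moderation. A condensed sketch of that setup using the same calls; the 80 KiB / 800 KiB figures are the test's own, not required values:

    import (
        "gvisor.dev/gvisor/pkg/tcpip"
        "gvisor.dev/gvisor/pkg/tcpip/stack"
        "gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
    )

    // enableRcvBufAutoTuning mirrors the auto-tuning test setup above.
    func enableRcvBufAutoTuning(stk *stack.Stack) *tcpip.Error {
        const rcvBuf = 80 << 10 // 80KB, as in the tests above.
        opt := tcp.ReceiveBufferSizeOption{Min: 1, Default: rcvBuf, Max: rcvBuf * 10}
        if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
            return err
        }
        return stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true))
    }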
@@ -5935,7 +6066,7 @@ func TestDelayEnabled(t *testing.T) { c := context.New(t, defaultMTU) defer c.Cleanup() if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, v.delayEnabled); err != nil { - t.Fatalf("SetTransportProtocolOption(tcp, %t) failed: %v", v.delayEnabled, err) + t.Fatalf("SetTransportProtocolOption(tcp, %t) failed: %s", v.delayEnabled, err) } checkDelayOption(t, c, v.delayEnabled, v.wantDelayOption) } @@ -5946,7 +6077,7 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.Del var gotDelayEnabled tcp.DelayEnabled if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &gotDelayEnabled); err != nil { - t.Fatalf("TransportProtocolOption(tcp, &gotDelayEnabled) failed: %v", err) + t.Fatalf("TransportProtocolOption(tcp, &gotDelayEnabled) failed: %s", err) } if gotDelayEnabled != wantDelayEnabled { t.Errorf("TransportProtocolOption(tcp, &gotDelayEnabled) got %t, want %t", gotDelayEnabled, wantDelayEnabled) @@ -5954,7 +6085,7 @@ func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.Del ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, new(waiter.Queue)) if err != nil { - t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %v", err) + t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %s", err) } gotDelayOption, err := ep.GetSockOptBool(tcpip.DelayOption) if err != nil { @@ -5981,7 +6112,7 @@ func TestTCPLingerTimeout(t *testing.T) { {"InRangeLingerTimeout", 10 * time.Second, 10 * time.Second}, // Values > stack's TCPLingerTimeout are capped to the stack's // value. Defaults to tcp.DefaultTCPLingerTimeout(60 seconds) - {"AboveMaxLingerTimeout", 65 * time.Second, 60 * time.Second}, + {"AboveMaxLingerTimeout", 125 * time.Second, 120 * time.Second}, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { @@ -6515,10 +6646,10 @@ func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) { checker.TCPFlags(header.TCPFlagRst))) if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want { - t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %v, want = %v", got, want) + t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %d, want = %d", got, want) } if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 { - t.Errorf("got stats.TCP.CurrentEstablished.Value() = %v, want = 0", got) + t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got) } } @@ -6715,7 +6846,7 @@ func TestTCPUserTimeout(t *testing.T) { // Send some data and wait before ACKing it. 
view := buffer.NewView(3) if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil { - t.Fatalf("Write failed: %v", err) + t.Fatalf("Write failed: %s", err) } next := uint32(c.IRS) + 1 @@ -6765,11 +6896,14 @@ func TestTCPUserTimeout(t *testing.T) { ) if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrTimeout) } if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want { - t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want) + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %d, want = %d", got, want) + } + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) } } @@ -6796,7 +6930,7 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { // Check that the connection is still alive. if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrWouldBlock) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock) } // Now receive 1 keepalives, but don't ACK it. @@ -6837,10 +6971,13 @@ func TestKeepaliveWithUserTimeout(t *testing.T) { ) if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout { - t.Fatalf("got c.EP.Read(nil) = %v, want = %v", err, tcpip.ErrTimeout) + t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrTimeout) } if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want { - t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %v, want = %v", got, want) + t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %d, want = %d", got, want) + } + if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 { + t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got) } } @@ -6896,11 +7033,11 @@ func TestIncreaseWindowOnReceive(t *testing.T) { // ack should be sent in response to that. The window was not // zero, but it grew to larger than MSS. if _, _, err := c.EP.Read(nil); err != nil { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } if _, _, err := c.EP.Read(nil); err != nil { - t.Fatalf("Read failed: %v", err) + t.Fatalf("Read failed: %s", err) } // After reading two packets, we surely crossed MSS. See the ack: @@ -6997,13 +7134,13 @@ func TestTCPDeferAccept(t *testing.T) { const tcpDeferAccept = 1 * time.Second if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil { - t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err) + t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err) } irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock { - t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock) + t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock) } // Send data. This should result in an acceptable endpoint. 
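TestTCPDeferAccept above arms a 1-second defer-accept interval on the listener, so Accept only yields the new endpoint once data arrives (or, in the timeout variant, once the interval lapses and the handshake is completed anyway). A one-call sketch of arming the option, assuming only the SetSockOpt call shown in the diff:

    import (
        "time"

        "gvisor.dev/gvisor/pkg/tcpip"
    )

    // setDeferAccept delays completion of incoming connections until the
    // peer sends data or d elapses, as the defer-accept tests exercise.
    func setDeferAccept(ep tcpip.Endpoint, d time.Duration) *tcpip.Error {
        return ep.SetSockOpt(tcpip.TCPDeferAcceptOption(d))
    }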
@@ -7026,7 +7163,7 @@ func TestTCPDeferAccept(t *testing.T) { time.Sleep(50 * time.Millisecond) aep, _, err := c.EP.Accept() if err != nil { - t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err) + t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err) } aep.Close() @@ -7054,13 +7191,13 @@ func TestTCPDeferAcceptTimeout(t *testing.T) { const tcpDeferAccept = 1 * time.Second if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil { - t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %v", tcpDeferAccept, err) + t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err) } irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock { - t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: %s", err, tcpip.ErrWouldBlock) + t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock) } // Sleep for a little of the tcpDeferAccept timeout. @@ -7094,7 +7231,7 @@ func TestTCPDeferAcceptTimeout(t *testing.T) { time.Sleep(50 * time.Millisecond) aep, _, err := c.EP.Accept() if err != nil { - t.Fatalf("c.EP.Accept() returned unexpected error got: %v, want: nil", err) + t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err) } aep.Close() @@ -7160,3 +7297,53 @@ func TestResetDuringClose(t *testing.T) { wg.Wait() } + +func TestStackTimeWaitReuse(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + s := c.Stack() + var twReuse tcpip.TCPTimeWaitReuseOption + if err := s.TransportProtocolOption(tcp.ProtocolNumber, &twReuse); err != nil { + t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &twReuse, err) + } + if got, want := twReuse, tcpip.TCPTimeWaitReuseLoopbackOnly; got != want { + t.Fatalf("got tcpip.TCPTimeWaitReuseOption: %v, want: %v", got, want) + } +} + +func TestSetStackTimeWaitReuse(t *testing.T) { + c := context.New(t, defaultMTU) + defer c.Cleanup() + + s := c.Stack() + testCases := []struct { + v int + err *tcpip.Error + }{ + {int(tcpip.TCPTimeWaitReuseDisabled), nil}, + {int(tcpip.TCPTimeWaitReuseGlobal), nil}, + {int(tcpip.TCPTimeWaitReuseLoopbackOnly), nil}, + {int(tcpip.TCPTimeWaitReuseLoopbackOnly) + 1, tcpip.ErrInvalidOptionValue}, + {int(tcpip.TCPTimeWaitReuseDisabled) - 1, tcpip.ErrInvalidOptionValue}, + } + + for _, tc := range testCases { + err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitReuseOption(tc.v)) + if got, want := err, tc.err; got != want { + t.Fatalf("s.TransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.v, err, tc.err) + } + if tc.err != nil { + continue + } + + var twReuse tcpip.TCPTimeWaitReuseOption + if err := s.TransportProtocolOption(tcp.ProtocolNumber, &twReuse); err != nil { + t.Fatalf("s.TransportProtocolOption(%v, %v) = %v, want nil", tcp.ProtocolNumber, &twReuse, err) + } + + if got, want := twReuse, tcpip.TCPTimeWaitReuseOption(tc.v); got != want { + t.Fatalf("got tcpip.TCPTimeWaitReuseOption: %v, want: %v", got, want) + } + } +} diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go index 7b1d72cf4..927bc71e0 100644 --- a/pkg/tcpip/transport/tcp/testing/context/context.go +++ b/pkg/tcpip/transport/tcp/testing/context/context.go @@ -143,13 +143,15 @@ func New(t *testing.T, mtu uint32) *Context { TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()}, }) + const 
sendBufferSize = 1 << 20 // 1 MiB + const recvBufferSize = 1 << 20 // 1 MiB // Allow minimum send/receive buffer sizes to be 1 during tests. - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{1, tcp.DefaultSendBufferSize, 10 * tcp.DefaultSendBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 1, Default: sendBufferSize, Max: 10 * sendBufferSize}); err != nil { + t.Fatalf("SetTransportProtocolOption failed: %s", err) } - if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{1, tcp.DefaultReceiveBufferSize, 10 * tcp.DefaultReceiveBufferSize}); err != nil { - t.Fatalf("SetTransportProtocolOption failed: %v", err) + if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: recvBufferSize, Max: 10 * recvBufferSize}); err != nil { + t.Fatalf("SetTransportProtocolOption failed: %s", err) } // Increase minimum RTO in tests to avoid test flakes due to early @@ -202,7 +204,7 @@ func New(t *testing.T, mtu uint32) *Context { t: t, s: s, linkEP: ep, - WindowScale: uint8(tcp.FindWndScale(tcp.DefaultReceiveBufferSize)), + WindowScale: uint8(tcp.FindWndScale(recvBufferSize)), } } @@ -255,8 +257,8 @@ func (c *Context) GetPacket() []byte { c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber) } - hdr := p.Pkt.Header.View() - b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...) + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) + b := vv.ToView() if p.GSO != nil && p.GSO.L3HdrLen != header.IPv4MinimumSize { c.t.Errorf("L3HdrLen %v (expected %v)", p.GSO.L3HdrLen, header.IPv4MinimumSize) @@ -282,8 +284,8 @@ func (c *Context) GetPacketNonBlocking() []byte { c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber) } - hdr := p.Pkt.Header.View() - b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...) + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) + b := vv.ToView() checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr)) return b @@ -316,9 +318,10 @@ func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byt copy(icmp[header.ICMPv4PayloadOffset:], p2) // Inject packet. - c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), }) + c.linkEP.InjectInbound(ipv4.ProtocolNumber, pkt) } // BuildSegment builds a TCP segment based on the given Headers and payload. @@ -372,26 +375,29 @@ func (c *Context) BuildSegmentWithAddrs(payload []byte, h *Headers, src, dst tcp // SendSegment sends a TCP segment that has already been built and written to a // buffer.VectorisedView. func (c *Context) SendSegment(s buffer.VectorisedView) { - c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: s, }) + c.linkEP.InjectInbound(ipv4.ProtocolNumber, pkt) } // SendPacket builds and sends a TCP segment(with the provided payload & TCP // headers) in an IPv4 packet via the link layer endpoint. 
func (c *Context) SendPacket(payload []byte, h *Headers) { - c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: c.BuildSegment(payload, h), }) + c.linkEP.InjectInbound(ipv4.ProtocolNumber, pkt) } // SendPacketWithAddrs builds and sends a TCP segment(with the provided payload // & TCPheaders) in an IPv4 packet via the link layer endpoint using the // provided source and destination IPv4 addresses. func (c *Context) SendPacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) { - c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: c.BuildSegmentWithAddrs(payload, h, src, dst), }) + c.linkEP.InjectInbound(ipv4.ProtocolNumber, pkt) } // SendAck sends an ACK packet. @@ -512,9 +518,8 @@ func (c *Context) GetV6Packet() []byte { if p.Proto != ipv6.ProtocolNumber { c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv6.ProtocolNumber) } - b := make([]byte, p.Pkt.Header.UsedLength()+p.Pkt.Data.Size()) - copy(b, p.Pkt.Header.View()) - copy(b[p.Pkt.Header.UsedLength():], p.Pkt.Data.ToView()) + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) + b := vv.ToView() checker.IPv6(c.t, b, checker.SrcAddr(StackV6Addr), checker.DstAddr(TestV6Addr)) return b @@ -564,9 +569,10 @@ func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcp t.SetChecksum(^t.CalculateChecksum(xsum)) // Inject packet. - c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{ + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ Data: buf.ToVectorisedView(), }) + c.linkEP.InjectInbound(ipv6.ProtocolNumber, pkt) } // CreateConnected creates a connected TCP endpoint. diff --git a/pkg/tcpip/transport/tcp/timer.go b/pkg/tcpip/transport/tcp/timer.go index c70525f27..7981d469b 100644 --- a/pkg/tcpip/transport/tcp/timer.go +++ b/pkg/tcpip/transport/tcp/timer.go @@ -85,6 +85,7 @@ func (t *timer) init(w *sleep.Waker) { // cleanup frees all resources associated with the timer. func (t *timer) cleanup() { t.timer.Stop() + *t = timer{} } // checkExpiration checks if the given timer has actually expired, it should be diff --git a/pkg/tcpip/transport/tcp/timer_test.go b/pkg/tcpip/transport/tcp/timer_test.go new file mode 100644 index 000000000..dbd6dff54 --- /dev/null +++ b/pkg/tcpip/transport/tcp/timer_test.go @@ -0,0 +1,47 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tcp + +import ( + "testing" + "time" + + "gvisor.dev/gvisor/pkg/sleep" +) + +func TestCleanup(t *testing.T) { + const ( + timerDurationSeconds = 2 + isAssertedTimeoutSeconds = timerDurationSeconds + 1 + ) + + tmr := timer{} + w := sleep.Waker{} + tmr.init(&w) + tmr.enable(timerDurationSeconds * time.Second) + tmr.cleanup() + + if want := (timer{}); tmr != want { + t.Errorf("got tmr = %+v, want = %+v", tmr, want) + } + + // The waker should not be asserted. 
+ for i := 0; i < isAssertedTimeoutSeconds; i++ { + time.Sleep(time.Second) + if w.IsAsserted() { + t.Fatalf("waker asserted unexpectedly") + } + } +} diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go index 12bc1b5b5..558b06df0 100644 --- a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go +++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go @@ -106,6 +106,11 @@ func (t *TCB) UpdateStateOutbound(tcp header.TCP) Result { return st } +// State returns the current state of the TCB. +func (t *TCB) State() Result { + return t.state +} + // IsAlive returns true as long as the connection is established(Alive) // or connecting state. func (t *TCB) IsAlive() bool { diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go index 647b2067a..73608783c 100644 --- a/pkg/tcpip/transport/udp/endpoint.go +++ b/pkg/tcpip/transport/udp/endpoint.go @@ -15,6 +15,9 @@ package udp import ( + "fmt" + + "gvisor.dev/gvisor/pkg/sleep" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/tcpip" "gvisor.dev/gvisor/pkg/tcpip/buffer" @@ -93,6 +96,7 @@ type endpoint struct { // The following fields are protected by the mu mutex. mu sync.RWMutex `state:"nosave"` sndBufSize int + sndBufSizeMax int state EndpointState route stack.Route `state:"manual"` dstPort uint16 @@ -102,9 +106,10 @@ type endpoint struct { multicastAddr tcpip.Address multicastNICID tcpip.NICID multicastLoop bool - reusePort bool + portFlags ports.Flags bindToDevice tcpip.NICID broadcast bool + noChecksum bool lastErrorMu sync.Mutex `state:"nosave"` lastError *tcpip.Error `state:".(string)"` @@ -158,7 +163,7 @@ type multicastMembership struct { } func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { - return &endpoint{ + e := &endpoint{ stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{ NetProto: netProto, @@ -180,10 +185,23 @@ func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQue multicastTTL: 1, multicastLoop: true, rcvBufSizeMax: 32 * 1024, - sndBufSize: 32 * 1024, + sndBufSizeMax: 32 * 1024, state: StateInitial, uniqueID: s.UniqueID(), } + + // Override with stack defaults. + var ss stack.SendBufferSizeOption + if err := s.Option(&ss); err == nil { + e.sndBufSizeMax = ss.Default + } + + var rs stack.ReceiveBufferSizeOption + if err := s.Option(&rs); err == nil { + e.rcvBufSizeMax = rs.Default + } + + return e } // UniqueID implements stack.TransportEndpoint.UniqueID. @@ -213,8 +231,8 @@ func (e *endpoint) Close() { switch e.state { case StateBound, StateConnected: - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice) - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) + e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{}) e.boundBindToDevice = 0 e.boundPortFlags = ports.Flags{} } @@ -247,11 +265,6 @@ func (e *endpoint) Close() { // ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. func (e *endpoint) ModerateRecvBuf(copied int) {} -// IPTables implements tcpip.Endpoint.IPTables. 
-func (e *endpoint) IPTables() (stack.IPTables, error) { - return e.stack.IPTables(), nil -} - // Read reads data from the endpoint. This method does not block if // there is no data pending. func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) { @@ -430,24 +443,33 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c } var route *stack.Route + var resolve func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) var dstPort uint16 if to == nil { route = &e.route dstPort = e.dstPort - - if route.IsResolutionRequired() { - // Promote lock to exclusive if using a shared route, given that it may need to - // change in Route.Resolve() call below. + resolve = func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) { + // Promote lock to exclusive if using a shared route, given that it may + // need to change in Route.Resolve() call below. e.mu.RUnlock() - defer e.mu.RLock() - e.mu.Lock() - defer e.mu.Unlock() // Recheck state after lock was re-acquired. if e.state != StateConnected { - return 0, nil, tcpip.ErrInvalidEndpointState + err = tcpip.ErrInvalidEndpointState + } + if err == nil && route.IsResolutionRequired() { + ch, err = route.Resolve(waker) } + + e.mu.Unlock() + e.mu.RLock() + + // Recheck state after lock was re-acquired. + if e.state != StateConnected { + err = tcpip.ErrInvalidEndpointState + } + return } } else { // Reject destination address if it goes through a different @@ -461,10 +483,6 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c nicID = e.BindNICID } - if to.Addr == header.IPv4Broadcast && !e.broadcast { - return 0, nil, tcpip.ErrBroadcastDisabled - } - dst, netProto, err := e.checkV4MappedLocked(*to) if err != nil { return 0, nil, err @@ -478,10 +496,15 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c route = &r dstPort = dst.Port + resolve = route.Resolve + } + + if !e.broadcast && route.IsOutboundBroadcast() { + return 0, nil, tcpip.ErrBroadcastDisabled } if route.IsResolutionRequired() { - if ch, err := route.Resolve(nil); err != nil { + if ch, err := resolve(nil); err != nil { if err == tcpip.ErrWouldBlock { return 0, ch, tcpip.ErrNoLinkAddress } @@ -507,7 +530,7 @@ func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-c useDefaultTTL = false } - if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner); err != nil { + if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil { return 0, nil, err } return int64(len(v)), nil, nil @@ -531,6 +554,11 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { e.multicastLoop = v e.mu.Unlock() + case tcpip.NoChecksumOption: + e.mu.Lock() + e.noChecksum = v + e.mu.Unlock() + case tcpip.ReceiveTOSOption: e.mu.Lock() e.receiveTOS = v @@ -552,10 +580,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { e.mu.Unlock() case tcpip.ReuseAddressOption: + e.mu.Lock() + e.portFlags.MostRecent = v + e.mu.Unlock() case tcpip.ReusePortOption: e.mu.Lock() - e.reusePort = v + e.portFlags.LoadBalanced = v e.mu.Unlock() case tcpip.V6OnlyOption: @@ -581,6 +612,13 @@ func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error { // SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. 
func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { switch opt { + case tcpip.MTUDiscoverOption: + // Return not supported if the value is not disabling path + // MTU discovery. + if v != tcpip.PMTUDiscoveryDont { + return tcpip.ErrNotSupported + } + case tcpip.MulticastTTLOption: e.mu.Lock() e.multicastTTL = uint8(v) @@ -602,8 +640,43 @@ func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error { e.mu.Unlock() case tcpip.ReceiveBufferSizeOption: + // Make sure the receive buffer size is within the min and max + // allowed. + var rs stack.ReceiveBufferSizeOption + if err := e.stack.Option(&rs); err != nil { + panic(fmt.Sprintf("e.stack.Option(%#v) = %s", rs, err)) + } + + if v < rs.Min { + v = rs.Min + } + if v > rs.Max { + v = rs.Max + } + + e.mu.Lock() + e.rcvBufSizeMax = v + e.mu.Unlock() + return nil case tcpip.SendBufferSizeOption: + // Make sure the send buffer size is within the min and max + // allowed. + var ss stack.SendBufferSizeOption + if err := e.stack.Option(&ss); err != nil { + panic(fmt.Sprintf("e.stack.Option(%#v) = %s", ss, err)) + } + + if v < ss.Min { + v = ss.Min + } + if v > ss.Max { + v = ss.Max + } + e.mu.Lock() + e.sndBufSizeMax = v + e.mu.Unlock() + return nil } return nil @@ -743,6 +816,9 @@ func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error { e.mu.Lock() e.bindToDevice = id e.mu.Unlock() + + case tcpip.SocketDetachFilterOption: + return nil } return nil } @@ -765,6 +841,12 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { e.mu.RUnlock() return v, nil + case tcpip.NoChecksumOption: + e.mu.RLock() + v := e.noChecksum + e.mu.RUnlock() + return v, nil + case tcpip.ReceiveTOSOption: e.mu.RLock() v := e.receiveTOS @@ -789,11 +871,15 @@ func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) { return v, nil case tcpip.ReuseAddressOption: - return false, nil + e.mu.RLock() + v := e.portFlags.MostRecent + e.mu.RUnlock() + + return v, nil case tcpip.ReusePortOption: e.mu.RLock() - v := e.reusePort + v := e.portFlags.LoadBalanced e.mu.RUnlock() return v, nil @@ -830,6 +916,10 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { e.mu.RUnlock() return v, nil + case tcpip.MTUDiscoverOption: + // The only supported setting is path MTU discovery disabled. + return tcpip.PMTUDiscoveryDont, nil + case tcpip.MulticastTTLOption: e.mu.Lock() v := int(e.multicastTTL) @@ -848,7 +938,7 @@ func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) { case tcpip.SendBufferSizeOption: e.mu.Lock() - v := e.sndBufSize + v := e.sndBufSizeMax e.mu.Unlock() return v, nil @@ -895,22 +985,29 @@ func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { // sendUDP sends a UDP segment via the provided network endpoint and under the // provided identity. -func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner) *tcpip.Error { - // Allocate a buffer for the UDP header. - hdr := buffer.NewPrependable(header.UDPMinimumSize + int(r.MaxHeaderLength())) +func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner, noChecksum bool) *tcpip.Error { + pkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: header.UDPMinimumSize + int(r.MaxHeaderLength()), + Data: data, + }) + pkt.Owner = owner - // Initialize the header. 
- udp := header.UDP(hdr.Prepend(header.UDPMinimumSize)) + // Initialize the UDP header. + udp := header.UDP(pkt.TransportHeader().Push(header.UDPMinimumSize)) - length := uint16(hdr.UsedLength() + data.Size()) + length := uint16(pkt.Size()) udp.Encode(&header.UDPFields{ SrcPort: localPort, DstPort: remotePort, Length: length, }) - // Only calculate the checksum if offloading isn't supported. - if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 { + // Set the checksum field unless TX checksum offload is enabled. + // On IPv4, UDP checksum is optional, and a zero value indicates the + // transmitter skipped the checksum generation (RFC768). + // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). + if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 && + (!noChecksum || r.NetProto == header.IPv6ProtocolNumber) { xsum := r.PseudoHeaderChecksum(ProtocolNumber, length) for _, v := range data.Views() { xsum = header.Checksum(v, xsum) @@ -921,12 +1018,11 @@ func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort u if useDefaultTTL { ttl = r.DefaultTTL() } - if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: ttl, TOS: tos}, stack.PacketBuffer{ - Header: hdr, - Data: data, - TransportHeader: buffer.View(udp), - Owner: owner, - }); err != nil { + if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{ + Protocol: ProtocolNumber, + TTL: ttl, + TOS: tos, + }, pkt); err != nil { r.Stats().UDP.PacketSendErrors.Increment() return err } @@ -958,6 +1054,11 @@ func (e *endpoint) Disconnect() *tcpip.Error { id stack.TransportEndpointID btd tcpip.NICID ) + + // We change this value below and we need the old value to unregister + // the endpoint. + boundPortFlags := e.boundPortFlags + // Exclude ephemerally bound endpoints. if e.BindNICID != 0 || e.ID.LocalAddress == "" { var err *tcpip.Error @@ -970,16 +1071,17 @@ func (e *endpoint) Disconnect() *tcpip.Error { return err } e.state = StateBound + boundPortFlags = e.boundPortFlags } else { if e.ID.LocalPort != 0 { // Release the ephemeral port. - e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice) + e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{}) e.boundPortFlags = ports.Flags{} } e.state = StateInitial } - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice) + e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice) e.ID = id e.boundBindToDevice = btd e.route.Release() @@ -1051,6 +1153,8 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { } } + oldPortFlags := e.boundPortFlags + id, btd, err := e.registerWithStack(nicID, netProtos, id) if err != nil { return err @@ -1058,7 +1162,7 @@ func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { // Remove the old registration. 
if e.ID.LocalPort != 0 { - e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundBindToDevice) + e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, oldPortFlags, e.boundBindToDevice) } e.ID = id @@ -1122,22 +1226,17 @@ func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) { if e.ID.LocalPort == 0 { - flags := ports.Flags{ - LoadBalanced: e.reusePort, - // FIXME(b/129164367): Support SO_REUSEADDR. - MostRecent: false, - } - port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, flags, e.bindToDevice) + port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{}) if err != nil { return id, e.bindToDevice, err } - e.boundPortFlags = flags id.LocalPort = port } + e.boundPortFlags = e.portFlags - err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) + err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.boundPortFlags, e.bindToDevice) if err != nil { - e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice) + e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice, tcpip.FullAddress{}) e.boundPortFlags = ports.Flags{} } return id, e.bindToDevice, err @@ -1269,22 +1368,47 @@ func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { // HandlePacket is called by the stack when new packets arrive to this transport // endpoint. -func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) { +func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) { // Get the header then trim it from the view. - hdr, ok := pkt.Data.PullUp(header.UDPMinimumSize) - if !ok || int(header.UDP(hdr).Length()) > pkt.Data.Size() { + hdr := header.UDP(pkt.TransportHeader().View()) + if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize { // Malformed packet. e.stack.Stats().UDP.MalformedPacketsReceived.Increment() e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() return } - pkt.Data.TrimFront(header.UDPMinimumSize) + // Never receive from a multicast address. + if header.IsV4MulticastAddress(id.RemoteAddress) || + header.IsV6MulticastAddress(id.RemoteAddress) { + e.stack.Stats().UDP.InvalidSourceAddress.Increment() + e.stack.Stats().IP.InvalidSourceAddressesReceived.Increment() + e.stats.ReceiveErrors.MalformedPacketsReceived.Increment() + return + } + + // Verify checksum unless RX checksum offload is enabled. + // On IPv4, UDP checksum is optional, and a zero value means + // the transmitter omitted the checksum generation (RFC768). + // On IPv6, UDP checksum is not optional (RFC2460 Section 8.1). + if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 && + (hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) { + xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length()) + for _, v := range pkt.Data.Views() { + xsum = header.Checksum(v, xsum) + } + if hdr.CalculateChecksum(xsum) != 0xffff { + // Checksum Error. 
+ e.stack.Stats().UDP.ChecksumErrors.Increment() + e.stats.ReceiveErrors.ChecksumErrors.Increment() + return + } + } - e.rcvMu.Lock() e.stack.Stats().UDP.PacketsReceived.Increment() e.stats.PacketsReceived.Increment() + e.rcvMu.Lock() // Drop the packet if our buffer is currently full. if !e.rcvReady || e.rcvClosed { e.rcvMu.Unlock() @@ -1317,15 +1441,18 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk // Save any useful information from the network header to the packet. switch r.NetProto { case header.IPv4ProtocolNumber: - packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS() - packet.packetInfo.LocalAddr = r.LocalAddress - packet.packetInfo.DestinationAddr = r.RemoteAddress - packet.packetInfo.NIC = r.NICID() + packet.tos, _ = header.IPv4(pkt.NetworkHeader().View()).TOS() case header.IPv6ProtocolNumber: - packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS() + packet.tos, _ = header.IPv6(pkt.NetworkHeader().View()).TOS() } - packet.timestamp = e.stack.NowNanoseconds() + // TODO(gvisor.dev/issue/3556): r.LocalAddress may be a multicast or broadcast + // address. packetInfo.LocalAddr should hold a unicast address that can be + // used to respond to the incoming packet. + packet.packetInfo.LocalAddr = r.LocalAddress + packet.packetInfo.DestinationAddr = r.LocalAddress + packet.packetInfo.NIC = r.NICID() + packet.timestamp = e.stack.Clock().NowNanoseconds() e.rcvMu.Unlock() @@ -1336,7 +1463,7 @@ func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pk } // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. -func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt stack.PacketBuffer) { +func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) { if typ == stack.ControlPortUnreachable { e.mu.RLock() defer e.mu.RUnlock() diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go index a674ceb68..c67e0ba95 100644 --- a/pkg/tcpip/transport/udp/forwarder.go +++ b/pkg/tcpip/transport/udp/forwarder.go @@ -43,7 +43,7 @@ func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder { // // This function is expected to be passed as an argument to the // stack.SetTransportProtocolHandler function. -func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool { +func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { f.handler(&ForwarderRequest{ stack: f.stack, route: r, @@ -61,7 +61,7 @@ type ForwarderRequest struct { stack *stack.Stack route *stack.Route id stack.TransportEndpointID - pkt stack.PacketBuffer + pkt *stack.PacketBuffer } // ID returns the 4-tuple (src address, src port, dst address, dst port) that @@ -73,7 +73,7 @@ func (r *ForwarderRequest) ID() stack.TransportEndpointID { // CreateEndpoint creates a connected UDP endpoint for the session request. 
func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) { ep := newEndpoint(r.stack, r.route.NetProto, queue) - if err := r.stack.RegisterTransportEndpoint(r.route.NICID(), []tcpip.NetworkProtocolNumber{r.route.NetProto}, ProtocolNumber, r.id, ep, ep.reusePort, ep.bindToDevice); err != nil { + if err := r.stack.RegisterTransportEndpoint(r.route.NICID(), []tcpip.NetworkProtocolNumber{r.route.NetProto}, ProtocolNumber, r.id, ep, ep.portFlags, ep.bindToDevice); err != nil { ep.Close() return nil, err } @@ -82,6 +82,7 @@ func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, ep.route = r.route.Clone() ep.dstPort = r.id.RemotePort ep.RegisterNICID = r.route.NICID() + ep.boundPortFlags = ep.portFlags ep.state = StateConnected diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go index 52af6de22..63d4bed7c 100644 --- a/pkg/tcpip/transport/udp/protocol.go +++ b/pkg/tcpip/transport/udp/protocol.go @@ -32,9 +32,24 @@ import ( const ( // ProtocolNumber is the udp protocol number. ProtocolNumber = header.UDPProtocolNumber + + // MinBufferSize is the smallest size of a receive or send buffer. + MinBufferSize = 4 << 10 // 4KiB bytes. + + // DefaultSendBufferSize is the default size of the send buffer for + // an endpoint. + DefaultSendBufferSize = 32 << 10 // 32KiB + + // DefaultReceiveBufferSize is the default size of the receive buffer + // for an endpoint. + DefaultReceiveBufferSize = 32 << 10 // 32KiB + + // MaxBufferSize is the largest size a receive/send buffer can grow to. + MaxBufferSize = 4 << 20 // 4MiB ) -type protocol struct{} +type protocol struct { +} // Number returns the udp protocol number. func (*protocol) Number() tcpip.TransportProtocolNumber { @@ -66,15 +81,9 @@ func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) { // HandleUnknownDestinationPacket handles packets targeted at this protocol but // that don't match any existing endpoint. -func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt stack.PacketBuffer) bool { - // Get the header then trim it from the view. - h, ok := pkt.Data.PullUp(header.UDPMinimumSize) - if !ok { - // Malformed packet. - r.Stack().Stats().UDP.MalformedPacketsReceived.Increment() - return true - } - if int(header.UDP(h).Length()) > pkt.Data.Size() { +func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool { + hdr := header.UDP(pkt.TransportHeader().View()) + if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize { // Malformed packet. r.Stack().Stats().UDP.MalformedPacketsReceived.Increment() return true @@ -121,7 +130,7 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans } headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize available := int(mtu) - headerLen - payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size() + payloadLen := pkt.NetworkHeader().View().Size() + pkt.TransportHeader().View().Size() + pkt.Data.Size() if payloadLen > available { payloadLen = available } @@ -130,20 +139,21 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans // For example, a raw or packet socket may use what UDP // considers an unreachable destination. Thus we deep copy pkt // to prevent multiple ownership and SR errors. - newNetHeader := append(buffer.View(nil), pkt.NetworkHeader...) 
- payload := newNetHeader.ToVectorisedView() - payload.Append(pkt.Data.ToView().ToVectorisedView()) + newHeader := append(buffer.View(nil), pkt.NetworkHeader().View()...) + newHeader = append(newHeader, pkt.TransportHeader().View()...) + payload := newHeader.ToVectorisedView() + payload.AppendView(pkt.Data.ToView()) payload.CapLength(payloadLen) - hdr := buffer.NewPrependable(headerLen) - pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) - pkt.SetType(header.ICMPv4DstUnreachable) - pkt.SetCode(header.ICMPv4PortUnreachable) - pkt.SetChecksum(header.ICMPv4Checksum(pkt, payload)) - r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: payload, + icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: headerLen, + Data: payload, }) + icmpHdr := header.ICMPv4(icmpPkt.TransportHeader().Push(header.ICMPv4MinimumSize)) + icmpHdr.SetType(header.ICMPv4DstUnreachable) + icmpHdr.SetCode(header.ICMPv4PortUnreachable) + icmpHdr.SetChecksum(header.ICMPv4Checksum(icmpHdr, icmpPkt.Data)) + r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, icmpPkt) case header.IPv6AddressSize: if !r.Stack().AllowICMPMessage() { @@ -164,34 +174,35 @@ func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.Trans } headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize available := int(mtu) - headerLen - payloadLen := len(pkt.NetworkHeader) + pkt.Data.Size() + network, transport := pkt.NetworkHeader().View(), pkt.TransportHeader().View() + payloadLen := len(network) + len(transport) + pkt.Data.Size() if payloadLen > available { payloadLen = available } - payload := buffer.NewVectorisedView(len(pkt.NetworkHeader), []buffer.View{pkt.NetworkHeader}) + payload := buffer.NewVectorisedView(len(network)+len(transport), []buffer.View{network, transport}) payload.Append(pkt.Data) payload.CapLength(payloadLen) - hdr := buffer.NewPrependable(headerLen) - pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6DstUnreachableMinimumSize)) - pkt.SetType(header.ICMPv6DstUnreachable) - pkt.SetCode(header.ICMPv6PortUnreachable) - pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, payload)) - r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, stack.PacketBuffer{ - Header: hdr, - Data: payload, + icmpPkt := stack.NewPacketBuffer(stack.PacketBufferOptions{ + ReserveHeaderBytes: headerLen, + Data: payload, }) + icmpHdr := header.ICMPv6(icmpPkt.TransportHeader().Push(header.ICMPv6DstUnreachableMinimumSize)) + icmpHdr.SetType(header.ICMPv6DstUnreachable) + icmpHdr.SetCode(header.ICMPv6PortUnreachable) + icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, r.LocalAddress, r.RemoteAddress, icmpPkt.Data)) + r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, icmpPkt) } return true } // SetOption implements stack.TransportProtocol.SetOption. -func (*protocol) SetOption(option interface{}) *tcpip.Error { +func (p *protocol) SetOption(option interface{}) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } // Option implements stack.TransportProtocol.Option. 
-func (*protocol) Option(option interface{}) *tcpip.Error { +func (p *protocol) Option(option interface{}) *tcpip.Error { return tcpip.ErrUnknownProtocolOption } @@ -201,6 +212,12 @@ func (*protocol) Close() {} // Wait implements stack.TransportProtocol.Wait. func (*protocol) Wait() {} +// Parse implements stack.TransportProtocol.Parse. +func (*protocol) Parse(pkt *stack.PacketBuffer) bool { + _, ok := pkt.TransportHeader().Consume(header.UDPMinimumSize) + return ok +} + // NewProtocol returns a UDP transport protocol. func NewProtocol() stack.TransportProtocol { return &protocol{} diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go index 8acaa607a..71776d6db 100644 --- a/pkg/tcpip/transport/udp/udp_test.go +++ b/pkg/tcpip/transport/udp/udp_test.go @@ -83,16 +83,18 @@ type header4Tuple struct { type testFlow int const ( - unicastV4 testFlow = iota // V4 unicast on a V4 socket - unicastV4in6 // V4-mapped unicast on a V6-dual socket - unicastV6 // V6 unicast on a V6 socket - unicastV6Only // V6 unicast on a V6-only socket - multicastV4 // V4 multicast on a V4 socket - multicastV4in6 // V4-mapped multicast on a V6-dual socket - multicastV6 // V6 multicast on a V6 socket - multicastV6Only // V6 multicast on a V6-only socket - broadcast // V4 broadcast on a V4 socket - broadcastIn6 // V4-mapped broadcast on a V6-dual socket + unicastV4 testFlow = iota // V4 unicast on a V4 socket + unicastV4in6 // V4-mapped unicast on a V6-dual socket + unicastV6 // V6 unicast on a V6 socket + unicastV6Only // V6 unicast on a V6-only socket + multicastV4 // V4 multicast on a V4 socket + multicastV4in6 // V4-mapped multicast on a V6-dual socket + multicastV6 // V6 multicast on a V6 socket + multicastV6Only // V6 multicast on a V6-only socket + broadcast // V4 broadcast on a V4 socket + broadcastIn6 // V4-mapped broadcast on a V6-dual socket + reverseMulticast4 // V4 multicast src. Must fail. + reverseMulticast6 // V6 multicast src. Must fail. ) func (flow testFlow) String() string { @@ -117,6 +119,10 @@ func (flow testFlow) String() string { return "broadcast" case broadcastIn6: return "broadcastIn6" + case reverseMulticast4: + return "reverseMulticast4" + case reverseMulticast6: + return "reverseMulticast6" default: return "unknown" } @@ -168,6 +174,9 @@ func (flow testFlow) header4Tuple(d packetDirection) header4Tuple { h.dstAddr.Addr = multicastV6Addr } } + if flow.isReverseMulticast() { + h.srcAddr.Addr = flow.getMcastAddr() + } return h } @@ -199,9 +208,9 @@ func (flow testFlow) netProto() tcpip.NetworkProtocolNumber { // endpoint for this flow. 
func (flow testFlow) sockProto() tcpip.NetworkProtocolNumber { switch flow { - case unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, multicastV6Only, broadcastIn6: + case unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, multicastV6Only, broadcastIn6, reverseMulticast6: return ipv6.ProtocolNumber - case unicastV4, multicastV4, broadcast: + case unicastV4, multicastV4, broadcast, reverseMulticast4: return ipv4.ProtocolNumber default: panic(fmt.Sprintf("invalid testFlow given: %d", flow)) @@ -224,7 +233,7 @@ func (flow testFlow) isV6Only() bool { switch flow { case unicastV6Only, multicastV6Only: return true - case unicastV4, unicastV4in6, unicastV6, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6: + case unicastV4, unicastV4in6, unicastV6, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6, reverseMulticast4, reverseMulticast6: return false default: panic(fmt.Sprintf("invalid testFlow given: %d", flow)) @@ -235,7 +244,7 @@ func (flow testFlow) isMulticast() bool { switch flow { case multicastV4, multicastV4in6, multicastV6, multicastV6Only: return true - case unicastV4, unicastV4in6, unicastV6, unicastV6Only, broadcast, broadcastIn6: + case unicastV4, unicastV4in6, unicastV6, unicastV6Only, broadcast, broadcastIn6, reverseMulticast4, reverseMulticast6: return false default: panic(fmt.Sprintf("invalid testFlow given: %d", flow)) @@ -246,7 +255,7 @@ func (flow testFlow) isBroadcast() bool { switch flow { case broadcast, broadcastIn6: return true - case unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, multicastV6Only: + case unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, multicastV6Only, reverseMulticast4, reverseMulticast6: return false default: panic(fmt.Sprintf("invalid testFlow given: %d", flow)) @@ -257,13 +266,22 @@ func (flow testFlow) isMapped() bool { switch flow { case unicastV4in6, multicastV4in6, broadcastIn6: return true - case unicastV4, unicastV6, unicastV6Only, multicastV4, multicastV6, multicastV6Only, broadcast: + case unicastV4, unicastV6, unicastV6Only, multicastV4, multicastV6, multicastV6Only, broadcast, reverseMulticast4, reverseMulticast6: return false default: panic(fmt.Sprintf("invalid testFlow given: %d", flow)) } } +func (flow testFlow) isReverseMulticast() bool { + switch flow { + case reverseMulticast4, reverseMulticast6: + return true + default: + return false + } +} + type testContext struct { t *testing.T linkEP *channel.Endpoint @@ -292,15 +310,15 @@ func newDualTestContextWithOptions(t *testing.T, mtu uint32, options stack.Optio wep = sniffer.New(ep) } if err := s.CreateNIC(1, wep); err != nil { - t.Fatalf("CreateNIC failed: %v", err) + t.Fatalf("CreateNIC failed: %s", err) } if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr); err != nil { - t.Fatalf("AddAddress failed: %v", err) + t.Fatalf("AddAddress failed: %s", err) } if err := s.AddAddress(1, ipv6.ProtocolNumber, stackV6Addr); err != nil { - t.Fatalf("AddAddress failed: %v", err) + t.Fatalf("AddAddress failed: %s", err) } s.SetRouteTable([]tcpip.Route{ @@ -370,8 +388,8 @@ func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.Netw c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, flow.netProto()) } - hdr := p.Pkt.Header.View() - b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...) 
+ vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) + b := vv.ToView() h := flow.header4Tuple(outgoing) checkers = append( @@ -391,17 +409,21 @@ func (c *testContext) injectPacket(flow testFlow, payload []byte) { h := flow.header4Tuple(incoming) if flow.isV4() { - c.injectV4Packet(payload, &h, true /* valid */) + buf := c.buildV4Packet(payload, &h) + c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) } else { - c.injectV6Packet(payload, &h, true /* valid */) + buf := c.buildV6Packet(payload, &h) + c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) } } -// injectV6Packet creates a V6 test packet with the given payload and header -// values, and injects it into the link endpoint. valid indicates if the -// caller intends to inject a packet with a valid or an invalid UDP header. -// We can invalidate the header by corrupting the UDP payload length. -func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool) { +// buildV6Packet creates a V6 test packet with the given payload and header +// values in a buffer. +func (c *testContext) buildV6Packet(payload []byte, h *header4Tuple) buffer.View { // Allocate a buffer for data and headers. buf := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload)) payloadStart := len(buf) - len(payload) @@ -420,16 +442,10 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool // Initialize the UDP header. u := header.UDP(buf[header.IPv6MinimumSize:]) - l := uint16(header.UDPMinimumSize + len(payload)) - if !valid { - // Change the UDP payload length to corrupt the header - // as requested by the caller. - l++ - } u.Encode(&header.UDPFields{ SrcPort: h.srcAddr.Port, DstPort: h.dstAddr.Port, - Length: l, + Length: uint16(header.UDPMinimumSize + len(payload)), }) // Calculate the UDP pseudo-header checksum. @@ -439,19 +455,12 @@ func (c *testContext) injectV6Packet(payload []byte, h *header4Tuple, valid bool xsum = header.Checksum(payload, xsum) u.SetChecksum(^u.CalculateChecksum(xsum)) - // Inject packet. - c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.PacketBuffer{ - Data: buf.ToVectorisedView(), - NetworkHeader: buffer.View(ip), - TransportHeader: buffer.View(u), - }) + return buf } -// injectV4Packet creates a V4 test packet with the given payload and header -// values, and injects it into the link endpoint. valid indicates if the -// caller intends to inject a packet with a valid or an invalid UDP header. -// We can invalidate the header by corrupting the UDP payload length. -func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool) { +// buildV4Packet creates a V4 test packet with the given payload and header +// values in a buffer. +func (c *testContext) buildV4Packet(payload []byte, h *header4Tuple) buffer.View { // Allocate a buffer for data and headers. buf := buffer.NewView(header.UDPMinimumSize + header.IPv4MinimumSize + len(payload)) payloadStart := len(buf) - len(payload) @@ -485,13 +494,7 @@ func (c *testContext) injectV4Packet(payload []byte, h *header4Tuple, valid bool xsum = header.Checksum(payload, xsum) u.SetChecksum(^u.CalculateChecksum(xsum)) - // Inject packet. 
- - c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.PacketBuffer{ - Data: buf.ToVectorisedView(), - NetworkHeader: buffer.View(ip), - TransportHeader: buffer.View(u), - }) + return buf } func newPayload() []byte { @@ -513,7 +516,7 @@ func TestBindToDeviceOption(t *testing.T) { ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{}) if err != nil { - t.Fatalf("NewEndpoint failed; %v", err) + t.Fatalf("NewEndpoint failed; %s", err) } defer ep.Close() @@ -647,7 +650,7 @@ func TestBindEphemeralPort(t *testing.T) { c.createEndpoint(ipv6.ProtocolNumber) if err := c.ep.Bind(tcpip.FullAddress{}); err != nil { - t.Fatalf("ep.Bind(...) failed: %v", err) + t.Fatalf("ep.Bind(...) failed: %s", err) } } @@ -658,19 +661,19 @@ func TestBindReservedPort(t *testing.T) { c.createEndpoint(ipv6.ProtocolNumber) if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil { - c.t.Fatalf("Connect failed: %v", err) + c.t.Fatalf("Connect failed: %s", err) } addr, err := c.ep.GetLocalAddress() if err != nil { - t.Fatalf("GetLocalAddress failed: %v", err) + t.Fatalf("GetLocalAddress failed: %s", err) } // We can't bind the address reserved by the connected endpoint above. { ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &c.wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } defer ep.Close() if got, want := ep.Bind(addr), tcpip.ErrPortInUse; got != want { @@ -681,7 +684,7 @@ func TestBindReservedPort(t *testing.T) { func() { ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &c.wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } defer ep.Close() // We can't bind ipv4-any on the port reserved by the connected endpoint @@ -691,7 +694,7 @@ func TestBindReservedPort(t *testing.T) { } // We can bind an ipv4 address on this port, though. if err := ep.Bind(tcpip.FullAddress{Addr: stackAddr, Port: addr.Port}); err != nil { - t.Fatalf("ep.Bind(...) failed: %v", err) + t.Fatalf("ep.Bind(...) failed: %s", err) } }() @@ -701,11 +704,11 @@ func TestBindReservedPort(t *testing.T) { func() { ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &c.wq) if err != nil { - t.Fatalf("NewEndpoint failed: %v", err) + t.Fatalf("NewEndpoint failed: %s", err) } defer ep.Close() if err := ep.Bind(tcpip.FullAddress{Port: addr.Port}); err != nil { - t.Fatalf("ep.Bind(...) failed: %v", err) + t.Fatalf("ep.Bind(...) failed: %s", err) } }() } @@ -718,7 +721,7 @@ func TestV4ReadOnV6(t *testing.T) { // Bind to wildcard. if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } // Test acceptance. @@ -733,7 +736,7 @@ func TestV4ReadOnBoundToV4MappedWildcard(t *testing.T) { // Bind to v4 mapped wildcard. if err := c.ep.Bind(tcpip.FullAddress{Addr: v4MappedWildcardAddr, Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } // Test acceptance. @@ -748,7 +751,7 @@ func TestV4ReadOnBoundToV4Mapped(t *testing.T) { // Bind to local address. if err := c.ep.Bind(tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } // Test acceptance. @@ -763,7 +766,7 @@ func TestV6ReadOnV6(t *testing.T) { // Bind to wildcard. 
if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } // Test acceptance. @@ -800,7 +803,10 @@ func TestV4ReadSelfSource(t *testing.T) { h := unicastV4.header4Tuple(incoming) h.srcAddr = h.dstAddr - c.injectV4Packet(payload, &h, true /* valid */) + buf := c.buildV4Packet(payload, &h) + c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{ + Data: buf.ToVectorisedView(), + })) if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != tt.wantInvalidSource { t.Errorf("c.s.Stats().IP.InvalidSourceAddressesReceived got %d, want %d", got, tt.wantInvalidSource) @@ -821,7 +827,7 @@ func TestV4ReadOnV4(t *testing.T) { // Bind to wildcard. if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } // Test acceptance. @@ -884,6 +890,60 @@ func TestV4ReadOnBoundToBroadcast(t *testing.T) { } } +// TestReadFromMulticast checks that an endpoint will NOT receive a packet +// that was sent with multicast SOURCE address. +func TestReadFromMulticast(t *testing.T) { + for _, flow := range []testFlow{reverseMulticast4, reverseMulticast6} { + t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpointForFlow(flow) + + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { + t.Fatalf("Bind failed: %s", err) + } + testFailingRead(c, flow, false /* expectReadError */) + }) + } +} + +// TestReadFromMulticastStats checks that a discarded packet +// that was sent with a multicast SOURCE address increments +// the correct counters and that a regular packet does not. +func TestReadFromMulticastStats(t *testing.T) { + t.Helper() + for _, flow := range []testFlow{reverseMulticast4, reverseMulticast6, unicastV4} { + t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpointForFlow(flow) + + if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { + t.Fatalf("Bind failed: %s", err) + } + + payload := newPayload() + c.injectPacket(flow, payload) + + var want uint64 = 0 + if flow.isReverseMulticast() { + want = 1 + } + if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != want { + t.Errorf("got stats.IP.InvalidSourceAddressesReceived.Value() = %d, want = %d", got, want) + } + if got := c.s.Stats().UDP.InvalidSourceAddress.Value(); got != want { + t.Errorf("got stats.UDP.InvalidSourceAddress.Value() = %d, want = %d", got, want) + } + if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.MalformedPacketsReceived.Value(); got != want { + t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", got, want) + } + }) + } +} + // TestV4ReadBroadcastOnBoundToWildcard checks that an endpoint can bind to ANY // and receive broadcast and unicast data. func TestV4ReadBroadcastOnBoundToWildcard(t *testing.T) { @@ -959,7 +1019,7 @@ func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...
payload := buffer.View(newPayload()) n, _, err := c.ep.Write(tcpip.SlicePayload(payload), writeOpts) if err != nil { - c.t.Fatalf("Write failed: %v", err) + c.t.Fatalf("Write failed: %s", err) } if n != int64(len(payload)) { c.t.Fatalf("Bad number of bytes written: got %v, want %v", n, len(payload)) @@ -1009,7 +1069,7 @@ func TestDualWriteBoundToWildcard(t *testing.T) { // Bind to wildcard. if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } p := testDualWrite(c) @@ -1026,7 +1086,7 @@ func TestDualWriteConnectedToV6(t *testing.T) { // Connect to v6 address. if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } testWrite(c, unicastV6) @@ -1047,7 +1107,7 @@ func TestDualWriteConnectedToV4Mapped(t *testing.T) { // Connect to v4 mapped address. if err := c.ep.Connect(tcpip.FullAddress{Addr: testV4MappedAddr, Port: testPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } testWrite(c, unicastV4in6) @@ -1074,7 +1134,7 @@ func TestV6WriteOnBoundToV4Mapped(t *testing.T) { // Bind to v4 mapped address. if err := c.ep.Bind(tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } // Write to v6 address. @@ -1089,7 +1149,7 @@ func TestV6WriteOnConnected(t *testing.T) { // Connect to v6 address. if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil { - c.t.Fatalf("Connect failed: %v", err) + c.t.Fatalf("Connect failed: %s", err) } testWriteWithoutDestination(c, unicastV6) @@ -1103,7 +1163,7 @@ func TestV4WriteOnConnected(t *testing.T) { // Connect to v4 mapped address. if err := c.ep.Connect(tcpip.FullAddress{Addr: testV4MappedAddr, Port: testPort}); err != nil { - c.t.Fatalf("Connect failed: %v", err) + c.t.Fatalf("Connect failed: %s", err) } testWriteWithoutDestination(c, unicastV4) @@ -1238,7 +1298,7 @@ func TestReadIncrementsPacketsReceived(t *testing.T) { // Bind to wildcard. if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } testRead(c, unicastV4) @@ -1249,6 +1309,105 @@ func TestReadIncrementsPacketsReceived(t *testing.T) { } } +func TestReadIPPacketInfo(t *testing.T) { + tests := []struct { + name string + proto tcpip.NetworkProtocolNumber + flow testFlow + expectedLocalAddr tcpip.Address + expectedDestAddr tcpip.Address + }{ + { + name: "IPv4 unicast", + proto: header.IPv4ProtocolNumber, + flow: unicastV4, + expectedLocalAddr: stackAddr, + expectedDestAddr: stackAddr, + }, + { + name: "IPv4 multicast", + proto: header.IPv4ProtocolNumber, + flow: multicastV4, + // This should actually be a unicast address assigned to the interface. + // + // TODO(gvisor.dev/issue/3556): This check is validating incorrect + // behaviour. We still include the test so that once the bug is + // resolved, this test will start to fail and the individual tasked + // with fixing this bug knows to also fix this test :). + expectedLocalAddr: multicastAddr, + expectedDestAddr: multicastAddr, + }, + { + name: "IPv4 broadcast", + proto: header.IPv4ProtocolNumber, + flow: broadcast, + // This should actually be a unicast address assigned to the interface. + // + // TODO(gvisor.dev/issue/3556): This check is validating incorrect + // behaviour. 
We still include the test so that once the bug is + // resolved, this test will start to fail and the individual tasked + // with fixing this bug knows to also fix this test :). + expectedLocalAddr: broadcastAddr, + expectedDestAddr: broadcastAddr, + }, + { + name: "IPv6 unicast", + proto: header.IPv6ProtocolNumber, + flow: unicastV6, + expectedLocalAddr: stackV6Addr, + expectedDestAddr: stackV6Addr, + }, + { + name: "IPv6 multicast", + proto: header.IPv6ProtocolNumber, + flow: multicastV6, + // This should actually be a unicast address assigned to the interface. + // + // TODO(gvisor.dev/issue/3556): This check is validating incorrect + // behaviour. We still include the test so that once the bug is + // resolved, this test will start to fail and the individual tasked + // with fixing this bug knows to also fix this test :). + expectedLocalAddr: multicastV6Addr, + expectedDestAddr: multicastV6Addr, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpoint(test.proto) + + bindAddr := tcpip.FullAddress{Port: stackPort} + if err := c.ep.Bind(bindAddr); err != nil { + t.Fatalf("Bind(%+v): %s", bindAddr, err) + } + + if test.flow.isMulticast() { + ifoptSet := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: test.flow.getMcastAddr()} + if err := c.ep.SetSockOpt(ifoptSet); err != nil { + c.t.Fatalf("SetSockOpt(%+v): %s:", ifoptSet, err) + } + } + + if err := c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true); err != nil { + t.Fatalf("c.ep.SetSockOptBool(tcpip.ReceiveIPPacketInfoOption, true): %s", err) + } + + testRead(c, test.flow, checker.ReceiveIPPacketInfo(tcpip.IPPacketInfo{ + NIC: 1, + LocalAddr: test.expectedLocalAddr, + DestinationAddr: test.expectedDestAddr, + })) + + if got := c.s.Stats().UDP.PacketsReceived.Value(); got != 1 { + t.Fatalf("Read did not increment PacketsReceived: got = %d, want = 1", got) + } + }) + } +} + func TestWriteIncrementsPacketsSent(t *testing.T) { c := newDualTestContext(t, defaultMTU) defer c.cleanup() @@ -1263,6 +1422,30 @@ func TestWriteIncrementsPacketsSent(t *testing.T) { } } +func TestNoChecksum(t *testing.T) { + for _, flow := range []testFlow{unicastV4, unicastV6} { + t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { + c := newDualTestContext(t, defaultMTU) + defer c.cleanup() + + c.createEndpointForFlow(flow) + + // Disable the checksum generation. + if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, true); err != nil { + t.Fatalf("SetSockOptBool failed: %s", err) + } + // This option is effective on IPv4 only. + testWrite(c, flow, checker.UDP(checker.NoChecksum(flow.isV4()))) + + // Enable the checksum generation. 
+ if err := c.ep.SetSockOptBool(tcpip.NoChecksumOption, false); err != nil { + t.Fatalf("SetSockOptBool failed: %s", err) + } + testWrite(c, flow, checker.UDP(checker.NoChecksum(false))) + }) + } +} + func TestTTL(t *testing.T) { for _, flow := range []testFlow{unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6} { t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) { @@ -1510,12 +1693,12 @@ func TestMulticastInterfaceOption(t *testing.T) { Port: stackPort, } if err := c.ep.Connect(addr); err != nil { - c.t.Fatalf("Connect failed: %v", err) + c.t.Fatalf("Connect failed: %s", err) } } if err := c.ep.SetSockOpt(ifoptSet); err != nil { - c.t.Fatalf("SetSockOpt failed: %v", err) + c.t.Fatalf("SetSockOpt failed: %s", err) } // Verify multicast interface addr and NIC were set correctly. @@ -1523,7 +1706,7 @@ func TestMulticastInterfaceOption(t *testing.T) { ifoptWant := tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: ifoptSet.InterfaceAddr} var ifoptGot tcpip.MulticastInterfaceOption if err := c.ep.GetSockOpt(&ifoptGot); err != nil { - c.t.Fatalf("GetSockOpt failed: %v", err) + c.t.Fatalf("GetSockOpt failed: %s", err) } if ifoptGot != ifoptWant { c.t.Errorf("got GetSockOpt() = %#v, want = %#v", ifoptGot, ifoptWant) @@ -1583,9 +1766,8 @@ func TestV4UnknownDestination(t *testing.T) { return } - var pkt []byte - pkt = append(pkt, p.Pkt.Header.View()...) - pkt = append(pkt, p.Pkt.Data.ToView()...) + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) + pkt := vv.ToView() if got, want := len(pkt), header.IPv4MinimumProcessableDatagramSize; got > want { t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want) } @@ -1661,9 +1843,8 @@ func TestV6UnknownDestination(t *testing.T) { return } - var pkt []byte - pkt = append(pkt, p.Pkt.Header.View()...) - pkt = append(pkt, p.Pkt.Data.ToView()...) + vv := buffer.NewVectorisedView(p.Pkt.Size(), p.Pkt.Views()) + pkt := vv.ToView() if got, want := len(pkt), header.IPv6MinimumMTU; got > want { t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want) } @@ -1695,7 +1876,7 @@ func TestV6UnknownDestination(t *testing.T) { } // TestIncrementMalformedPacketsReceived verifies if the malformed received -// global and endpoint stats get incremented. +// global and endpoint stats are incremented. func TestIncrementMalformedPacketsReceived(t *testing.T) { c := newDualTestContext(t, defaultMTU) defer c.cleanup() @@ -1703,20 +1884,271 @@ func TestIncrementMalformedPacketsReceived(t *testing.T) { c.createEndpoint(ipv6.ProtocolNumber) // Bind to wildcard. if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil { - c.t.Fatalf("Bind failed: %v", err) + c.t.Fatalf("Bind failed: %s", err) } payload := newPayload() - c.t.Helper() h := unicastV6.header4Tuple(incoming) - c.injectV6Packet(payload, &h, false /* !valid */) + buf := c.buildV6Packet(payload, &h) - var want uint64 = 1 + // Invalidate the UDP header length field. 
+	u := header.UDP(buf[header.IPv6MinimumSize:])
+	u.SetLength(u.Length() + 1)
+
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	const want = 1
 	if got := c.s.Stats().UDP.MalformedPacketsReceived.Value(); got != want {
-		t.Errorf("got stats.UDP.MalformedPacketsReceived.Value() = %v, want = %v", got, want)
+		t.Errorf("got stats.UDP.MalformedPacketsReceived.Value() = %d, want = %d", got, want)
 	}
 	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.MalformedPacketsReceived.Value(); got != want {
-		t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %v, want = %v", got, want)
+		t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", got, want)
+	}
+}
+
+// TestShortHeader verifies that when a packet with a too-short UDP header is
+// received, the malformed received global stat gets incremented.
+func TestShortHeader(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	h := unicastV6.header4Tuple(incoming)
+
+	// Allocate a buffer for an IPv6 and too-short UDP header.
+	const udpSize = header.UDPMinimumSize - 1
+	buf := buffer.NewView(header.IPv6MinimumSize + udpSize)
+	// Initialize the IP header.
+	ip := header.IPv6(buf)
+	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
+		PayloadLength: uint16(udpSize),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       h.srcAddr.Addr,
+		DstAddr:       h.dstAddr.Addr,
+	})
+
+	// Initialize the UDP header.
+	udpHdr := header.UDP(buffer.NewView(header.UDPMinimumSize))
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcAddr.Port,
+		DstPort: h.dstAddr.Port,
+		Length:  header.UDPMinimumSize,
+	})
+	// Calculate the UDP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, h.srcAddr.Addr, h.dstAddr.Addr, uint16(len(udpHdr)))
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(xsum))
+	// Copy all but the last byte of the UDP header into the packet.
+	copy(buf[header.IPv6MinimumSize:], udpHdr)
+
+	// Inject packet.
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	if got, want := c.s.Stats().MalformedRcvdPackets.Value(), uint64(1); got != want {
+		t.Errorf("got c.s.Stats().MalformedRcvdPackets.Value() = %d, want = %d", got, want)
+	}
+}
+
+// TestIncrementChecksumErrorsV4 verifies if a checksum error is detected,
+// global and endpoint stats are incremented.
+func TestIncrementChecksumErrorsV4(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv4.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	payload := newPayload()
+	h := unicastV4.header4Tuple(incoming)
+	buf := c.buildV4Packet(payload, &h)
+
+	// Invalidate the UDP header checksum field, taking care to avoid
+	// overflow to zero, which would disable checksum validation.
+	for u := header.UDP(buf[header.IPv4MinimumSize:]); ; {
+		u.SetChecksum(u.Checksum() + 1)
+		if u.Checksum() != 0 {
+			break
+		}
+	}
+
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	const want = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestIncrementChecksumErrorsV6 verifies if a checksum error is detected,
+// global and endpoint stats are incremented.
+func TestIncrementChecksumErrorsV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	payload := newPayload()
+	h := unicastV6.header4Tuple(incoming)
+	buf := c.buildV6Packet(payload, &h)
+
+	// Invalidate the UDP header checksum field.
+	u := header.UDP(buf[header.IPv6MinimumSize:])
+	u.SetChecksum(u.Checksum() + 1)
+
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	const want = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestPayloadModifiedV4 verifies if a checksum error is detected,
+// global and endpoint stats are incremented.
+func TestPayloadModifiedV4(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv4.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	payload := newPayload()
+	h := unicastV4.header4Tuple(incoming)
+	buf := c.buildV4Packet(payload, &h)
+	// Modify the payload so that the checksum value in the UDP header will be incorrect.
+	buf[len(buf)-1]++
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	const want = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestPayloadModifiedV6 verifies if a checksum error is detected,
+// global and endpoint stats are incremented.
+func TestPayloadModifiedV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	payload := newPayload()
+	h := unicastV6.header4Tuple(incoming)
+	buf := c.buildV6Packet(payload, &h)
+	// Modify the payload so that the checksum value in the UDP header will be incorrect.
+	buf[len(buf)-1]++
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	const want = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestChecksumZeroV4 verifies if the checksum value is zero, global and
+// endpoint stats are *not* incremented (UDP checksum is optional on IPv4).
+func TestChecksumZeroV4(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv4.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	payload := newPayload()
+	h := unicastV4.header4Tuple(incoming)
+	buf := c.buildV4Packet(payload, &h)
+	// Set the checksum field in the UDP header to zero.
+	u := header.UDP(buf[header.IPv4MinimumSize:])
+	u.SetChecksum(0)
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	const want = 0
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
	}
+}
+
+// TestChecksumZeroV6 verifies if the checksum value is zero, global and
+// endpoint stats are incremented (UDP checksum is *not* optional on IPv6).
+func TestChecksumZeroV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to wildcard.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	payload := newPayload()
+	h := unicastV6.header4Tuple(incoming)
+	buf := c.buildV6Packet(payload, &h)
+	// Set the checksum field in the UDP header to zero.
+	u := header.UDP(buf[header.IPv6MinimumSize:])
+	u.SetChecksum(0)
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, stack.NewPacketBuffer(stack.PacketBufferOptions{
+		Data: buf.ToVectorisedView(),
+	}))
+
+	const want = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
 	}
 }
@@ -1730,15 +2162,15 @@ func TestShutdownRead(t *testing.T) {
 	// Bind to wildcard.
 	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
-		c.t.Fatalf("Bind failed: %v", err)
+		c.t.Fatalf("Bind failed: %s", err)
 	}
 	if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil {
-		c.t.Fatalf("Connect failed: %v", err)
+		c.t.Fatalf("Connect failed: %s", err)
 	}
 	if err := c.ep.Shutdown(tcpip.ShutdownRead); err != nil {
-		t.Fatalf("Shutdown failed: %v", err)
+		t.Fatalf("Shutdown failed: %s", err)
 	}
 	testFailingRead(c, unicastV6, true /* expectReadError */)
@@ -1761,11 +2193,11 @@ func TestShutdownWrite(t *testing.T) {
 	c.createEndpoint(ipv6.ProtocolNumber)
 	if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil {
-		c.t.Fatalf("Connect failed: %v", err)
+		c.t.Fatalf("Connect failed: %s", err)
 	}
 	if err := c.ep.Shutdown(tcpip.ShutdownWrite); err != nil {
-		t.Fatalf("Shutdown failed: %v", err)
+		t.Fatalf("Shutdown failed: %s", err)
 	}
 	testFailingWrite(c, unicastV6, tcpip.ErrClosedForSend)
@@ -1807,3 +2239,192 @@ func (c *testContext) checkEndpointReadStats(incr uint64, want tcpip.TransportEn
 		c.t.Errorf("Endpoint stats not matching for error %s got %+v want %+v", err, got, want)
 	}
 }
+
+func TestOutgoingSubnetBroadcast(t *testing.T) {
+	const nicID1 = 1
+
+	ipv4Addr := tcpip.AddressWithPrefix{
+		Address:   "\xc0\xa8\x01\x3a",
+		PrefixLen: 24,
+	}
+	ipv4Subnet := ipv4Addr.Subnet()
+	ipv4SubnetBcast := ipv4Subnet.Broadcast()
+	ipv4Gateway := tcpip.Address("\xc0\xa8\x01\x01")
+	ipv4AddrPrefix31 := tcpip.AddressWithPrefix{
+		Address:   "\xc0\xa8\x01\x3a",
+		PrefixLen: 31,
+	}
+	ipv4Subnet31 := ipv4AddrPrefix31.Subnet()
+	ipv4Subnet31Bcast := ipv4Subnet31.Broadcast()
+	ipv4AddrPrefix32 := tcpip.AddressWithPrefix{
+		Address:   "\xc0\xa8\x01\x3a",
+		PrefixLen: 32,
+	}
+	ipv4Subnet32 := ipv4AddrPrefix32.Subnet()
+	ipv4Subnet32Bcast := ipv4Subnet32.Broadcast()
+	ipv6Addr := tcpip.AddressWithPrefix{
+		Address:   "\x20\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+		PrefixLen: 64,
+	}
+	ipv6Subnet := ipv6Addr.Subnet()
+	ipv6SubnetBcast := ipv6Subnet.Broadcast()
+	remNetAddr := tcpip.AddressWithPrefix{
+		Address:   "\x64\x0a\x7b\x18",
+		PrefixLen: 24,
+	}
+	remNetSubnet := remNetAddr.Subnet()
+	remNetSubnetBcast := remNetSubnet.Broadcast()
+
+	tests := []struct {
+		name                 string
+		nicAddr              tcpip.ProtocolAddress
+		routes               []tcpip.Route
+		remoteAddr           tcpip.Address
+		requiresBroadcastOpt bool
+	}{
+		{
+			name: "IPv4 Broadcast to local subnet",
+			nicAddr: tcpip.ProtocolAddress{
+				Protocol:          header.IPv4ProtocolNumber,
+				AddressWithPrefix: ipv4Addr,
+			},
+			routes: []tcpip.Route{
+				{
+					Destination: ipv4Subnet,
+					NIC:         nicID1,
+				},
+			},
+			remoteAddr:           ipv4SubnetBcast,
+			requiresBroadcastOpt: true,
+		},
+		{
+			name: "IPv4 Broadcast to local /31 subnet",
+			nicAddr: tcpip.ProtocolAddress{
+				Protocol:          header.IPv4ProtocolNumber,
+				AddressWithPrefix: ipv4AddrPrefix31,
+			},
+			routes: []tcpip.Route{
+				{
+					Destination: ipv4Subnet31,
+					NIC:         nicID1,
+				},
+			},
+			remoteAddr:           ipv4Subnet31Bcast,
+			requiresBroadcastOpt: false,
+		},
+		{
+			name: "IPv4 Broadcast to local /32 subnet",
+			nicAddr: tcpip.ProtocolAddress{
+				Protocol:          header.IPv4ProtocolNumber,
+				AddressWithPrefix: ipv4AddrPrefix32,
+			},
+			routes: []tcpip.Route{
+				{
+					Destination: ipv4Subnet32,
+					NIC:         nicID1,
+				},
+			},
+			remoteAddr:           ipv4Subnet32Bcast,
+			requiresBroadcastOpt: false,
+		},
+		// IPv6 has no notion of a broadcast.
+		{
+			name: "IPv6 'Broadcast' to local subnet",
+			nicAddr: tcpip.ProtocolAddress{
+				Protocol:          header.IPv6ProtocolNumber,
+				AddressWithPrefix: ipv6Addr,
+			},
+			routes: []tcpip.Route{
+				{
+					Destination: ipv6Subnet,
+					NIC:         nicID1,
+				},
+			},
+			remoteAddr:           ipv6SubnetBcast,
+			requiresBroadcastOpt: false,
+		},
+		{
+			name: "IPv4 Broadcast to remote subnet",
+			nicAddr: tcpip.ProtocolAddress{
+				Protocol:          header.IPv4ProtocolNumber,
+				AddressWithPrefix: ipv4Addr,
+			},
+			routes: []tcpip.Route{
+				{
+					Destination: remNetSubnet,
+					Gateway:     ipv4Gateway,
+					NIC:         nicID1,
+				},
+			},
+			remoteAddr:           remNetSubnetBcast,
+			requiresBroadcastOpt: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, defaultMTU, "")
+			if err := s.CreateNIC(nicID1, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+			}
+			if err := s.AddProtocolAddress(nicID1, test.nicAddr); err != nil {
+				t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID1, test.nicAddr, err)
+			}
+
+			s.SetRouteTable(test.routes)
+
+			var netProto tcpip.NetworkProtocolNumber
+			switch l := len(test.remoteAddr); l {
+			case header.IPv4AddressSize:
+				netProto = header.IPv4ProtocolNumber
+			case header.IPv6AddressSize:
+				netProto = header.IPv6ProtocolNumber
+			default:
+				t.Fatalf("got unexpected address length = %d bytes", l)
+			}
+
+			wq := waiter.Queue{}
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, netProto, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, netProto, err)
+			}
+			defer ep.Close()
+
+			data := tcpip.SlicePayload([]byte{1, 2, 3, 4})
+			to := tcpip.FullAddress{
+				Addr: test.remoteAddr,
+				Port: 80,
+			}
+			opts := tcpip.WriteOptions{To: &to}
+			expectedErrWithoutBcastOpt := tcpip.ErrBroadcastDisabled
+			if !test.requiresBroadcastOpt {
+				expectedErrWithoutBcastOpt = nil
+			}
+
+			if n, _, err := ep.Write(data, opts); err != expectedErrWithoutBcastOpt {
+				t.Fatalf("got ep.Write(_, _) = (%d, _, %v), want = (_, _, %v)", n, err, expectedErrWithoutBcastOpt)
+			}
+
+			if err := ep.SetSockOptBool(tcpip.BroadcastOption, true); err != nil {
+				t.Fatalf("got SetSockOptBool(BroadcastOption, true): %s", err)
+			}
+
+			if n, _, err := ep.Write(data, opts); err != nil {
+				t.Fatalf("got ep.Write(_, _) = (%d, _, %s), want = (_, _, nil)", n, err)
+			}
+
+			if err := ep.SetSockOptBool(tcpip.BroadcastOption, false); err != nil {
+				t.Fatalf("got SetSockOptBool(BroadcastOption, false): %s", err)
+			}
+
+			if n, _, err := ep.Write(data, opts); err != expectedErrWithoutBcastOpt {
+				t.Fatalf("got ep.Write(_, _) = (%d, _, %v), want = (_, _, %v)", n, err, expectedErrWithoutBcastOpt)
+			}
+		})
+	}
+}
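The TestNoChecksum and TestChecksumZeroV4/TestChecksumZeroV6 cases in this file all rest on the same receive-side rule: a UDP checksum field of zero means the sender skipped checksumming, which is permitted over IPv4 (RFC 768) but is always an error over IPv6. A minimal standalone sketch of that rule is below; the validateUDPChecksum helper and its signature are illustrative assumptions for this note only, not part of this patch or of the gvisor API.

package main

import "fmt"

// validateUDPChecksum sketches only the receive-side decision: wire is the
// checksum field taken from the UDP header, computed is the checksum
// recomputed over the pseudo-header and payload, and isIPv6 selects the
// address family.
func validateUDPChecksum(wire, computed uint16, isIPv6 bool) bool {
	if wire == 0 {
		// A zero checksum means the sender did not checksum the
		// datagram; that is allowed over IPv4 but never over IPv6.
		return !isIPv6
	}
	return wire == computed
}

func main() {
	fmt.Println(validateUDPChecksum(0, 0xabcd, false))     // true: checksum is optional over IPv4
	fmt.Println(validateUDPChecksum(0, 0xabcd, true))      // false: checksum is mandatory over IPv6
	fmt.Println(validateUDPChecksum(0xabcd, 0xabcd, true)) // true: checksums match
}

Under that rule, TestChecksumZeroV4 expects the ChecksumErrors counters to stay at zero while TestChecksumZeroV6 expects them to increment, which is exactly what the two tests above assert.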