212 files changed, 80762 insertions, 0 deletions
diff --git a/pkg/tcpip/BUILD b/pkg/tcpip/BUILD
new file mode 100644
index 000000000..454e07662
--- /dev/null
+++ b/pkg/tcpip/BUILD
@@ -0,0 +1,32 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "tcpip",
+    srcs = [
+        "tcpip.go",
+        "time_unsafe.go",
+        "timer.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/tcpip/buffer",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "tcpip_test",
+    size = "small",
+    srcs = ["tcpip_test.go"],
+    library = ":tcpip",
+)
+
+go_test(
+    name = "tcpip_x_test",
+    size = "small",
+    srcs = ["timer_test.go"],
+    deps = [":tcpip"],
+)
diff --git a/pkg/tcpip/adapters/gonet/BUILD b/pkg/tcpip/adapters/gonet/BUILD
new file mode 100644
index 000000000..a984f1712
--- /dev/null
+++ b/pkg/tcpip/adapters/gonet/BUILD
@@ -0,0 +1,37 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "gonet",
+    srcs = ["gonet.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "gonet_test",
+    size = "small",
+    srcs = ["gonet_test.go"],
+    library = ":gonet",
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/waiter",
+        "@org_golang_x_net//nettest:go_default_library",
+    ],
+)
diff --git a/pkg/tcpip/adapters/gonet/gonet.go b/pkg/tcpip/adapters/gonet/gonet.go
new file mode 100644
index 000000000..d82ed5205
--- /dev/null
+++ b/pkg/tcpip/adapters/gonet/gonet.go
@@ -0,0 +1,738 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gonet provides a Go net package compatible wrapper for a tcpip stack.
+package gonet
+
+import (
+	"context"
+	"errors"
+	"io"
+	"net"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+var (
+	errCanceled   = errors.New("operation canceled")
+	errWouldBlock = errors.New("operation would block")
+)
+
+// timeoutError is how the net package reports timeouts.
+type timeoutError struct{}
+
+func (e *timeoutError) Error() string   { return "i/o timeout" }
+func (e *timeoutError) Timeout() bool   { return true }
+func (e *timeoutError) Temporary() bool { return true }
+
+// A TCPListener is a wrapper around a TCP tcpip.Endpoint that implements
+// net.Listener.
+type TCPListener struct {
+	stack  *stack.Stack
+	ep     tcpip.Endpoint
+	wq     *waiter.Queue
+	cancel chan struct{}
+}
+
+// NewTCPListener creates a new TCPListener from a listening tcpip.Endpoint.
+func NewTCPListener(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *TCPListener {
+	return &TCPListener{
+		stack:  s,
+		ep:     ep,
+		wq:     wq,
+		cancel: make(chan struct{}),
+	}
+}
+
+// ListenTCP creates a new TCPListener.
+func ListenTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPListener, error) {
+	// Create a TCP endpoint, bind it, then start listening.
+	var wq waiter.Queue
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq)
+	if err != nil {
+		return nil, errors.New(err.String())
+	}
+
+	if err := ep.Bind(addr); err != nil {
+		ep.Close()
+		return nil, &net.OpError{
+			Op:   "bind",
+			Net:  "tcp",
+			Addr: fullToTCPAddr(addr),
+			Err:  errors.New(err.String()),
+		}
+	}
+
+	if err := ep.Listen(10); err != nil {
+		ep.Close()
+		return nil, &net.OpError{
+			Op:   "listen",
+			Net:  "tcp",
+			Addr: fullToTCPAddr(addr),
+			Err:  errors.New(err.String()),
+		}
+	}
+
+	return NewTCPListener(s, &wq, ep), nil
+}
+
+// Close implements net.Listener.Close.
+func (l *TCPListener) Close() error {
+	l.ep.Close()
+	return nil
+}
+
+// Shutdown shuts down both directions of the listening endpoint and cancels blocked Accept calls.
+func (l *TCPListener) Shutdown() {
+	l.ep.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	close(l.cancel) // broadcast cancellation; closing twice panics, so call Shutdown at most once
+}
+
+// Addr implements net.Listener.Addr.
+func (l *TCPListener) Addr() net.Addr {
+	a, err := l.ep.GetLocalAddress()
+	if err != nil {
+		return nil
+	}
+	return fullToTCPAddr(a)
+}
+
+type deadlineTimer struct {
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	readTimer     *time.Timer
+	readCancelCh  chan struct{}
+	writeTimer    *time.Timer
+	writeCancelCh chan struct{}
+}
+
+func (d *deadlineTimer) init() {
+	d.readCancelCh = make(chan struct{})
+	d.writeCancelCh = make(chan struct{})
+}
+
+// readCancel returns the channel that is closed once the current read deadline expires.
+func (d *deadlineTimer) readCancel() <-chan struct{} {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return d.readCancelCh
+}
+// writeCancel returns the channel that is closed once the current write deadline expires.
+func (d *deadlineTimer) writeCancel() <-chan struct{} {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return d.writeCancelCh
+}
+
+// setDeadline contains the shared logic for setting a deadline.
+//
+// cancelCh and timer must be pointers to deadlineTimer.readCancelCh and
+// deadlineTimer.readTimer or deadlineTimer.writeCancelCh and
+// deadlineTimer.writeTimer.
+//
+// setDeadline must only be called while holding d.mu.
+func (d *deadlineTimer) setDeadline(cancelCh *chan struct{}, timer **time.Timer, t time.Time) {
+	if *timer != nil && !(*timer).Stop() {
+		*cancelCh = make(chan struct{}) // old timer already fired or was stopped: start from a fresh channel
+	}
+
+	// Create a new channel if we already closed it due to setting an already
+	// expired time. We won't race with the timer because we already handled
+	// that above.
+	select {
+	case <-*cancelCh:
+		*cancelCh = make(chan struct{})
+	default:
+	}
+
+	// "A zero value for t means I/O operations will not time out."
+	// - net.Conn.SetDeadline
+	if t.IsZero() {
+		return
+	}
+
+	timeout := time.Until(t)
+	if timeout <= 0 {
+		close(*cancelCh) // deadline is already in the past: expire immediately
+		return
+	}
+
+	// Timer.Stop returns whether or not the AfterFunc has started, but
+	// does not indicate whether or not it has completed. Make a copy of
+	// the cancel channel to prevent this code from racing with the next
+	// call of setDeadline replacing *cancelCh.
+	ch := *cancelCh
+	*timer = time.AfterFunc(timeout, func() {
+		close(ch)
+	})
+}
+
+// SetReadDeadline implements net.Conn.SetReadDeadline and
+// net.PacketConn.SetReadDeadline.
+func (d *deadlineTimer) SetReadDeadline(t time.Time) error {
+	d.mu.Lock()
+	d.setDeadline(&d.readCancelCh, &d.readTimer, t)
+	d.mu.Unlock()
+	return nil
+}
+
+// SetWriteDeadline implements net.Conn.SetWriteDeadline and
+// net.PacketConn.SetWriteDeadline.
+func (d *deadlineTimer) SetWriteDeadline(t time.Time) error {
+	d.mu.Lock()
+	d.setDeadline(&d.writeCancelCh, &d.writeTimer, t)
+	d.mu.Unlock()
+	return nil
+}
+
+// SetDeadline implements net.Conn.SetDeadline and net.PacketConn.SetDeadline.
+func (d *deadlineTimer) SetDeadline(t time.Time) error {
+	d.mu.Lock()
+	d.setDeadline(&d.readCancelCh, &d.readTimer, t)
+	d.setDeadline(&d.writeCancelCh, &d.writeTimer, t)
+	d.mu.Unlock()
+	return nil
+}
+
+// A TCPConn is a wrapper around a TCP tcpip.Endpoint that implements the net.Conn
+// interface.
+type TCPConn struct {
+	deadlineTimer
+
+	wq *waiter.Queue
+	ep tcpip.Endpoint
+
+	// readMu serializes reads and implicitly protects read.
+	//
+	// Lock ordering:
+	// If both readMu and deadlineTimer.mu are to be used in a single
+	// request, readMu must be acquired before deadlineTimer.mu.
+	readMu sync.Mutex
+
+	// read contains bytes that have been read from the endpoint,
+	// but haven't yet been returned.
+	read buffer.View
+}
+
+// NewTCPConn creates a new TCPConn.
+func NewTCPConn(wq *waiter.Queue, ep tcpip.Endpoint) *TCPConn {
+	c := &TCPConn{
+		wq: wq,
+		ep: ep,
+	}
+	c.deadlineTimer.init()
+	return c
+}
+
+// Accept implements net.Listener.Accept.
+func (l *TCPListener) Accept() (net.Conn, error) {
+	n, wq, err := l.ep.Accept()
+
+	if err == tcpip.ErrWouldBlock {
+		// Create wait queue entry that notifies a channel.
+		waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+		l.wq.EventRegister(&waitEntry, waiter.EventIn)
+		defer l.wq.EventUnregister(&waitEntry)
+
+		for {
+			n, wq, err = l.ep.Accept()
+
+			if err != tcpip.ErrWouldBlock {
+				break
+			}
+
+			select {
+			case <-l.cancel:
+				return nil, errCanceled
+			case <-notifyCh:
+			}
+		}
+	}
+
+	if err != nil {
+		return nil, &net.OpError{
+			Op:   "accept",
+			Net:  "tcp",
+			Addr: l.Addr(),
+			Err:  errors.New(err.String()),
+		}
+	}
+
+	return NewTCPConn(wq, n), nil
+}
+
+type opErrorer interface {
+	newOpError(op string, err error) *net.OpError
+}
+
+// commonRead implements the common logic between net.Conn.Read and
+// net.PacketConn.ReadFrom.
+func commonRead(ep tcpip.Endpoint, wq *waiter.Queue, deadline <-chan struct{}, addr *tcpip.FullAddress, errorer opErrorer, dontWait bool) ([]byte, error) {
+	select {
+	case <-deadline:
+		return nil, errorer.newOpError("read", &timeoutError{})
+	default:
+	}
+
+	read, _, err := ep.Read(addr)
+
+	if err == tcpip.ErrWouldBlock {
+		if dontWait {
+			return nil, errWouldBlock
+		}
+		// Create wait queue entry that notifies a channel.
+		waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+		wq.EventRegister(&waitEntry, waiter.EventIn)
+		defer wq.EventUnregister(&waitEntry)
+		for {
+			read, _, err = ep.Read(addr)
+			if err != tcpip.ErrWouldBlock {
+				break
+			}
+			select {
+			case <-deadline:
+				return nil, errorer.newOpError("read", &timeoutError{})
+			case <-notifyCh:
+			}
+		}
+	}
+
+	if err == tcpip.ErrClosedForReceive {
+		return nil, io.EOF
+	}
+
+	if err != nil {
+		return nil, errorer.newOpError("read", errors.New(err.String()))
+	}
+
+	return read, nil
+}
+
+// Read implements net.Conn.Read.
+func (c *TCPConn) Read(b []byte) (int, error) {
+	c.readMu.Lock()
+	defer c.readMu.Unlock()
+
+	deadline := c.readCancel()
+
+	numRead := 0
+	defer func() {
+		if numRead != 0 {
+			c.ep.ModerateRecvBuf(numRead)
+		}
+	}()
+	for numRead != len(b) {
+		if len(c.read) == 0 {
+			var err error
+			c.read, err = commonRead(c.ep, c.wq, deadline, nil, c, numRead != 0)
+			if err != nil {
+				if numRead != 0 {
+					return numRead, nil
+				}
+				return numRead, err
+			}
+		}
+		n := copy(b[numRead:], c.read)
+		c.read.TrimFront(n)
+		numRead += n
+		if len(c.read) == 0 {
+			c.read = nil
+		}
+	}
+	return numRead, nil
+}
+
+// Write implements net.Conn.Write.
+func (c *TCPConn) Write(b []byte) (int, error) {
+	deadline := c.writeCancel()
+
+	// Check if deadlineTimer has already expired.
+	select {
+	case <-deadline:
+		return 0, c.newOpError("write", &timeoutError{})
+	default:
+	}
+
+	v := buffer.NewViewFromBytes(b)
+
+	// We must handle two soft failure conditions simultaneously:
+	//  1. Write may write nothing and return tcpip.ErrWouldBlock.
+	//     If this happens, we need to register for notifications if we have
+	//     not already and wait to try again.
+	//  2. Write may write fewer than the full number of bytes and return
+	//     without error. In this case we need to try writing the remaining
+	//     bytes again. We do not need to register for notifications.
+	//
+	// What is more, these two soft failure conditions can be interspersed.
+	// There is no guarantee that all of the condition #1s will occur before
+	// all of the condition #2s or vice versa.
+	var (
+		err      *tcpip.Error
+		nbytes   int
+		reg      bool
+		notifyCh chan struct{}
+	)
+	for nbytes < len(b) && (err == tcpip.ErrWouldBlock || err == nil) {
+		if err == tcpip.ErrWouldBlock {
+			if !reg {
+				// Only register once.
+				reg = true
+
+				// Create wait queue entry that notifies a channel.
+				var waitEntry waiter.Entry
+				waitEntry, notifyCh = waiter.NewChannelEntry(nil)
+				c.wq.EventRegister(&waitEntry, waiter.EventOut)
+				defer c.wq.EventUnregister(&waitEntry)
+			} else {
+				// Don't wait immediately after registration in case more data
+				// became available between when we last checked and when we setup
+				// the notification.
+				select {
+				case <-deadline:
+					return nbytes, c.newOpError("write", &timeoutError{})
+				case <-notifyCh:
+				}
+			}
+		}
+
+		var n int64
+		var resCh <-chan struct{}
+		n, resCh, err = c.ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{})
+		nbytes += int(n)
+		v.TrimFront(int(n))
+
+		if resCh != nil {
+			select {
+			case <-deadline:
+				return nbytes, c.newOpError("write", &timeoutError{})
+			case <-resCh:
+			}
+
+			n, _, err = c.ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{})
+			nbytes += int(n)
+			v.TrimFront(int(n))
+		}
+	}
+
+	if err == nil {
+		return nbytes, nil
+	}
+
+	return nbytes, c.newOpError("write", errors.New(err.String()))
+}
+
+// Close implements net.Conn.Close.
+func (c *TCPConn) Close() error {
+	c.ep.Close()
+	return nil
+}
+
+// CloseRead shuts down the reading side of the TCP connection. Most callers
+// should just use Close.
+//
+// A TCP Half-Close is performed the same as CloseRead for *net.TCPConn.
+func (c *TCPConn) CloseRead() error {
+	if terr := c.ep.Shutdown(tcpip.ShutdownRead); terr != nil {
+		return c.newOpError("close", errors.New(terr.String()))
+	}
+	return nil
+}
+
+// CloseWrite shuts down the writing side of the TCP connection. Most callers
+// should just use Close.
+//
+// A TCP Half-Close is performed the same as CloseWrite for *net.TCPConn.
+func (c *TCPConn) CloseWrite() error {
+	if terr := c.ep.Shutdown(tcpip.ShutdownWrite); terr != nil {
+		return c.newOpError("close", errors.New(terr.String()))
+	}
+	return nil
+}
+
+// LocalAddr implements net.Conn.LocalAddr.
+func (c *TCPConn) LocalAddr() net.Addr {
+	a, err := c.ep.GetLocalAddress()
+	if err != nil {
+		return nil
+	}
+	return fullToTCPAddr(a)
+}
+
+// RemoteAddr implements net.Conn.RemoteAddr.
+func (c *TCPConn) RemoteAddr() net.Addr {
+	a, err := c.ep.GetRemoteAddress()
+	if err != nil {
+		return nil
+	}
+	return fullToTCPAddr(a)
+}
+
+func (c *TCPConn) newOpError(op string, err error) *net.OpError {
+	return &net.OpError{
+		Op:     op,
+		Net:    "tcp",
+		Source: c.LocalAddr(),
+		Addr:   c.RemoteAddr(),
+		Err:    err,
+	}
+}
+
+func fullToTCPAddr(addr tcpip.FullAddress) *net.TCPAddr {
+	return &net.TCPAddr{IP: net.IP(addr.Addr), Port: int(addr.Port)}
+}
+
+func fullToUDPAddr(addr tcpip.FullAddress) *net.UDPAddr {
+	return &net.UDPAddr{IP: net.IP(addr.Addr), Port: int(addr.Port)}
+}
+
+// DialTCP creates a new TCPConn connected to the specified address.
+func DialTCP(s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) {
+	return DialContextTCP(context.Background(), s, addr, network)
+}
+
+// DialContextTCP creates a new TCPConn connected to the specified address. ctx may
+// cancel or time out the attempt; on any failure the created endpoint is closed.
+func DialContextTCP(ctx context.Context, s *stack.Stack, addr tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*TCPConn, error) {
+	// Create TCP endpoint, then connect.
+	var wq waiter.Queue
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, network, &wq)
+	if err != nil {
+		return nil, errors.New(err.String())
+	}
+
+	// Create wait queue entry that notifies a channel.
+	//
+	// We do this unconditionally as Connect will always return an error.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&waitEntry, waiter.EventOut)
+	defer wq.EventUnregister(&waitEntry)
+
+	select {
+	case <-ctx.Done():
+		ep.Close() // ctx was done before connecting: release the endpoint instead of leaking it
+		return nil, ctx.Err()
+	default:
+	}
+
+	err = ep.Connect(addr)
+	if err == tcpip.ErrConnectStarted {
+		select {
+		case <-ctx.Done():
+			ep.Close()
+			return nil, ctx.Err()
+		case <-notifyCh:
+		}
+		err = ep.GetSockOpt(tcpip.ErrorOption{})
+	}
+	if err != nil {
+		ep.Close()
+		return nil, &net.OpError{
+			Op:   "connect",
+			Net:  "tcp",
+			Addr: fullToTCPAddr(addr),
+			Err:  errors.New(err.String()),
+		}
+	}
+
+	return NewTCPConn(&wq, ep), nil
+}
+
+// A UDPConn is a wrapper around a UDP tcpip.Endpoint that implements
+// net.Conn and net.PacketConn.
+type UDPConn struct {
+	deadlineTimer
+
+	stack *stack.Stack
+	ep    tcpip.Endpoint
+	wq    *waiter.Queue
+}
+
+// NewUDPConn creates a new UDPConn.
+func NewUDPConn(s *stack.Stack, wq *waiter.Queue, ep tcpip.Endpoint) *UDPConn {
+	c := &UDPConn{
+		stack: s,
+		ep:    ep,
+		wq:    wq,
+	}
+	c.deadlineTimer.init()
+	return c
+}
+
+// DialUDP creates a new UDPConn.
+//
+// If laddr is nil, a local address is automatically chosen.
+//
+// If raddr is nil, the UDPConn is left unconnected.
+func DialUDP(s *stack.Stack, laddr, raddr *tcpip.FullAddress, network tcpip.NetworkProtocolNumber) (*UDPConn, error) {
+	var wq waiter.Queue
+	ep, err := s.NewEndpoint(udp.ProtocolNumber, network, &wq)
+	if err != nil {
+		return nil, errors.New(err.String())
+	}
+
+	if laddr != nil {
+		if err := ep.Bind(*laddr); err != nil {
+			ep.Close()
+			return nil, &net.OpError{
+				Op:   "bind",
+				Net:  "udp",
+				Addr: fullToUDPAddr(*laddr),
+				Err:  errors.New(err.String()),
+			}
+		}
+	}
+
+	c := NewUDPConn(s, &wq, ep)
+
+	if raddr != nil {
+		if err := c.ep.Connect(*raddr); err != nil {
+			c.ep.Close()
+			return nil, &net.OpError{
+				Op:   "connect",
+				Net:  "udp",
+				Addr: fullToUDPAddr(*raddr),
+				Err:  errors.New(err.String()),
+			}
+		}
+	}
+
+	return c, nil
+}
+
+func (c *UDPConn) newOpError(op string, err error) *net.OpError {
+	return c.newRemoteOpError(op, nil, err)
+}
+
+func (c *UDPConn) newRemoteOpError(op string, remote net.Addr, err error) *net.OpError {
+	return &net.OpError{
+		Op:     op,
+		Net:    "udp",
+		Source: c.LocalAddr(),
+		Addr:   remote,
+		Err:    err,
+	}
+}
+
+// RemoteAddr implements net.Conn.RemoteAddr.
+func (c *UDPConn) RemoteAddr() net.Addr {
+	a, err := c.ep.GetRemoteAddress()
+	if err != nil {
+		return nil
+	}
+	return fullToUDPAddr(a)
+}
+
+// Read implements net.Conn.Read
+func (c *UDPConn) Read(b []byte) (int, error) {
+	bytesRead, _, err := c.ReadFrom(b)
+	return bytesRead, err
+}
+
+// ReadFrom implements net.PacketConn.ReadFrom.
+func (c *UDPConn) ReadFrom(b []byte) (int, net.Addr, error) {
+	deadline := c.readCancel()
+
+	var addr tcpip.FullAddress
+	read, err := commonRead(c.ep, c.wq, deadline, &addr, c, false)
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return copy(b, read), fullToUDPAddr(addr), nil
+}
+
+func (c *UDPConn) Write(b []byte) (int, error) {
+	return c.WriteTo(b, nil)
+}
+
+// WriteTo implements net.PacketConn.WriteTo. addr must be nil or a non-nil *net.UDPAddr.
+func (c *UDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
+	deadline := c.writeCancel()
+
+	// Check if deadline has already expired.
+	select {
+	case <-deadline:
+		return 0, c.newRemoteOpError("write", addr, &timeoutError{})
+	default:
+	}
+
+	// If we're being called by Write, there is no addr
+	wopts := tcpip.WriteOptions{}
+	if addr != nil {
+		ua, ok := addr.(*net.UDPAddr)
+		if !ok || ua == nil { // report an error rather than panicking on a foreign or typed-nil address
+			return 0, c.newRemoteOpError("write", addr, errors.New("invalid address: want non-nil *net.UDPAddr"))
+		}
+		wopts.To = &tcpip.FullAddress{Addr: tcpip.Address(ua.IP), Port: uint16(ua.Port)}
+	}
+
+	v := buffer.NewView(len(b))
+	copy(v, b)
+
+	n, resCh, err := c.ep.Write(tcpip.SlicePayload(v), wopts)
+	if resCh != nil {
+		select {
+		case <-deadline:
+			return int(n), c.newRemoteOpError("write", addr, &timeoutError{})
+		case <-resCh:
+		}
+		n, _, err = c.ep.Write(tcpip.SlicePayload(v), wopts) // retry once resCh has fired
+	}
+
+	if err == tcpip.ErrWouldBlock {
+		// Create wait queue entry that notifies a channel.
+		waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+		c.wq.EventRegister(&waitEntry, waiter.EventOut)
+		defer c.wq.EventUnregister(&waitEntry)
+		for {
+			select {
+			case <-deadline:
+				return int(n), c.newRemoteOpError("write", addr, &timeoutError{})
+			case <-notifyCh:
+			}
+			n, _, err = c.ep.Write(tcpip.SlicePayload(v), wopts)
+			if err != tcpip.ErrWouldBlock {
+				break
+			}
+		}
+	}
+
+	if err == nil {
+		return int(n), nil
+	}
+	return int(n), c.newRemoteOpError("write", addr, errors.New(err.String()))
+}
+
+// Close implements net.PacketConn.Close.
+func (c *UDPConn) Close() error {
+	c.ep.Close()
+	return nil
+}
+
+// LocalAddr implements net.PacketConn.LocalAddr.
+func (c *UDPConn) LocalAddr() net.Addr {
+	a, err := c.ep.GetLocalAddress()
+	if err != nil {
+		return nil
+	}
+	return fullToUDPAddr(a)
+}
diff --git a/pkg/tcpip/adapters/gonet/gonet_test.go b/pkg/tcpip/adapters/gonet/gonet_test.go
new file mode 100644
index 000000000..3c552988a
--- /dev/null
+++ b/pkg/tcpip/adapters/gonet/gonet_test.go
@@ -0,0 +1,716 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gonet
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net"
+	"reflect"
+	"strings"
+	"testing"
+	"time"
+
+	"golang.org/x/net/nettest"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	NICID = 1
+)
+
+func TestTimeouts(t *testing.T) {
+	nc := NewTCPConn(nil, nil)
+	dlfs := []struct {
+		name string
+		f    func(time.Time) error
+	}{
+		{"SetDeadline", nc.SetDeadline},
+		{"SetReadDeadline", nc.SetReadDeadline},
+		{"SetWriteDeadline", nc.SetWriteDeadline},
+	}
+
+	for _, dlf := range dlfs {
+		if err := dlf.f(time.Time{}); err != nil {
+			t.Errorf("got %s(time.Time{}) = %v, want = %v", dlf.name, err, nil)
+		}
+	}
+}
+
+func newLoopbackStack() (*stack.Stack, *tcpip.Error) {
+	// Create the stack and add a NIC.
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol()},
+	})
+
+	if err := s.CreateNIC(NICID, loopback.New()); err != nil {
+		return nil, err
+	}
+
+	// Add default route.
+	s.SetRouteTable([]tcpip.Route{
+		// IPv4
+		{
+			Destination: header.IPv4EmptySubnet,
+			NIC:         NICID,
+		},
+
+		// IPv6
+		{
+			Destination: header.IPv6EmptySubnet,
+			NIC:         NICID,
+		},
+	})
+
+	return s, nil
+}
+
+type testConnection struct {
+	wq *waiter.Queue
+	e  *waiter.Entry
+	ch chan struct{}
+	ep tcpip.Endpoint
+}
+
+// connect dials addr over IPv4 TCP on s; on success the returned entry is registered for EventIn.
+func connect(s *stack.Stack, addr tcpip.FullAddress) (*testConnection, *tcpip.Error) {
+	wq := &waiter.Queue{}
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		return nil, err // without this check the error was overwritten by Connect below
+	}
+	entry, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&entry, waiter.EventOut)
+	err = ep.Connect(addr)
+	if err == tcpip.ErrConnectStarted {
+		<-ch
+		err = ep.GetSockOpt(tcpip.ErrorOption{})
+	}
+	if err != nil {
+		return nil, err
+	}
+	wq.EventUnregister(&entry)
+	wq.EventRegister(&entry, waiter.EventIn)
+	return &testConnection{wq, &entry, ch, ep}, nil
+}
+
+func (c *testConnection) close() {
+	c.wq.EventUnregister(c.e)
+	c.ep.Close()
+}
+
+// TestCloseReader tests that Conn.Close() causes Conn.Read() to unblock.
+func TestCloseReader(t *testing.T) {
+	s, err := newLoopbackStack()
+	if err != nil {
+		t.Fatalf("newLoopbackStack() = %v", err)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
+
+	l, e := ListenTCP(s, addr, ipv4.ProtocolNumber)
+	if e != nil {
+		t.Fatalf("ListenTCP() = %v", e)
+	}
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		c, err := l.Accept()
+		if err != nil {
+			t.Errorf("l.Accept() = %v", err)
+			return // t.Fatalf must only be called from the goroutine running the test
+		}
+
+		// Give c.Read() a chance to block before closing the connection.
+		time.AfterFunc(time.Millisecond*50, func() {
+			c.Close()
+		})
+
+		buf := make([]byte, 256)
+		n, err := c.Read(buf)
+		if n != 0 || err != io.EOF {
+			t.Errorf("c.Read() = (%d, %v), want (0, EOF)", n, err)
+		}
+	}()
+	sender, err := connect(s, addr)
+	if err != nil {
+		t.Fatalf("connect() = %v", err)
+	}
+
+	select {
+	case <-done:
+	case <-time.After(5 * time.Second):
+		t.Errorf("c.Read() didn't unblock")
+	}
+	sender.close()
+}
+
+// TestCloseReaderWithForwarder tests that TCPConn.Close wakes TCPConn.Read when
+// using tcp.Forwarder.
+func TestCloseReaderWithForwarder(t *testing.T) {
+	s, err := newLoopbackStack()
+	if err != nil {
+		t.Fatalf("newLoopbackStack() = %v", err)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
+
+	done := make(chan struct{})
+
+	fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) {
+		defer close(done)
+
+		var wq waiter.Queue
+		ep, err := r.CreateEndpoint(&wq)
+		if err != nil {
+			t.Fatalf("r.CreateEndpoint() = %v", err)
+		}
+		defer ep.Close()
+		r.Complete(false)
+
+		c := NewTCPConn(&wq, ep)
+
+		// Give c.Read() a chance to block before closing the connection.
+		time.AfterFunc(time.Millisecond*50, func() {
+			c.Close()
+		})
+
+		buf := make([]byte, 256)
+		n, e := c.Read(buf)
+		if n != 0 || e != io.EOF {
+			t.Errorf("c.Read() = (%d, %v), want (0, EOF)", n, e)
+		}
+	})
+	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
+
+	sender, err := connect(s, addr)
+	if err != nil {
+		t.Fatalf("connect() = %v", err)
+	}
+
+	select {
+	case <-done:
+	case <-time.After(5 * time.Second):
+		t.Errorf("c.Read() didn't unblock")
+	}
+	sender.close()
+}
+
+func TestCloseRead(t *testing.T) {
+	s, terr := newLoopbackStack()
+	if terr != nil {
+		t.Fatalf("newLoopbackStack() = %v", terr)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
+
+	fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) {
+		var wq waiter.Queue
+		_, err := r.CreateEndpoint(&wq)
+		if err != nil {
+			t.Fatalf("r.CreateEndpoint() = %v", err)
+		}
+		// Endpoint will be closed in deferred s.Close (above).
+	})
+
+	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
+
+	tc, terr := connect(s, addr)
+	if terr != nil {
+		t.Fatalf("connect() = %v", terr)
+	}
+	c := NewTCPConn(tc.wq, tc.ep)
+
+	if err := c.CloseRead(); err != nil {
+		t.Errorf("c.CloseRead() = %v", err)
+	}
+
+	buf := make([]byte, 256)
+	if n, err := c.Read(buf); err != io.EOF {
+		t.Errorf("c.Read() = (%d, %v), want (0, io.EOF)", n, err)
+	}
+
+	if n, err := c.Write([]byte("abc123")); n != 6 || err != nil {
+		t.Errorf("c.Write() = (%d, %v), want (6, nil)", n, err)
+	}
+}
+
+func TestCloseWrite(t *testing.T) {
+	s, terr := newLoopbackStack()
+	if terr != nil {
+		t.Fatalf("newLoopbackStack() = %v", terr)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
+
+	fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) {
+		var wq waiter.Queue
+		ep, err := r.CreateEndpoint(&wq)
+		if err != nil {
+			t.Fatalf("r.CreateEndpoint() = %v", err)
+		}
+		defer ep.Close()
+		r.Complete(false)
+
+		c := NewTCPConn(&wq, ep)
+
+		n, e := c.Read(make([]byte, 256))
+		if n != 0 || e != io.EOF {
+			t.Errorf("c.Read() = (%d, %v), want (0, io.EOF)", n, e)
+		}
+
+		if n, e = c.Write([]byte("abc123")); n != 6 || e != nil {
+			t.Errorf("c.Write() = (%d, %v), want (6, nil)", n, e)
+		}
+	})
+
+	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
+
+	tc, terr := connect(s, addr)
+	if terr != nil {
+		t.Fatalf("connect() = %v", terr)
+	}
+	c := NewTCPConn(tc.wq, tc.ep)
+
+	if err := c.CloseWrite(); err != nil {
+		t.Errorf("c.CloseWrite() = %v", err)
+	}
+
+	buf := make([]byte, 256)
+	n, err := c.Read(buf)
+	if err != nil || string(buf[:n]) != "abc123" {
+		t.Fatalf("c.Read() = (%d, %v), want (6, nil)", n, err)
+	}
+
+	n, err = c.Write([]byte("abc123"))
+	got, ok := err.(*net.OpError)
+	want := "endpoint is closed for send"
+	if n != 0 || !ok || got.Op != "write" || got.Err == nil || !strings.HasSuffix(got.Err.Error(), want) {
+		t.Errorf("c.Write() = (%d, %v), want (0, OpError(Op: write, Err: %s))", n, err, want)
+	}
+}
+
+func TestUDPForwarder(t *testing.T) {
+	s, terr := newLoopbackStack()
+	if terr != nil {
+		t.Fatalf("newLoopbackStack() = %v", terr)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
+	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, ip1)
+	ip2 := tcpip.Address(net.IPv4(169, 254, 10, 2).To4())
+	addr2 := tcpip.FullAddress{NICID, ip2, 11311}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, ip2)
+
+	done := make(chan struct{})
+	fwd := udp.NewForwarder(s, func(r *udp.ForwarderRequest) {
+		defer close(done)
+
+		var wq waiter.Queue
+		ep, err := r.CreateEndpoint(&wq)
+		if err != nil {
+			t.Fatalf("r.CreateEndpoint() = %v", err)
+		}
+		defer ep.Close()
+
+		c := NewTCPConn(&wq, ep)
+
+		buf := make([]byte, 256)
+		n, e := c.Read(buf)
+		if e != nil {
+			t.Errorf("c.Read() = %v", e)
+		}
+
+		if _, e := c.Write(buf[:n]); e != nil {
+			t.Errorf("c.Write() = %v", e)
+		}
+	})
+	s.SetTransportProtocolHandler(udp.ProtocolNumber, fwd.HandlePacket)
+
+	c2, err := DialUDP(s, &addr2, nil, ipv4.ProtocolNumber)
+	if err != nil {
+		t.Fatal("DialUDP(bind port 5):", err)
+	}
+
+	sent := "abc123"
+	sendAddr := fullToUDPAddr(addr1)
+	if n, err := c2.WriteTo([]byte(sent), sendAddr); err != nil || n != len(sent) {
+		t.Errorf("c1.WriteTo(%q, %v) = %d, %v, want = %d, %v", sent, sendAddr, n, err, len(sent), nil)
+	}
+
+	buf := make([]byte, 256)
+	n, recvAddr, err := c2.ReadFrom(buf)
+	if err != nil || recvAddr.String() != sendAddr.String() {
+		t.Errorf("c1.ReadFrom() = %d, %v, %v, want = %d, %v, %v", n, recvAddr, err, len(sent), sendAddr, nil)
+	}
+}
+
+// TestDeadlineChange tests that changing the deadline affects currently blocked reads.
+func TestDeadlineChange(t *testing.T) {
+	s, err := newLoopbackStack()
+	if err != nil {
+		t.Fatalf("newLoopbackStack() = %v", err)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
+
+	l, e := ListenTCP(s, addr, ipv4.ProtocolNumber)
+	if e != nil {
+		t.Fatalf("ListenTCP() = %v", e)
+	}
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		c, err := l.Accept()
+		if err != nil {
+			t.Errorf("l.Accept() = %v", err)
+			return // t.Fatalf must only be called from the goroutine running the test
+		}
+
+		c.SetDeadline(time.Now().Add(time.Minute))
+		// Give c.Read() a chance to block before shortening the deadline.
+		time.AfterFunc(time.Millisecond*50, func() {
+			c.SetDeadline(time.Now().Add(time.Millisecond * 10))
+		})
+
+		buf := make([]byte, 256)
+		n, err := c.Read(buf)
+		got, ok := err.(*net.OpError)
+		want := "i/o timeout"
+		if n != 0 || !ok || got.Err == nil || got.Err.Error() != want {
+			t.Errorf("c.Read() = (%d, %v), want (0, OpError(%s))", n, err, want)
+		}
+	}()
+	sender, err := connect(s, addr)
+	if err != nil {
+		t.Fatalf("connect() = %v", err)
+	}
+
+	select {
+	case <-done:
+	case <-time.After(time.Millisecond * 500):
+		t.Errorf("c.Read() didn't unblock")
+	}
+	sender.close()
+}
+
+func TestPacketConnTransfer(t *testing.T) {
+	s, e := newLoopbackStack()
+	if e != nil {
+		t.Fatalf("newLoopbackStack() = %v", e)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	ip1 := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
+	addr1 := tcpip.FullAddress{NICID, ip1, 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, ip1)
+	ip2 := tcpip.Address(net.IPv4(169, 254, 10, 2).To4())
+	addr2 := tcpip.FullAddress{NICID, ip2, 11311}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, ip2)
+
+	c1, err := DialUDP(s, &addr1, nil, ipv4.ProtocolNumber)
+	if err != nil {
+		t.Fatal("DialUDP(bind port 4):", err)
+	}
+	c2, err := DialUDP(s, &addr2, nil, ipv4.ProtocolNumber)
+	if err != nil {
+		t.Fatal("DialUDP(bind port 5):", err)
+	}
+
+	c1.SetDeadline(time.Now().Add(time.Second))
+	c2.SetDeadline(time.Now().Add(time.Second))
+
+	sent := "abc123"
+	sendAddr := fullToUDPAddr(addr2)
+	if n, err := c1.WriteTo([]byte(sent), sendAddr); err != nil || n != len(sent) {
+		t.Errorf("got c1.WriteTo(%q, %v) = %d, %v, want = %d, %v", sent, sendAddr, n, err, len(sent), nil)
+	}
+	recv := make([]byte, len(sent))
+	n, recvAddr, err := c2.ReadFrom(recv)
+	if err != nil || n != len(recv) {
+		t.Errorf("got c2.ReadFrom() = %d, %v, want = %d, %v", n, err, len(recv), nil)
+	}
+
+	if recv := string(recv); recv != sent {
+		t.Errorf("got recv = %q, want = %q", recv, sent)
+	}
+
+	if want := fullToUDPAddr(addr1); !reflect.DeepEqual(recvAddr, want) {
+		t.Errorf("got recvAddr = %v, want = %v", recvAddr, want)
+	}
+
+	if err := c1.Close(); err != nil {
+		t.Error("c1.Close():", err)
+	}
+	if err := c2.Close(); err != nil {
+		t.Error("c2.Close():", err)
+	}
+}
+
+func TestConnectedPacketConnTransfer(t *testing.T) {
+	s, e := newLoopbackStack()
+	if e != nil {
+		t.Fatalf("newLoopbackStack() = %v", e)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
+	addr := tcpip.FullAddress{NICID, ip, 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, ip)
+
+	c1, err := DialUDP(s, &addr, nil, ipv4.ProtocolNumber)
+	if err != nil {
+		t.Fatal("DialUDP(bind port 4):", err)
+	}
+	c2, err := DialUDP(s, nil, &addr, ipv4.ProtocolNumber)
+	if err != nil {
+		t.Fatal("DialUDP(bind port 5):", err)
+	}
+
+	c1.SetDeadline(time.Now().Add(time.Second))
+	c2.SetDeadline(time.Now().Add(time.Second))
+
+	sent := "abc123"
+	if n, err := c2.Write([]byte(sent)); err != nil || n != len(sent) {
+		t.Errorf("got c2.Write(%q) = %d, %v, want = %d, %v", sent, n, err, len(sent), nil)
+	}
+	recv := make([]byte, len(sent))
+	n, err := c1.Read(recv)
+	if err != nil || n != len(recv) {
+		t.Errorf("got c1.Read() = %d, %v, want = %d, %v", n, err, len(recv), nil)
+	}
+
+	if recv := string(recv); recv != sent {
+		t.Errorf("got recv = %q, want = %q", recv, sent)
+	}
+
+	if err := c1.Close(); err != nil {
+		t.Error("c1.Close():", err)
+	}
+	if err := c2.Close(); err != nil {
+		t.Error("c2.Close():", err)
+	}
+}
+
+func makePipe() (c1, c2 net.Conn, stop func(), err error) {
+	s, e := newLoopbackStack()
+	if e != nil {
+		return nil, nil, nil, fmt.Errorf("newLoopbackStack() = %v", e)
+	}
+
+	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
+	addr := tcpip.FullAddress{NICID, ip, 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, ip)
+
+	l, err := ListenTCP(s, addr, ipv4.ProtocolNumber)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("NewListener: %v", err)
+	}
+
+	c1, err = DialTCP(s, addr, ipv4.ProtocolNumber)
+	if err != nil {
+		l.Close()
+		return nil, nil, nil, fmt.Errorf("DialTCP: %v", err)
+	}
+
+	c2, err = l.Accept()
+	if err != nil {
+		l.Close()
+		c1.Close()
+		return nil, nil, nil, fmt.Errorf("l.Accept: %v", err)
+	}
+
+	stop = func() {
+		c1.Close()
+		c2.Close()
+		s.Close()
+		s.Wait()
+	}
+
+	if err := l.Close(); err != nil {
+		stop()
+		return nil, nil, nil, fmt.Errorf("l.Close(): %v", err)
+	}
+
+	return c1, c2, stop, nil
+}
+
+func TestTCPConnTransfer(t *testing.T) {
+	c1, c2, _, err := makePipe()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		if err := c1.Close(); err != nil {
+			t.Error("c1.Close():", err)
+		}
+		if err := c2.Close(); err != nil {
+			t.Error("c2.Close():", err)
+		}
+	}()
+
+	c1.SetDeadline(time.Now().Add(time.Second))
+	c2.SetDeadline(time.Now().Add(time.Second))
+
+	const sent = "abc123"
+
+	tests := []struct {
+		name string
+		c1   net.Conn
+		c2   net.Conn
+	}{
+		{"connected to accepted", c1, c2},
+		{"accepted to connected", c2, c1},
+	}
+
+	for _, test := range tests {
+		if n, err := test.c1.Write([]byte(sent)); err != nil || n != len(sent) {
+			t.Errorf("%s: got test.c1.Write(%q) = %d, %v, want = %d, %v", test.name, sent, n, err, len(sent), nil)
+			continue
+		}
+
+		recv := make([]byte, len(sent))
+		n, err := test.c2.Read(recv)
+		if err != nil || n != len(recv) {
+			t.Errorf("%s: got test.c2.Read() = %d, %v, want = %d, %v", test.name, n, err, len(recv), nil)
+			continue
+		}
+
+		if recv := string(recv); recv != sent {
+			t.Errorf("%s: got recv = %q, want = %q", test.name, recv, sent)
+		}
+	}
+}
+
+func TestTCPDialError(t *testing.T) {
+	s, e := newLoopbackStack()
+	if e != nil {
+		t.Fatalf("newLoopbackStack() = %v", e)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	ip := tcpip.Address(net.IPv4(169, 254, 10, 1).To4())
+	addr := tcpip.FullAddress{NICID, ip, 11211}
+
+	_, err := DialTCP(s, addr, ipv4.ProtocolNumber)
+	got, ok := err.(*net.OpError)
+	want := tcpip.ErrNoRoute
+	if !ok || got.Err.Error() != want.String() {
+		t.Errorf("Got DialTCP() = %v, want = %v", err, tcpip.ErrNoRoute)
+	}
+}
+
+func TestDialContextTCPCanceled(t *testing.T) {
+	s, err := newLoopbackStack()
+	if err != nil {
+		t.Fatalf("newLoopbackStack() = %v", err)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
+
+	ctx := context.Background()
+	ctx, cancel := context.WithCancel(ctx)
+	cancel()
+
+	if _, err := DialContextTCP(ctx, s, addr, ipv4.ProtocolNumber); err != context.Canceled {
+		t.Errorf("got DialContextTCP(...) = %v, want = %v", err, context.Canceled)
+	}
+}
+
+func TestDialContextTCPTimeout(t *testing.T) {
+	s, err := newLoopbackStack()
+	if err != nil {
+		t.Fatalf("newLoopbackStack() = %v", err)
+	}
+	defer func() {
+		s.Close()
+		s.Wait()
+	}()
+
+	addr := tcpip.FullAddress{NICID, tcpip.Address(net.IPv4(169, 254, 10, 1).To4()), 11211}
+	s.AddAddress(NICID, ipv4.ProtocolNumber, addr.Addr)
+
+	fwd := tcp.NewForwarder(s, 30000, 10, func(r *tcp.ForwarderRequest) {
+		time.Sleep(time.Second)
+		r.Complete(true)
+	})
+	s.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
+
+	ctx := context.Background()
+	ctx, cancel := context.WithDeadline(ctx, time.Now().Add(100*time.Millisecond))
+	defer cancel()
+
+	if _, err := DialContextTCP(ctx, s, addr, ipv4.ProtocolNumber); err != context.DeadlineExceeded {
+		t.Errorf("got DialContextTCP(...) = %v, want = %v", err, context.DeadlineExceeded)
+	}
+}
+
+func TestNetTest(t *testing.T) {
+	nettest.TestConn(t, makePipe)
+}
diff --git a/pkg/tcpip/buffer/BUILD b/pkg/tcpip/buffer/BUILD
new file mode 100644
index 000000000..563bc78ea
--- /dev/null
+++ b/pkg/tcpip/buffer/BUILD
@@ -0,0 +1,19 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "buffer",
+    srcs = [
+        "prependable.go",
+        "view.go",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+go_test(
+    name = "buffer_test",
+    size = "small",
+    srcs = ["view_test.go"],
+    library = ":buffer",
+)
diff --git a/pkg/tcpip/buffer/prependable.go b/pkg/tcpip/buffer/prependable.go
new file mode 100644
index 000000000..ba21f4eca
--- /dev/null
+++ b/pkg/tcpip/buffer/prependable.go
@@ -0,0 +1,85 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package buffer
+
+// Prependable is a buffer that grows backwards, that is, more data can be
+// prepended to it. It is useful when building networking packets, where each
+// protocol adds its own headers to the front of the higher-level protocol
+// header and payload; for example, TCP would prepend its header to the payload,
+// then IP would prepend its own, then ethernet.
+type Prependable struct {
+	// buf is the buffer backing the prependable buffer.
+	buf View
+
+	// usedIdx is the index where the used part of the buffer begins.
+	usedIdx int
+}
+
+// NewPrependable allocates a new prependable buffer with the given size.
+func NewPrependable(size int) Prependable {
+	return Prependable{buf: NewView(size), usedIdx: size}
+}
+
+// NewPrependableFromView creates an entirely-used Prependable from a View.
+//
+// NewPrependableFromView takes ownership of v. Note that since the entire
+// prependable is used, further attempts to call Prepend will note that size >
+// p.usedIdx and return nil.
+func NewPrependableFromView(v View) Prependable {
+	return Prependable{buf: v, usedIdx: 0}
+}
+
+// NewEmptyPrependableFromView creates a new prependable buffer from a View.
+func NewEmptyPrependableFromView(v View) Prependable {
+	return Prependable{buf: v, usedIdx: len(v)}
+}
+
+// View returns a View of the backing buffer that contains all prepended
+// data so far.
+func (p Prependable) View() View {
+	return p.buf[p.usedIdx:]
+}
+
+// UsedLength returns the number of bytes used so far.
+func (p Prependable) UsedLength() int {
+	return len(p.buf) - p.usedIdx
+}
+
+// AvailableLength returns the number of bytes still available for prepending.
+func (p Prependable) AvailableLength() int {
+	return p.usedIdx
+}
+
+// TrimBack removes size bytes from the end.
+func (p *Prependable) TrimBack(size int) {
+	p.buf = p.buf[:len(p.buf)-size]
+}
+
+// Prepend reserves the requested space in front of the buffer, returning a
+// slice that represents the reserved space.
+func (p *Prependable) Prepend(size int) []byte {
+	if size > p.usedIdx {
+		return nil
+	}
+
+	p.usedIdx -= size
+	return p.View()[:size:size]
+}
+
+// DeepCopy copies p and the bytes backing it.
+func (p Prependable) DeepCopy() Prependable {
+	p.buf = append(View(nil), p.buf...)
+	return p
+}
diff --git a/pkg/tcpip/buffer/view.go b/pkg/tcpip/buffer/view.go
new file mode 100644
index 000000000..9a3c5d6c3
--- /dev/null
+++ b/pkg/tcpip/buffer/view.go
@@ -0,0 +1,256 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package buffer provides the implementation of a buffer view.
+package buffer
+
+import (
+	"bytes"
+	"io"
+)
+
+// View is a slice of a buffer, with convenience methods.
+type View []byte
+
+// NewView allocates a new buffer and returns an initialized view that covers
+// the whole buffer.
+func NewView(size int) View {
+	return make(View, size)
+}
+
+// NewViewFromBytes allocates a new buffer and copies in the given bytes.
+func NewViewFromBytes(b []byte) View {
+	return append(View(nil), b...)
+}
+
+// TrimFront removes the first "count" bytes from the visible section of the
+// buffer.
+func (v *View) TrimFront(count int) {
+	*v = (*v)[count:]
+}
+
+// CapLength irreversibly reduces the length of the visible section of the
+// buffer to the value specified.
+func (v *View) CapLength(length int) {
+	// We also set the slice cap because if we don't, one would be able to
+	// expand the view back to include the region just excluded. We want to
+	// prevent that to avoid potential data leak if we have uninitialized
+	// data in excluded region.
+	*v = (*v)[:length:length]
+}
+
+// Reader returns a bytes.Reader for v.
+func (v *View) Reader() bytes.Reader {
+	var r bytes.Reader
+	r.Reset(*v)
+	return r
+}
+
+// ToVectorisedView returns a VectorisedView containing the receiver.
+func (v View) ToVectorisedView() VectorisedView {
+	if len(v) == 0 {
+		return VectorisedView{}
+	}
+	return NewVectorisedView(len(v), []View{v})
+}
+
+// VectorisedView is a vectorised version of View using non-contiguous memory.
+// It supports all the convenience methods supported by View.
+//
+// +stateify savable
+type VectorisedView struct {
+	views []View
+	size  int
+}
+
+// NewVectorisedView creates a new vectorised view from an already-allocated slice
+// of View and sets its size.
+func NewVectorisedView(size int, views []View) VectorisedView {
+	return VectorisedView{views: views, size: size}
+}
+
+// TrimFront removes the first "count" bytes of the vectorised view. A count
+// that exceeds the stored data removes every view; count <= 0 is a no-op.
+func (vv *VectorisedView) TrimFront(count int) {
+	for count > 0 && len(vv.views) > 0 {
+		if count < len(vv.views[0]) {
+			vv.size -= count
+			vv.views[0].TrimFront(count)
+			return
+		}
+		count -= len(vv.views[0])
+		vv.removeFirst()
+	}
+}
+
+// Read copies up to len(v) bytes from vv into v, consuming them; io.EOF means none were copied.
+func (vv *VectorisedView) Read(v View) (copied int, err error) {
+	count := len(v)
+	for count > 0 && len(vv.views) > 0 {
+		if count < len(vv.views[0]) {
+			vv.size -= count
+			copy(v[copied:], vv.views[0][:count])
+			vv.views[0].TrimFront(count)
+			copied += count
+			return copied, nil
+		}
+		count -= len(vv.views[0])
+		copy(v[copied:], vv.views[0])
+		copied += len(vv.views[0])
+		vv.removeFirst()
+	}
+	if copied == 0 {
+		return 0, io.EOF
+	}
+	return copied, nil
+}
+
+// ReadToVV moves up to count bytes from the front of vv to the end of dstVV
+// (without copying the underlying data) and returns the number of bytes moved.
+func (vv *VectorisedView) ReadToVV(dstVV *VectorisedView, count int) (copied int) {
+	for count > 0 && len(vv.views) > 0 {
+		if count < len(vv.views[0]) {
+			vv.size -= count
+			dstVV.AppendView(vv.views[0][:count])
+			vv.views[0].TrimFront(count)
+			copied += count
+			return
+		}
+		count -= len(vv.views[0])
+		dstVV.AppendView(vv.views[0])
+		copied += len(vv.views[0])
+		vv.removeFirst()
+	}
+	return copied
+}
+
+// CapLength irreversibly truncates vv to length bytes (negative means 0; length > Size() is a no-op).
+func (vv *VectorisedView) CapLength(length int) {
+	if length < 0 {
+		length = 0
+	}
+	if vv.size < length {
+		return
+	}
+	vv.size = length
+	for i := range vv.views {
+		v := &vv.views[i]
+		if len(*v) >= length {
+			if length == 0 {
+				vv.views = vv.views[:i]
+			} else {
+				v.CapLength(length)
+				vv.views = vv.views[:i+1]
+			}
+			return
+		}
+		length -= len(*v)
+	}
+}
+
+// Clone returns a clone of this VectorisedView.
+// If the buffer argument is large enough to contain all the Views of this VectorisedView,
+// the method will avoid allocations and use the buffer to store the Views of the clone.
+func (vv *VectorisedView) Clone(buffer []View) VectorisedView {
+	return VectorisedView{views: append(buffer[:0], vv.views...), size: vv.size}
+}
+
+// PullUp returns the first "count" bytes of the vectorised view. If those
+// bytes aren't already contiguous inside the vectorised view, PullUp will
+// reallocate as needed to make them contiguous. PullUp fails and returns false
+// when count > vv.Size().
+func (vv *VectorisedView) PullUp(count int) (View, bool) {
+	if len(vv.views) == 0 {
+		return nil, count == 0
+	}
+	if count <= len(vv.views[0]) {
+		return vv.views[0][:count], true
+	}
+	if count > vv.size {
+		return nil, false
+	}
+
+	newFirst := NewView(count)
+	i := 0
+	for offset := 0; offset < count; i++ {
+		copy(newFirst[offset:], vv.views[i])
+		if count-offset < len(vv.views[i]) {
+			vv.views[i].TrimFront(count - offset)
+			break
+		}
+		offset += len(vv.views[i])
+		vv.views[i] = nil
+	}
+	// We're guaranteed that i > 0, since count is too large for the first
+	// view.
+	vv.views[i-1] = newFirst
+	vv.views = vv.views[i-1:]
+	return newFirst, true
+}
+
+// Size returns the size in bytes of the entire content stored in the vectorised view.
+func (vv *VectorisedView) Size() int {
+	return vv.size
+}
+
+// ToView returns a single view containing the content of the vectorised view.
+//
+// If the vectorised view contains a single view, that view will be returned
+// directly.
+func (vv *VectorisedView) ToView() View {
+	if len(vv.views) == 1 {
+		return vv.views[0]
+	}
+	u := make([]byte, 0, vv.size)
+	for _, v := range vv.views {
+		u = append(u, v...)
+	}
+	return u
+}
+
+// Views returns the slice containing all the views.
+func (vv *VectorisedView) Views() []View {
+	return vv.views
+}
+
+// Append appends the views in a vectorised view to this vectorised view.
+func (vv *VectorisedView) Append(vv2 VectorisedView) {
+	vv.views = append(vv.views, vv2.views...)
+	vv.size += vv2.size
+}
+
+// AppendView appends the given view into this vectorised view.
+func (vv *VectorisedView) AppendView(v View) {
+	if len(v) == 0 {
+		return
+	}
+	vv.views = append(vv.views, v)
+	vv.size += len(v)
+}
+
+// Readers returns a bytes.Reader for each of vv's views.
+func (vv *VectorisedView) Readers() []bytes.Reader {
+	readers := make([]bytes.Reader, 0, len(vv.views))
+	for _, v := range vv.views {
+		readers = append(readers, v.Reader())
+	}
+	return readers
+}
+
+// removeFirst panics when len(vv.views) < 1.
+func (vv *VectorisedView) removeFirst() {
+	vv.size -= len(vv.views[0])
+	vv.views[0] = nil
+	vv.views = vv.views[1:]
+}
diff --git a/pkg/tcpip/buffer/view_test.go b/pkg/tcpip/buffer/view_test.go
new file mode 100644
index 000000000..726e54de9
--- /dev/null
+++ b/pkg/tcpip/buffer/view_test.go
@@ -0,0 +1,521 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package buffer contains tests for the VectorisedView type.
+package buffer
+
+import (
+	"bytes"
+	"reflect"
+	"testing"
+)
+
+// copy returns a deep-copy of the vectorised view.
+func (vv VectorisedView) copy() VectorisedView {
+	uu := VectorisedView{
+		views: make([]View, 0, len(vv.views)),
+		size:  vv.size,
+	}
+	for _, v := range vv.views {
+		uu.views = append(uu.views, append(View(nil), v...))
+	}
+	return uu
+}
+
+// vv is a helper that builds a VectorisedView of the given size from string pieces.
+func vv(size int, pieces ...string) VectorisedView {
+	views := make([]View, len(pieces))
+	for i, p := range pieces {
+		views[i] = []byte(p)
+	}
+
+	return NewVectorisedView(size, views)
+}
+
+var capLengthTestCases = []struct {
+	comment string
+	in      VectorisedView
+	length  int
+	want    VectorisedView
+}{
+	{
+		comment: "Simple case",
+		in:      vv(2, "12"),
+		length:  1,
+		want:    vv(1, "1"),
+	},
+	{
+		comment: "Case spanning across two Views",
+		in:      vv(4, "123", "4"),
+		length:  2,
+		want:    vv(2, "12"),
+	},
+	{
+		comment: "Corner case with negative length",
+		in:      vv(1, "1"),
+		length:  -1,
+		want:    vv(0),
+	},
+	{
+		comment: "Corner case with length = 0",
+		in:      vv(3, "12", "3"),
+		length:  0,
+		want:    vv(0),
+	},
+	{
+		comment: "Corner case with length = size",
+		in:      vv(1, "1"),
+		length:  1,
+		want:    vv(1, "1"),
+	},
+	{
+		comment: "Corner case with length > size",
+		in:      vv(1, "1"),
+		length:  2,
+		want:    vv(1, "1"),
+	},
+}
+
+func TestCapLength(t *testing.T) {
+	for _, c := range capLengthTestCases {
+		orig := c.in.copy()
+		c.in.CapLength(c.length)
+		if !reflect.DeepEqual(c.in, c.want) {
+			t.Errorf("Test \"%s\" failed when calling CapLength(%d) on %v. Got %v. Want %v",
+				c.comment, c.length, orig, c.in, c.want)
+		}
+	}
+}
+
+var trimFrontTestCases = []struct {
+	comment string
+	in      VectorisedView
+	count   int
+	want    VectorisedView
+}{
+	{
+		comment: "Simple case",
+		in:      vv(2, "12"),
+		count:   1,
+		want:    vv(1, "2"),
+	},
+	{
+		comment: "Case where we trim an entire View",
+		in:      vv(2, "1", "2"),
+		count:   1,
+		want:    vv(1, "2"),
+	},
+	{
+		comment: "Case spanning across two Views",
+		in:      vv(3, "1", "23"),
+		count:   2,
+		want:    vv(1, "3"),
+	},
+	{
+		comment: "Corner case with negative count",
+		in:      vv(1, "1"),
+		count:   -1,
+		want:    vv(1, "1"),
+	},
+	{
+		comment: " Corner case with count = 0",
+		in:      vv(1, "1"),
+		count:   0,
+		want:    vv(1, "1"),
+	},
+	{
+		comment: "Corner case with count = size",
+		in:      vv(1, "1"),
+		count:   1,
+		want:    vv(0),
+	},
+	{
+		comment: "Corner case with count > size",
+		in:      vv(1, "1"),
+		count:   2,
+		want:    vv(0),
+	},
+}
+
+func TestTrimFront(t *testing.T) {
+	for _, c := range trimFrontTestCases {
+		orig := c.in.copy()
+		c.in.TrimFront(c.count)
+		if !reflect.DeepEqual(c.in, c.want) {
+			t.Errorf("Test \"%s\" failed when calling TrimFront(%d) on %v. Got %v. Want %v",
+				c.comment, c.count, orig, c.in, c.want)
+		}
+	}
+}
+
+var toViewCases = []struct {
+	comment string
+	in      VectorisedView
+	want    View
+}{
+	{
+		comment: "Simple case",
+		in:      vv(2, "12"),
+		want:    []byte("12"),
+	},
+	{
+		comment: "Case with multiple views",
+		in:      vv(2, "1", "2"),
+		want:    []byte("12"),
+	},
+	{
+		comment: "Empty case",
+		in:      vv(0),
+		want:    []byte(""),
+	},
+}
+
+func TestToView(t *testing.T) {
+	for _, c := range toViewCases {
+		got := c.in.ToView()
+		if !reflect.DeepEqual(got, c.want) {
+			t.Errorf("Test \"%s\" failed when calling ToView() on %v. Got %v. Want %v",
+				c.comment, c.in, got, c.want)
+		}
+	}
+}
+
+var toCloneCases = []struct {
+	comment  string
+	inView   VectorisedView
+	inBuffer []View
+}{
+	{
+		comment:  "Simple case",
+		inView:   vv(1, "1"),
+		inBuffer: make([]View, 1),
+	},
+	{
+		comment:  "Case with multiple views",
+		inView:   vv(2, "1", "2"),
+		inBuffer: make([]View, 2),
+	},
+	{
+		comment:  "Case with buffer too small",
+		inView:   vv(2, "1", "2"),
+		inBuffer: make([]View, 1),
+	},
+	{
+		comment:  "Case with buffer larger than needed",
+		inView:   vv(1, "1"),
+		inBuffer: make([]View, 2),
+	},
+	{
+		comment:  "Case with nil buffer",
+		inView:   vv(1, "1"),
+		inBuffer: nil,
+	},
+}
+
+func TestToClone(t *testing.T) {
+	for _, c := range toCloneCases {
+		t.Run(c.comment, func(t *testing.T) {
+			got := c.inView.Clone(c.inBuffer)
+			if !reflect.DeepEqual(got, c.inView) {
+				t.Fatalf("got (%+v).Clone(%+v) = %+v, want = %+v",
+					c.inView, c.inBuffer, got, c.inView)
+			}
+		})
+	}
+}
+
+func TestVVReadToVV(t *testing.T) {
+	testCases := []struct {
+		comment     string
+		vv          VectorisedView
+		bytesToRead int
+		wantBytes   string
+		leftVV      VectorisedView
+	}{
+		{
+			comment:     "large VV, short read",
+			vv:          vv(30, "012345678901234567890123456789"),
+			bytesToRead: 10,
+			wantBytes:   "0123456789",
+			leftVV:      vv(20, "01234567890123456789"),
+		},
+		{
+			comment:     "largeVV, multiple views, short read",
+			vv:          vv(13, "123", "345", "567", "8910"),
+			bytesToRead: 6,
+			wantBytes:   "123345",
+			leftVV:      vv(7, "567", "8910"),
+		},
+		{
+			comment:     "smallVV (multiple views), large read",
+			vv:          vv(3, "1", "2", "3"),
+			bytesToRead: 10,
+			wantBytes:   "123",
+			leftVV:      vv(0, ""),
+		},
+		{
+			comment:     "smallVV (single view), large read",
+			vv:          vv(1, "1"),
+			bytesToRead: 10,
+			wantBytes:   "1",
+			leftVV:      vv(0, ""),
+		},
+		{
+			comment:     "emptyVV, large read",
+			vv:          vv(0, ""),
+			bytesToRead: 10,
+			wantBytes:   "",
+			leftVV:      vv(0, ""),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			var readTo VectorisedView
+			inSize := tc.vv.Size()
+			copied := tc.vv.ReadToVV(&readTo, tc.bytesToRead)
+			if got, want := copied, len(tc.wantBytes); got != want {
+				t.Errorf("incorrect number of bytes copied returned in ReadToVV got: %d, want: %d, tc: %+v", got, want, tc)
+			}
+			if got, want := string(readTo.ToView()), tc.wantBytes; got != want {
+				t.Errorf("unexpected content in readTo got: %s, want: %s", got, want)
+			}
+			if got, want := tc.vv.Size(), inSize-copied; got != want {
+				t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(tc.vv.ToView()), string(tc.leftVV.ToView()); got != want {
+				t.Errorf("unexpected data left in vv after read got: %+v, want: %+v", got, want)
+			}
+		})
+	}
+}
+
+func TestVVRead(t *testing.T) {
+	testCases := []struct {
+		comment     string
+		vv          VectorisedView
+		bytesToRead int
+		readBytes   string
+		leftBytes   string
+		wantError   bool
+	}{
+		{
+			comment:     "large VV, short read",
+			vv:          vv(30, "012345678901234567890123456789"),
+			bytesToRead: 10,
+			readBytes:   "0123456789",
+			leftBytes:   "01234567890123456789",
+		},
+		{
+			comment:     "largeVV, multiple buffers, short read",
+			vv:          vv(13, "123", "345", "567", "8910"),
+			bytesToRead: 6,
+			readBytes:   "123345",
+			leftBytes:   "5678910",
+		},
+		{
+			comment:     "smallVV (multiple views), large read",
+			vv:          vv(3, "1", "2", "3"),
+			bytesToRead: 10,
+			readBytes:   "123",
+			leftBytes:   "",
+		},
+		{
+			comment:     "smallVV (single view), large read",
+			vv:          vv(1, "1"),
+			bytesToRead: 10,
+			readBytes:   "1",
+			leftBytes:   "",
+		},
+		{
+			comment:     "emptyVV, large read",
+			vv:          vv(0, ""),
+			bytesToRead: 10,
+			readBytes:   "",
+			wantError:   true,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			readTo := NewView(tc.bytesToRead)
+			inSize := tc.vv.Size()
+			copied, err := tc.vv.Read(readTo)
+			if gotError := err != nil; gotError != tc.wantError {
+				t.Fatalf("tc.vv.Read(..) returned error %v, wantError = %t", err, tc.wantError)
+			}
+			readTo = readTo[:copied]
+			if got, want := copied, len(tc.readBytes); got != want {
+				t.Errorf("incorrect number of bytes copied returned in Read got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(readTo), tc.readBytes; got != want {
+				t.Errorf("unexpected data in readTo got: %s, want: %s", got, want)
+			}
+			if got, want := tc.vv.Size(), inSize-copied; got != want {
+				t.Errorf("test VV has incorrect size after reading got: %d, want: %d, tc.vv: %+v", got, want, tc.vv)
+			}
+			if got, want := string(tc.vv.ToView()), tc.leftBytes; got != want {
+				t.Errorf("vv has incorrect data after Read got: %s, want: %s", got, want)
+			}
+		})
+	}
+}
+
+var pullUpTestCases = []struct {
+	comment string
+	in      VectorisedView
+	count   int
+	want    []byte
+	result  VectorisedView
+	ok      bool
+}{
+	{
+		comment: "simple case",
+		in:      vv(2, "12"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "12"),
+		ok:      true,
+	},
+	{
+		comment: "entire View",
+		in:      vv(2, "1", "2"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(2, "1", "2"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across two Views",
+		in:      vv(3, "1", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+	{
+		comment: "spanning across all Views",
+		in:      vv(5, "1", "23", "45"),
+		count:   5,
+		want:    []byte("12345"),
+		result:  vv(5, "12345"),
+		ok:      true,
+	},
+	{
+		comment: "count = 0",
+		in:      vv(1, "1"),
+		count:   0,
+		want:    []byte{},
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count = size",
+		in:      vv(1, "1"),
+		count:   1,
+		want:    []byte("1"),
+		result:  vv(1, "1"),
+		ok:      true,
+	},
+	{
+		comment: "count too large",
+		in:      vv(3, "1", "23"),
+		count:   4,
+		want:    nil,
+		result:  vv(3, "1", "23"),
+		ok:      false,
+	},
+	{
+		comment: "empty vv",
+		in:      vv(0, ""),
+		count:   1,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      false,
+	},
+	{
+		comment: "empty vv, count = 0",
+		in:      vv(0, ""),
+		count:   0,
+		want:    nil,
+		result:  vv(0, ""),
+		ok:      true,
+	},
+	{
+		comment: "empty views",
+		in:      vv(3, "", "1", "", "23"),
+		count:   2,
+		want:    []byte("12"),
+		result:  vv(3, "12", "3"),
+		ok:      true,
+	},
+}
+
+func TestPullUp(t *testing.T) {
+	for _, c := range pullUpTestCases {
+		got, ok := c.in.PullUp(c.count)
+
+		// Is the return value right?
+		if ok != c.ok {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got an ok of %t. Want %t",
+				c.comment, c.count, c.in, ok, c.ok)
+		}
+		if !bytes.Equal(got, c.want) {
+			t.Errorf("Test %q failed when calling PullUp(%d) on %v. Got %v. Want %v",
+				c.comment, c.count, c.in, got, c.want)
+		}
+
+		// Is the underlying structure right?
+		if !reflect.DeepEqual(c.in, c.result) {
+			t.Errorf("Test %q failed when calling PullUp(%d). Got vv with structure %v. Wanted %v",
+				c.comment, c.count, c.in, c.result)
+		}
+	}
+}
+
+func TestToVectorisedView(t *testing.T) {
+	testCases := []struct {
+		in   View
+		want VectorisedView
+	}{
+		{nil, VectorisedView{}},
+		{View{}, VectorisedView{}},
+		{View{'a'}, VectorisedView{size: 1, views: []View{{'a'}}}},
+	}
+	for _, tc := range testCases {
+		if got, want := tc.in.ToVectorisedView(), tc.want; !reflect.DeepEqual(got, want) {
+			t.Errorf("(%v).ToVectorisedView failed got: %+v, want: %+v", tc.in, got, want)
+		}
+	}
+}
+
+func TestAppendView(t *testing.T) {
+	testCases := []struct {
+		vv   VectorisedView
+		in   View
+		want VectorisedView
+	}{
+		{VectorisedView{}, nil, VectorisedView{}},
+		{VectorisedView{}, View{}, VectorisedView{}},
+		{VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, nil, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}},
+		{VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}},
+		{VectorisedView{[]View{{'a', 'b', 'c', 'd'}}, 4}, View{'e'}, VectorisedView{[]View{{'a', 'b', 'c', 'd'}, {'e'}}, 5}},
+	}
+	for _, tc := range testCases {
+		tc.vv.AppendView(tc.in)
+		if got, want := tc.vv, tc.want; !reflect.DeepEqual(got, want) {
+			t.Errorf("AppendView(%v) failed got: %+v, want: %+v", tc.in, got, want)
+		}
+	}
+}
diff --git a/pkg/tcpip/checker/BUILD b/pkg/tcpip/checker/BUILD
new file mode 100644
index 000000000..ed434807f
--- /dev/null
+++ b/pkg/tcpip/checker/BUILD
@@ -0,0 +1,16 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "checker",
+    testonly = 1,
+    srcs = ["checker.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+    ],
+)
diff --git a/pkg/tcpip/checker/checker.go b/pkg/tcpip/checker/checker.go
new file mode 100644
index 000000000..ee264b726
--- /dev/null
+++ b/pkg/tcpip/checker/checker.go
@@ -0,0 +1,976 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package checker provides helper functions to check networking packets for
+// validity.
+package checker
+
+import (
+	"encoding/binary"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+// NetworkChecker checks one property of a packet, given its network headers.
+type NetworkChecker func(*testing.T, []header.Network)
+
+// TransportChecker checks one property of a packet's transport header.
+type TransportChecker func(*testing.T, header.Transport)
+
+// ControlMessagesChecker checks one property of ancillary data.
+type ControlMessagesChecker func(*testing.T, tcpip.ControlMessages)
+
+// IPv4 validates b as an IPv4 packet (header validity and header checksum)
+// and then runs each of the supplied network checkers against it. Checkers
+// are how callers assert on individual fields; for example, verifying the
+// source and destination addresses looks like:
+//
+// checker.IPv4(t, b, checker.SrcAddr(x), checker.DstAddr(y))
+func IPv4(t *testing.T, b []byte, checkers ...NetworkChecker) {
+	t.Helper()
+
+	pkt := header.IPv4(b)
+	if valid := pkt.IsValid(len(b)); !valid {
+		t.Error("Not a valid IPv4 packet")
+	}
+
+	// Both 0 and 0xffff are accepted as a passing checksum result.
+	if sum := pkt.CalculateChecksum(); sum != 0 && sum != 0xffff {
+		t.Errorf("Bad checksum: 0x%x, checksum in packet: 0x%x", sum, pkt.Checksum())
+	}
+
+	for _, check := range checkers {
+		check(t, []header.Network{pkt})
+	}
+	// Stop the test here if anything above reported a failure.
+	if t.Failed() {
+		t.FailNow()
+	}
+}
+
+// IPv6 validates b as an IPv6 packet and then runs each supplied network
+// checker against it. It is used in the same way as IPv4.
+func IPv6(t *testing.T, b []byte, checkers ...NetworkChecker) {
+	t.Helper()
+
+	pkt := header.IPv6(b)
+	if valid := pkt.IsValid(len(b)); !valid {
+		t.Error("Not a valid IPv6 packet")
+	}
+
+	for i := range checkers {
+		checkers[i](t, []header.Network{pkt})
+	}
+	if t.Failed() {
+		t.FailNow()
+	}
+}
+
+// SrcAddr returns a checker that compares h[0]'s source address with addr.
+func SrcAddr(addr tcpip.Address) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+		got := h[0].SourceAddress()
+		if got != addr {
+			t.Errorf("Bad source address, got %v, want %v", got, addr)
+		}
+	}
+}
+
+// DstAddr returns a checker that compares h[0]'s destination address with addr.
+func DstAddr(addr tcpip.Address) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+		got := h[0].DestinationAddress()
+		if got != addr {
+			t.Errorf("Bad destination address, got %v, want %v", got, addr)
+		}
+	}
+}
+
+// TTL returns a checker for the TTL (IPv4) or HopLimit (IPv6) of h[0].
+func TTL(ttl uint8) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		// Any other header type leaves got at its zero value.
+		var got uint8
+		if v4, ok := h[0].(header.IPv4); ok {
+			got = v4.TTL()
+		} else if v6, ok := h[0].(header.IPv6); ok {
+			got = v6.HopLimit()
+		}
+		if got != ttl {
+			t.Fatalf("Bad TTL, got %v, want %v", got, ttl)
+		}
+	}
+}
+
+// PayloadLen returns a checker for the byte length of h[0]'s payload.
+func PayloadLen(plen int) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+		got := len(h[0].Payload())
+		if got != plen {
+			t.Errorf("Bad payload length, got %v, want %v", got, plen)
+		}
+	}
+}
+
+// FragmentOffset creates a checker that checks the FragmentOffset field.
+func FragmentOffset(offset uint16) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		// We only do this of IPv4 for now.
+		switch ip := h[0].(type) {
+		case header.IPv4:
+			if v := ip.FragmentOffset(); v != offset {
+				t.Errorf("Bad fragment offset, got %v, want %v", v, offset)
+			}
+		}
+	}
+}
+
+// FragmentFlags creates a checker that checks the IPv4 fragment flags field.
+func FragmentFlags(flags uint8) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		// Only IPv4 is handled for now; other header types are not checked.
+		switch ip := h[0].(type) {
+		case header.IPv4:
+			if v := ip.Flags(); v != flags {
+				t.Errorf("Bad fragment flags, got %v, want %v", v, flags)
+			}
+		}
+	}
+}
+
+// ReceiveTClass returns a checker that requires ControlMessages to carry a
+// TCLASS value (HasTClass set) equal to want.
+func ReceiveTClass(want uint32) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		switch {
+		case !cm.HasTClass:
+			t.Fatalf("got cm.HasTClass = %t, want cm.TClass = %d", cm.HasTClass, want)
+		case cm.TClass != want:
+			t.Fatalf("got cm.TClass = %d, want %d", cm.TClass, want)
+		}
+	}
+}
+
+// ReceiveTOS returns a checker requiring ControlMessages to carry TOS == want.
+func ReceiveTOS(want uint8) ControlMessagesChecker {
+	return func(t *testing.T, cm tcpip.ControlMessages) {
+		t.Helper()
+		switch {
+		case !cm.HasTOS:
+			t.Fatalf("got cm.HasTOS = %t, want cm.TOS = %d", cm.HasTOS, want)
+		case cm.TOS != want:
+			t.Fatalf("got cm.TOS = %d, want %d", cm.TOS, want)
+		}
+	}
+}
+
+// TOS returns a checker for the two values returned by h[0].TOS().
+func TOS(tos uint8, label uint32) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+		gotTOS, gotLabel := h[0].TOS()
+		if gotTOS != tos || gotLabel != label {
+			t.Errorf("Bad TOS, got (%v, %v), want (%v,%v)", gotTOS, gotLabel, tos, label)
+		}
+	}
+}
+
+// Raw returns a checker that compares payload bytes against want.
+// The payload inspected is always that of the last network header in the
+// chain. With IPv6 fragments, for example, this is the data the packet is
+// actually carrying, excluding the bytes added by IPv6 fragmentation.
+func Raw(want []byte) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		innermost := h[len(h)-1]
+		if got := innermost.Payload(); !reflect.DeepEqual(got, want) {
+			t.Errorf("Wrong payload, got %v, want %v", got, want)
+		}
+	}
+}
+
+// IPv6Fragment creates a checker that validates an IPv6 fragment.
+func IPv6Fragment(checkers ...NetworkChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		// h[0] must report the IPv6 fragment header as its transport protocol.
+		if p := h[0].TransportProtocol(); p != header.IPv6FragmentHeader {
+			t.Errorf("Bad protocol, got %v, want %v", p, header.IPv6FragmentHeader)
+		}
+		ipv6Frag := header.IPv6Fragment(h[0].Payload())
+		if !ipv6Frag.IsValid() {
+			t.Error("Not a valid IPv6 fragment")
+		}
+		// Nested checkers receive h[0] followed by the fragment header.
+		for _, f := range checkers {
+			f(t, []header.Network{h[0], ipv6Frag})
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// TCP returns a checker that requires the transport protocol to be TCP,
+// verifies the TCP checksum and then runs any extra transport checkers.
+func TCP(checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		outer, inner := h[0], h[len(h)-1]
+
+		proto := inner.TransportProtocol()
+		if proto != header.TCPProtocolNumber {
+			t.Errorf("Bad protocol, got %v, want %v", proto, header.TCPProtocolNumber)
+		}
+
+		// Checksum the addresses of the first header, the protocol number and
+		// the segment length, followed by the segment itself.
+		seg := header.TCP(inner.Payload())
+		segLen := uint16(len(seg))
+		sum := header.Checksum([]byte(outer.SourceAddress()), 0)
+		sum = header.Checksum([]byte(outer.DestinationAddress()), sum)
+		sum = header.Checksum([]byte{0, byte(proto)}, sum)
+		sum = header.Checksum([]byte{byte(segLen >> 8), byte(segLen)}, sum)
+		sum = header.Checksum(seg, sum)
+
+		if sum != 0 && sum != 0xffff {
+			t.Errorf("Bad checksum: 0x%x, checksum in segment: 0x%x", sum, seg.Checksum())
+		}
+
+		// Hand the segment to every transport checker, then stop on failure.
+		for _, check := range checkers {
+			check(t, seg)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// UDP returns a checker that requires the transport protocol to be UDP and
+// then runs any extra transport checkers on the UDP header.
+func UDP(checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		inner := h[len(h)-1]
+		proto := inner.TransportProtocol()
+		if proto != header.UDPProtocolNumber {
+			t.Errorf("Bad protocol, got %v, want %v", proto, header.UDPProtocolNumber)
+		}
+		// The last header's payload is interpreted as the UDP datagram.
+		dgram := header.UDP(inner.Payload())
+		for _, check := range checkers {
+			check(t, dgram)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// SrcPort returns a checker that compares the transport source port with port.
+func SrcPort(port uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		got := h.SourcePort()
+		if got != port {
+			t.Errorf("Bad source port, got %v, want %v", got, port)
+		}
+	}
+}
+
+// DstPort returns a checker that compares the destination port with port.
+func DstPort(port uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		got := h.DestinationPort()
+		if got != port {
+			t.Errorf("Bad destination port, got %v, want %v", got, port)
+		}
+	}
+}
+
+// NoChecksum returns a checker asserting whether a UDP header's checksum is
+// zero (noChecksum true) or non-zero (noChecksum false).
+func NoChecksum(noChecksum bool) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		// Transport headers that are not UDP are skipped without error.
+		if dgram, isUDP := h.(header.UDP); isUDP {
+			isZero := dgram.Checksum() == 0
+			if isZero != noChecksum {
+				t.Errorf("bad checksum state, got %t, want %t", isZero, noChecksum)
+			}
+		}
+	}
+}
+
+// SeqNum returns a checker that compares the TCP sequence number of the
+// transport header against seq.
+func SeqNum(seq uint32) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		// Transport headers that are not TCP are skipped without error.
+		if seg, isTCP := h.(header.TCP); isTCP {
+			got := seg.SequenceNumber()
+			if got != seq {
+				t.Errorf("Bad sequence number, got %v, want %v", got, seq)
+			}
+		}
+	}
+}
+
+// AckNum returns a checker that compares the TCP acknowledgement number of
+// the transport header against seq.
+func AckNum(seq uint32) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		// Transport headers that are not TCP are skipped without error.
+		if seg, isTCP := h.(header.TCP); isTCP {
+			got := seg.AckNumber()
+			if got != seq {
+				t.Errorf("Bad ack number, got %v, want %v", got, seq)
+			}
+		}
+	}
+}
+
+// Window returns a checker that compares the TCP window size of the
+// transport header against window.
+func Window(window uint16) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		// Transport headers that are not TCP are skipped without error.
+		if seg, isTCP := h.(header.TCP); isTCP {
+			got := seg.WindowSize()
+			if got != window {
+				t.Errorf("Bad window, got 0x%x, want 0x%x", got, window)
+			}
+		}
+	}
+}
+
+// TCPFlags returns a checker that requires the TCP flags of the transport
+// header to equal flags exactly.
+func TCPFlags(flags uint8) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		// Transport headers that are not TCP are skipped without error.
+		if seg, isTCP := h.(header.TCP); isTCP {
+			got := seg.Flags()
+			if got != flags {
+				t.Errorf("Bad flags, got 0x%x, want 0x%x", got, flags)
+			}
+		}
+	}
+}
+
+// TCPFlagsMatch returns a checker that requires the TCP flags to agree with
+// flags on every bit that is set in mask.
+func TCPFlagsMatch(flags, mask uint8) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		// Transport headers that are not TCP are skipped without error.
+		if seg, isTCP := h.(header.TCP); isTCP {
+			got := seg.Flags()
+			// XOR leaves a bit set wherever got and flags disagree.
+			if (got^flags)&mask != 0 {
+				t.Errorf("Bad masked flags, got 0x%x, want 0x%x, mask 0x%x", got, flags, mask)
+			}
+		}
+	}
+}
+
+// TCPSynOptions creates a checker that checks the presence of TCP options in
+// SYN segments.
+//
+// If wndscale is negative, the window scale option must not be present.
+func TCPSynOptions(wantOpts header.TCPSynOptions) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		tcp, ok := h.(header.TCP)
+		if !ok {
+			return
+		}
+		opts := tcp.Options()
+		limit := len(opts)
+		foundMSS := false
+		foundWS := false
+		foundTS := false
+		foundSACKPermitted := false
+		tsVal := uint32(0)
+		tsEcr := uint32(0)
+		for i := 0; i < limit; {
+			switch opts[i] {
+			case header.TCPOptionEOL:
+				i = limit
+			case header.TCPOptionNOP:
+				i++
+			case header.TCPOptionMSS:
+				v := uint16(opts[i+2])<<8 | uint16(opts[i+3])
+				if wantOpts.MSS != v {
+					t.Errorf("Bad MSS: got %v, want %v", v, wantOpts.MSS)
+				}
+				foundMSS = true
+				i += 4
+			case header.TCPOptionWS:
+				if wantOpts.WS < 0 {
+					t.Error("WS present when it shouldn't be")
+				}
+				v := int(opts[i+2])
+				if v != wantOpts.WS {
+					t.Errorf("Bad WS: got %v, want %v", v, wantOpts.WS)
+				}
+				foundWS = true
+				i += 3
+			case header.TCPOptionTS:
+				if i+9 >= limit {
+					t.Errorf("TS Option truncated , option is only: %d bytes, want 10", limit-i)
+				}
+				if opts[i+1] != 10 {
+					t.Errorf("Bad length %d for TS option, limit: %d", opts[i+1], limit)
+				}
+				tsVal = binary.BigEndian.Uint32(opts[i+2:])
+				tsEcr = uint32(0)
+				if tcp.Flags()&header.TCPFlagAck != 0 {
+					// If the syn is an SYN-ACK then read
+					// the tsEcr value as well.
+					tsEcr = binary.BigEndian.Uint32(opts[i+6:])
+				}
+				foundTS = true
+				i += 10
+			case header.TCPOptionSACKPermitted:
+				if i+1 >= limit {
+					t.Errorf("SACKPermitted option truncated, option is only : %d bytes, want 2", limit-i)
+				}
+				if opts[i+1] != 2 {
+					t.Errorf("Bad length %d for SACKPermitted option, limit: %d", opts[i+1], limit)
+				}
+				foundSACKPermitted = true
+				i += 2
+
+			default:
+				i += int(opts[i+1])
+			}
+		}
+
+		if !foundMSS {
+			t.Errorf("MSS option not found. Options: %x", opts)
+		}
+
+		if !foundWS && wantOpts.WS >= 0 {
+			t.Errorf("WS option not found. Options: %x", opts)
+		}
+		if wantOpts.TS && !foundTS {
+			t.Errorf("TS option not found. Options: %x", opts)
+		}
+		if foundTS && tsVal == 0 {
+			t.Error("TS option specified but the timestamp value is zero")
+		}
+		if foundTS && tsEcr == 0 && wantOpts.TSEcr != 0 {
+			t.Errorf("TS option specified but TSEcr is incorrect: got %d, want: %d", tsEcr, wantOpts.TSEcr)
+		}
+		if wantOpts.SACKPermitted && !foundSACKPermitted {
+			t.Errorf("SACKPermitted option not found. Options: %x", opts)
+		}
+	}
+}
+
+// TCPTimestampChecker creates a checker that validates that a TCP segment has a
+// TCP Timestamp option if wantTS is true, it also compares the wantTSVal and
+// wantTSEcr values with those in the TCP segment (if present).
+//
+// If wantTSVal or wantTSEcr is zero then the corresponding comparison is
+// skipped.
+func TCPTimestampChecker(wantTS bool, wantTSVal uint32, wantTSEcr uint32) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		tcp, ok := h.(header.TCP)
+		if !ok {
+			return
+		}
+		opts := []byte(tcp.Options())
+		limit := len(opts)
+		foundTS := false
+		tsVal := uint32(0)
+		tsEcr := uint32(0)
+		for i := 0; i < limit; {
+			switch opts[i] {
+			case header.TCPOptionEOL:
+				i = limit
+			case header.TCPOptionNOP:
+				i++
+			case header.TCPOptionTS:
+				if i+9 >= limit {
+					t.Errorf("TS option found, but option is truncated, option length: %d, want 10 bytes", limit-i)
+				}
+				if opts[i+1] != 10 {
+					t.Errorf("TS option found, but bad length specified: %d, want: 10", opts[i+1])
+				}
+				tsVal = binary.BigEndian.Uint32(opts[i+2:])
+				tsEcr = binary.BigEndian.Uint32(opts[i+6:])
+				foundTS = true
+				i += 10
+			default:
+				// We don't recognize this option, just skip over it.
+				if i+2 > limit {
+					return
+				}
+				l := int(opts[i+1])
+				if i < 2 || i+l > limit {
+					return
+				}
+				i += l
+			}
+		}
+
+		if wantTS != foundTS {
+			t.Errorf("TS Option mismatch: got TS= %v, want TS= %v", foundTS, wantTS)
+		}
+		if wantTS && wantTSVal != 0 && wantTSVal != tsVal {
+			t.Errorf("Timestamp value is incorrect: got: %d, want: %d", tsVal, wantTSVal)
+		}
+		if wantTS && wantTSEcr != 0 && tsEcr != wantTSEcr {
+			t.Errorf("Timestamp Echo Reply is incorrect: got: %d, want: %d", tsEcr, wantTSEcr)
+		}
+	}
+}
+
+// TCPNoSACKBlockChecker creates a checker that verifies that the segment's TCP
+// options contain no SACK blocks. It is TCPSACKBlockChecker called with nil.
+func TCPNoSACKBlockChecker() TransportChecker {
+	return TCPSACKBlockChecker(nil)
+}
+
+// TCPSACKBlockChecker creates a checker that verifies that the TCP options of
+// the segment contain exactly the specified SACK blocks, in order.
+func TCPSACKBlockChecker(sackBlocks []header.SACKBlock) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		tcp, ok := h.(header.TCP)
+		if !ok {
+			return
+		}
+		var gotSACKBlocks []header.SACKBlock
+
+		opts := []byte(tcp.Options())
+		limit := len(opts)
+		for i := 0; i < limit; {
+			switch opts[i] {
+			case header.TCPOptionEOL:
+				i = limit
+			case header.TCPOptionNOP:
+				i++
+			case header.TCPOptionSACK:
+				if i+2 > limit {
+					// Malformed SACK block: there is no length byte to read.
+					t.Errorf("malformed SACK option in options: %v", opts)
+					return
+				}
+				sackOptionLen := int(opts[i+1])
+				if i+sackOptionLen > limit || (sackOptionLen-2)%8 != 0 {
+					// Malformed SACK block: decoding it could overrun opts or never advance i.
+					t.Errorf("malformed SACK option length in options: %v", opts)
+					return
+				}
+				numBlocks := (sackOptionLen - 2) / 8
+				for j := 0; j < numBlocks; j++ {
+					start := binary.BigEndian.Uint32(opts[i+2+j*8:])
+					end := binary.BigEndian.Uint32(opts[i+2+j*8+4:])
+					gotSACKBlocks = append(gotSACKBlocks, header.SACKBlock{
+						Start: seqnum.Value(start),
+						End:   seqnum.Value(end),
+					})
+				}
+				i += sackOptionLen
+			default:
+				// We don't recognize this option, just skip over it. If it is malformed,
+				// end the loop via i; a bare break would only leave the switch and spin.
+				if i+2 > limit || opts[i+1] < 2 {
+					i = limit
+					continue
+				}
+				i += int(opts[i+1])
+			}
+		}
+
+		if !reflect.DeepEqual(gotSACKBlocks, sackBlocks) {
+			t.Errorf("SACKBlocks are not equal, got: %v, want: %v", gotSACKBlocks, sackBlocks)
+		}
+	}
+}
+
+// Payload returns a checker that compares the transport payload with want.
+func Payload(want []byte) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		got := h.Payload()
+		if !reflect.DeepEqual(got, want) {
+			t.Errorf("Wrong payload, got %v, want %v", got, want)
+		}
+	}
+}
+
+// ICMPv4 returns a checker that requires the transport protocol to be ICMPv4
+// and then runs any extra transport checkers on the ICMPv4 header.
+func ICMPv4(checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		inner := h[len(h)-1]
+		proto := inner.TransportProtocol()
+		if proto != header.ICMPv4ProtocolNumber {
+			t.Fatalf("Bad protocol, got %d, want %d", proto, header.ICMPv4ProtocolNumber)
+		}
+
+		msg := header.ICMPv4(inner.Payload())
+		for _, check := range checkers {
+			check(t, msg)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// ICMPv4Type returns a checker for the Type field; h must be a header.ICMPv4.
+func ICMPv4Type(want header.ICMPv4Type) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		hdr, isICMPv4 := h.(header.ICMPv4)
+		if !isICMPv4 {
+			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
+		}
+		got := hdr.Type()
+		if got != want {
+			t.Fatalf("unexpected icmp type got: %d, want: %d", got, want)
+		}
+	}
+}
+
+// ICMPv4Code returns a checker for the Code field; h must be a header.ICMPv4.
+func ICMPv4Code(want byte) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		hdr, isICMPv4 := h.(header.ICMPv4)
+		if !isICMPv4 {
+			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv4", h)
+		}
+		got := hdr.Code()
+		if got != want {
+			t.Fatalf("unexpected ICMP code got: %d, want: %d", got, want)
+		}
+	}
+}
+
+// ICMPv6 returns a checker that requires the transport protocol to be ICMPv6
+// and then runs any extra transport checkers on the ICMPv6 header.
+//
+// The checksum field is validated before any of the checkers are called.
+func ICMPv6(checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		inner := h[len(h)-1]
+		proto := inner.TransportProtocol()
+		if proto != header.ICMPv6ProtocolNumber {
+			t.Fatalf("Bad protocol, got %d, want %d", proto, header.ICMPv6ProtocolNumber)
+		}
+
+		msg := header.ICMPv6(inner.Payload())
+		want := header.ICMPv6Checksum(msg, inner.SourceAddress(), inner.DestinationAddress(), buffer.VectorisedView{})
+		if got := msg.Checksum(); got != want {
+			t.Fatalf("Bad ICMPv6 checksum; got %d, want %d", got, want)
+		}
+		for _, check := range checkers {
+			check(t, msg)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// ICMPv6Type returns a checker for the Type field; h must be a header.ICMPv6.
+func ICMPv6Type(want header.ICMPv6Type) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		hdr, isICMPv6 := h.(header.ICMPv6)
+		if !isICMPv6 {
+			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
+		}
+		got := hdr.Type()
+		if got != want {
+			t.Fatalf("unexpected icmp type got: %d, want: %d", got, want)
+		}
+	}
+}
+
+// ICMPv6Code returns a checker for the Code field; h must be a header.ICMPv6.
+func ICMPv6Code(want byte) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		hdr, isICMPv6 := h.(header.ICMPv6)
+		if !isICMPv6 {
+			t.Fatalf("unexpected transport header passed to checker got: %+v, want: header.ICMPv6", h)
+		}
+		got := hdr.Code()
+		if got != want {
+			t.Fatalf("unexpected ICMP code got: %d, want: %d", got, want)
+		}
+	}
+}
+
+// NDP returns a checker that requires the packet to contain a valid NDP
+// message of type msgType, followed by any additional checks in checkers.
+//
+// Before checkers run, the packet has passed the ICMPv6 checker (with type
+// msgType and code 0) and its NDP payload is known to be at least minSize
+// bytes long. Validating the values inside the message is left to the
+// checkers.
+func NDP(msgType header.ICMPv6Type, minSize int, checkers ...TransportChecker) NetworkChecker {
+	return func(t *testing.T, h []header.Network) {
+		t.Helper()
+
+		// Check normal ICMPv6 first.
+		base := ICMPv6(ICMPv6Type(msgType), ICMPv6Code(0))
+		base(t, h)
+
+		// The last header's payload is interpreted as the ICMPv6 message.
+		inner := h[len(h)-1]
+		msg := header.ICMPv6(inner.Payload())
+		if size := len(msg.NDPPayload()); size < minSize {
+			t.Fatalf("ICMPv6 NDP (type = %d) payload size of %d is less than the minimum size of %d", msgType, size, minSize)
+		}
+
+		// Run the message-specific checkers, then stop on any failure.
+		for _, check := range checkers {
+			check(t, msg)
+		}
+		if t.Failed() {
+			t.FailNow()
+		}
+	}
+}
+
+// NDPNS creates a checker that checks that the packet contains a valid NDP
+// Neighbor Solicitation message (as per the raw wire format), with potentially
+// additional checks specified by checkers. It delegates to NDP.
+//
+// Each checker may assume it is passed a valid ICMPv6 header whose NDP payload
+// is at least header.NDPNSMinimumSize bytes long. The values within the
+// message are up to checkers to validate.
+func NDPNS(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6NeighborSolicit, header.NDPNSMinimumSize, checkers...)
+}
+
+// NDPNSTargetAddress returns a checker for the Target Address field of a
+// header.NDPNeighborSolicit.
+//
+// h must be a header.ICMPv6 (the type assertion is unchecked) whose NDP
+// payload is a valid NDPNS message as far as the size is concerned.
+func NDPNSTargetAddress(want tcpip.Address) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		msg := h.(header.ICMPv6)
+		ns := header.NDPNeighborSolicit(msg.NDPPayload())
+		got := ns.TargetAddress()
+		if got != want {
+			t.Errorf("got %T.TargetAddress() = %s, want = %s", ns, got, want)
+		}
+	}
+}
+
+// NDPNA creates a checker that checks that the packet contains a valid NDP
+// Neighbor Advertisement message (as per the raw wire format), with potentially
+// additional checks specified by checkers. It delegates to NDP.
+//
+// Each checker may assume it is passed a valid ICMPv6 header whose NDP payload
+// is at least header.NDPNAMinimumSize bytes long. The values within the
+// message are up to checkers to validate.
+func NDPNA(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6NeighborAdvert, header.NDPNAMinimumSize, checkers...)
+}
+
+// NDPNATargetAddress returns a checker for the Target Address field of a
+// header.NDPNeighborAdvert.
+//
+// h must be a header.ICMPv6 (the type assertion is unchecked) whose NDP
+// payload is a valid NDPNA message as far as the size is concerned.
+func NDPNATargetAddress(want tcpip.Address) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		msg := h.(header.ICMPv6)
+		na := header.NDPNeighborAdvert(msg.NDPPayload())
+		got := na.TargetAddress()
+		if got != want {
+			t.Errorf("got %T.TargetAddress() = %s, want = %s", na, got, want)
+		}
+	}
+}
+
+// NDPNASolicitedFlag returns a checker for the Solicited flag of a
+// header.NDPNeighborAdvert.
+//
+// h must be a header.ICMPv6 (the type assertion is unchecked) whose NDP
+// payload is a valid NDPNA message as far as the size is concerned.
+func NDPNASolicitedFlag(want bool) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		msg := h.(header.ICMPv6)
+		na := header.NDPNeighborAdvert(msg.NDPPayload())
+		got := na.SolicitedFlag()
+		if got != want {
+			t.Errorf("got %T.SolicitedFlag = %t, want = %t", na, got, want)
+		}
+	}
+}
+
+// ndpOptions checks that optsBuf holds exactly the options in opts, in order.
+func ndpOptions(t *testing.T, optsBuf header.NDPOptions, opts []header.NDPOption) {
+	t.Helper()
+
+	it, err := optsBuf.Iter(true)
+	if err != nil {
+		t.Errorf("optsBuf.Iter(true): %s", err)
+		return
+	}
+	// i indexes the next expected option in opts.
+	i := 0
+	for {
+		opt, done, err := it.Next()
+		if err != nil {
+			// This should never happen as Iter(true) above did not return an error.
+			t.Fatalf("unexpected error when iterating over NDP options: %s", err)
+		}
+		if done {
+			break
+		}
+		// Every option found beyond the expected ones is reported individually.
+		if i >= len(opts) {
+			t.Errorf("got unexpected option: %s", opt)
+			continue
+		}
+
+		switch wantOpt := opts[i].(type) {
+		case header.NDPSourceLinkLayerAddressOption:
+			gotOpt, ok := opt.(header.NDPSourceLinkLayerAddressOption)
+			if !ok {
+				t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+			}
+		case header.NDPTargetLinkLayerAddressOption:
+			gotOpt, ok := opt.(header.NDPTargetLinkLayerAddressOption)
+			if !ok {
+				t.Errorf("got type = %T at index = %d; want = %T", opt, i, wantOpt)
+			} else if got, want := gotOpt.EthernetAddress(), wantOpt.EthernetAddress(); got != want {
+				t.Errorf("got EthernetAddress() = %s at index %d, want = %s", got, i, want)
+			}
+		default:
+			t.Fatalf("checker not implemented for expected NDP option: %T", wantOpt)
+		}
+
+		i++
+	}
+	// Expected options that the iterator never reached are missing.
+	if missing := opts[i:]; len(missing) > 0 {
+		t.Errorf("missing options: %s", missing)
+	}
+}
+
+// NDPNAOptions creates a checker that checks that the packet contains the
+// provided NDP options within an NDP Neighbor Advertisement message.
+//
+// The returned TransportChecker assumes that a valid ICMPv6 is passed to it
+// containing a valid NDPNA message as far as the size is concerned.
+func NDPNAOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+
+		icmp := h.(header.ICMPv6) // Unchecked assertion: panics if h is not a header.ICMPv6.
+		na := header.NDPNeighborAdvert(icmp.NDPPayload())
+		ndpOptions(t, na.Options(), opts)
+	}
+}
+
+// NDPNSOptions returns a checker that requires the NDP Neighbor Solicitation
+// message in the packet to carry exactly the provided NDP options.
+//
+// h must be a header.ICMPv6 (the type assertion is unchecked) whose NDP
+// payload is a valid NDPNS message as far as the size is concerned.
+func NDPNSOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		msg := h.(header.ICMPv6)
+		body := header.NDPNeighborSolicit(msg.NDPPayload())
+		gotOpts := body.Options()
+		ndpOptions(t, gotOpts, opts)
+	}
+}
+
+// NDPRS creates a checker that checks that the packet contains a valid NDP
+// Router Solicitation message (as per the raw wire format). It delegates to NDP.
+//
+// Each checker may assume it is passed a valid ICMPv6 header whose NDP payload
+// is at least header.NDPRSMinimumSize bytes long. The values within the
+// message are up to checkers to validate.
+func NDPRS(checkers ...TransportChecker) NetworkChecker {
+	return NDP(header.ICMPv6RouterSolicit, header.NDPRSMinimumSize, checkers...)
+}
+
+// NDPRSOptions returns a checker that requires the NDP Router Solicitation
+// message in the packet to carry exactly the provided NDP options.
+//
+// h must be a header.ICMPv6 (the type assertion is unchecked) whose NDP
+// payload is a valid NDPRS message as far as the size is concerned.
+func NDPRSOptions(opts []header.NDPOption) TransportChecker {
+	return func(t *testing.T, h header.Transport) {
+		t.Helper()
+		msg := h.(header.ICMPv6)
+		body := header.NDPRouterSolicit(msg.NDPPayload())
+		gotOpts := body.Options()
+		ndpOptions(t, gotOpts, opts)
+	}
+}
diff --git a/pkg/tcpip/hash/jenkins/BUILD b/pkg/tcpip/hash/jenkins/BUILD
new file mode 100644
index 000000000..ff2719291
--- /dev/null
+++ b/pkg/tcpip/hash/jenkins/BUILD
@@ -0,0 +1,18 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "jenkins",
+    srcs = ["jenkins.go"],
+    visibility = ["//visibility:public"],
+)
+
+go_test(
+    name = "jenkins_test",
+    size = "small",
+    srcs = [
+        "jenkins_test.go",
+    ],
+    library = ":jenkins",
+)
diff --git a/pkg/tcpip/hash/jenkins/jenkins.go b/pkg/tcpip/hash/jenkins/jenkins.go
new file mode 100644
index 000000000..52c22230e
--- /dev/null
+++ b/pkg/tcpip/hash/jenkins/jenkins.go
@@ -0,0 +1,80 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package jenkins implements Jenkins's one_at_a_time, a non-cryptographic hash
+// function created by Bob Jenkins.
+//
+// See https://en.wikipedia.org/wiki/Jenkins_hash_function#cite_note-dobbsx-1
+//
+package jenkins
+
+import (
+	"hash"
+)
+
+// Sum32 is the running state of Jenkins's one_at_a_time hash.
+//
+// Using a Sum32 value directly, rather than going through New32 below,
+// avoids allocations.
+type Sum32 uint32
+
+// New32 returns a fresh 32-bit Jenkins's one_at_a_time hash as a hash.Hash32.
+//
+// Its Sum method emits the value in big-endian byte order.
+func New32() hash.Hash32 {
+	state := new(Sum32)
+	return state
+}
+
+// Reset returns the hash to its initial (zero) state.
+func (s *Sum32) Reset() { *s = Sum32(0) }
+
+// Sum32 returns the finalized hash value without modifying the running state.
+func (s *Sum32) Sum32() uint32 {
+	v := uint32(*s)
+
+	v += v << 3
+	v ^= v >> 11
+	v += v << 15
+
+	return v
+}
+
+// Write folds data into the running hash, one byte at a time.
+//
+// It always consumes all of data and never returns an error.
+func (s *Sum32) Write(data []byte) (int, error) {
+	acc := uint32(*s)
+	for i := 0; i < len(data); i++ {
+		acc += uint32(data[i])
+		acc += acc << 10
+		acc ^= acc >> 6
+	}
+	*s = Sum32(acc)
+	return len(data), nil
+}
+
+// Size returns the length, in bytes, of the hash that Sum appends.
+func (s *Sum32) Size() int { return 4 }
+
+// BlockSize returns the hash's underlying block size, which is one byte.
+func (s *Sum32) BlockSize() int { return 1 }
+
+// Sum appends the current hash, big-endian, to in and returns the result.
+//
+// The running hash state is left untouched.
+func (s *Sum32) Sum(in []byte) []byte {
+	sum := s.Sum32()
+	return append(in, byte(sum>>24), byte(sum>>16), byte(sum>>8), byte(sum))
+}
diff --git a/pkg/tcpip/hash/jenkins/jenkins_test.go b/pkg/tcpip/hash/jenkins/jenkins_test.go
new file mode 100644
index 000000000..4c78b5808
--- /dev/null
+++ b/pkg/tcpip/hash/jenkins/jenkins_test.go
@@ -0,0 +1,176 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+package jenkins
+
+import (
+	"bytes"
+	"encoding/binary"
+	"hash"
+	"hash/fnv"
+	"math"
+	"testing"
+)
+
+// TestGolden32 checks the digests New32 produces for a table of short inputs.
+func TestGolden32(t *testing.T) {
+	var golden32 = []struct {
+		out []byte
+		in  string
+	}{
+		{[]byte{0x00, 0x00, 0x00, 0x00}, ""},
+		{[]byte{0xca, 0x2e, 0x94, 0x42}, "a"},
+		{[]byte{0x45, 0xe6, 0x1e, 0x58}, "ab"},
+		{[]byte{0xed, 0x13, 0x1f, 0x5b}, "abc"},
+	}
+
+	h := New32() // not named hash: that would shadow the imported package
+	for _, g := range golden32 {
+		h.Reset()
+		n, err := h.Write([]byte(g.in)) // err, not error: keep the builtin visible
+		if err != nil {
+			t.Fatalf("write error: %s", err)
+		}
+		if n != len(g.in) {
+			t.Fatalf("wrote only %d out of %d bytes", n, len(g.in))
+		}
+		if actual := h.Sum(nil); !bytes.Equal(g.out, actual) {
+			t.Errorf("hash(%q) = 0x%x want 0x%x", g.in, actual, g.out)
+		}
+	}
+}
+
+func TestIntegrity32(t *testing.T) {
+	data := []byte{'1', '2', 3, 4, 5}
+
+	h := New32()
+	h.Write(data)
+	sum := h.Sum(nil)
+
+	if size := h.Size(); size != len(sum) {
+		t.Fatalf("Size()=%d but len(Sum())=%d", size, len(sum))
+	}
+
+	if a := h.Sum(nil); !bytes.Equal(sum, a) {
+		t.Fatalf("first Sum()=0x%x, second Sum()=0x%x", sum, a)
+	}
+
+	h.Reset()
+	h.Write(data)
+	if a := h.Sum(nil); !bytes.Equal(sum, a) {
+		t.Fatalf("Sum()=0x%x, but after Reset() Sum()=0x%x", sum, a)
+	}
+
+	h.Reset()
+	h.Write(data[:2])
+	h.Write(data[2:])
+	if a := h.Sum(nil); !bytes.Equal(sum, a) {
+		t.Fatalf("Sum()=0x%x, but with partial writes, Sum()=0x%x", sum, a)
+	}
+
+	sum32 := h.(hash.Hash32).Sum32()
+	if sum32 != binary.BigEndian.Uint32(sum) {
+		t.Fatalf("Sum()=0x%x, but Sum32()=0x%x", sum, sum32)
+	}
+}
+
+func BenchmarkJenkins32KB(b *testing.B) {
+	h := New32()
+
+	b.SetBytes(1024)
+	data := make([]byte, 1024)
+	for i := range data {
+		data[i] = byte(i)
+	}
+	in := make([]byte, 0, h.Size())
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		h.Reset()
+		h.Write(data)
+		h.Sum(in)
+	}
+}
+
+// BenchmarkFnv32 hashes 8-byte payloads with fnv and checks bucket balance.
+func BenchmarkFnv32(b *testing.B) {
+	arr := make([]int64, 1000)
+	for i := 0; i < b.N; i++ {
+		var payload [8]byte
+		binary.BigEndian.PutUint32(payload[:4], uint32(i))
+		binary.BigEndian.PutUint32(payload[4:], uint32(i))
+		h := fnv.New32()
+		h.Write(payload[:])
+		idx := int(h.Sum32() % uint32(len(arr))) // reduce unsigned: int may be 32-bit
+		arr[idx]++
+	}
+	b.StopTimer()
+	c := 0
+	if b.N > 1000000 {
+		for i := 0; i < len(arr)-1; i++ {
+			if math.Abs(float64(arr[i]-arr[i+1]))/float64(arr[i]) > float64(0.1) {
+				if c == 0 {
+					b.Logf("i %d val[i] %d val[i+1] %d b.N %d\n", i, arr[i], arr[i+1], b.N)
+				}
+				c++
+			}
+		}
+		if c > 0 {
+			b.Logf("Unbalanced buckets: %d", c)
+		}
+	}
+}
+
+func BenchmarkSum32(b *testing.B) {
+	arr := make([]int64, 1000)
+	for i := 0; i < b.N; i++ {
+		var payload [8]byte
+		binary.BigEndian.PutUint32(payload[:4], uint32(i))
+		binary.BigEndian.PutUint32(payload[4:], uint32(i))
+		h := Sum32(0)
+		h.Write(payload[:])
+		idx := int(h.Sum32() % uint32(len(arr))) // reduce unsigned: int may be 32-bit
+		arr[idx]++
+	}
+	b.StopTimer()
+	if b.N > 1000000 {
+		for i := 0; i < len(arr)-1; i++ {
+			if math.Abs(float64(arr[i]-arr[i+1]))/float64(arr[i]) > float64(0.1) {
+				b.Logf("val[%3d]=%8d\tval[%3d]=%8d\tb.N=%d\n", i, arr[i], i+1, arr[i+1], b.N)
+				break
+			}
+		}
+	}
+}
+
+func BenchmarkNew32(b *testing.B) {
+	arr := make([]int64, 1000)
+	for i := 0; i < b.N; i++ {
+		var payload [8]byte
+		binary.BigEndian.PutUint32(payload[:4], uint32(i))
+		binary.BigEndian.PutUint32(payload[4:], uint32(i))
+		h := New32()
+		h.Write(payload[:])
+		idx := int(h.Sum32() % uint32(len(arr))) // reduce unsigned: int may be 32-bit
+		arr[idx]++
+	}
+	b.StopTimer()
+	if b.N > 1000000 {
+		for i := 0; i < len(arr)-1; i++ {
+			if math.Abs(float64(arr[i]-arr[i+1]))/float64(arr[i]) > float64(0.1) {
+				b.Logf("val[%3d]=%8d\tval[%3d]=%8d\tb.N=%d\n", i, arr[i], i+1, arr[i+1], b.N)
+				break
+			}
+		}
+	}
+}
diff --git a/pkg/tcpip/header/BUILD b/pkg/tcpip/header/BUILD
new file mode 100644
index 000000000..0cde694dc
--- /dev/null
+++ b/pkg/tcpip/header/BUILD
@@ -0,0 +1,69 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "header",
+    srcs = [
+        "arp.go",
+        "checksum.go",
+        "eth.go",
+        "gue.go",
+        "icmpv4.go",
+        "icmpv6.go",
+        "interfaces.go",
+        "ipv4.go",
+        "ipv6.go",
+        "ipv6_extension_headers.go",
+        "ipv6_fragment.go",
+        "ndp_neighbor_advert.go",
+        "ndp_neighbor_solicit.go",
+        "ndp_options.go",
+        "ndp_router_advert.go",
+        "ndp_router_solicit.go",
+        "ndpoptionidentifier_string.go",
+        "tcp.go",
+        "udp.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/seqnum",
+        "@com_github_google_btree//:go_default_library",
+    ],
+)
+
+go_test(
+    name = "header_x_test",
+    size = "small",
+    srcs = [
+        "checksum_test.go",
+        "ipv6_test.go",
+        "ipversion_test.go",
+        "tcp_test.go",
+    ],
+    deps = [
+        ":header",
+        "//pkg/rand",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
+
+go_test(
+    name = "header_test",
+    size = "small",
+    srcs = [
+        "eth_test.go",
+        "ipv6_extension_headers_test.go",
+        "ndp_test.go",
+    ],
+    library = ":header",
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/tcpip/header/arp.go b/pkg/tcpip/header/arp.go
new file mode 100644
index 000000000..718a4720a
--- /dev/null
+++ b/pkg/tcpip/header/arp.go
@@ -0,0 +1,100 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+const (
+	// ARPProtocolNumber is the ARP network protocol number.
+	ARPProtocolNumber tcpip.NetworkProtocolNumber = 0x0806
+
+	// ARPSize is the size of an IPv4-over-Ethernet ARP packet.
+	ARPSize = 2 + 2 + 1 + 1 + 2 + 2*6 + 2*4
+)
+
+// ARPOp is an ARP opcode.
+type ARPOp uint16
+
+// Typical ARP opcodes defined in RFC 826.
+const (
+	ARPRequest ARPOp = 1
+	ARPReply   ARPOp = 2
+)
+
+// ARP is an ARP packet stored in a byte array as described in RFC 826.
+type ARP []byte
+
+func (a ARP) hardwareAddressSpace() uint16 { return uint16(a[0])<<8 | uint16(a[1]) }
+func (a ARP) protocolAddressSpace() uint16 { return uint16(a[2])<<8 | uint16(a[3]) }
+func (a ARP) hardwareAddressSize() int     { return int(a[4]) }
+func (a ARP) protocolAddressSize() int     { return int(a[5]) }
+
+// Op is the ARP opcode.
+func (a ARP) Op() ARPOp { return ARPOp(a[6])<<8 | ARPOp(a[7]) }
+
+// SetOp sets the ARP opcode.
+func (a ARP) SetOp(op ARPOp) {
+	a[6] = uint8(op >> 8)
+	a[7] = uint8(op)
+}
+
+// SetIPv4OverEthernet configures the ARP packet for IPv4-over-Ethernet.
+func (a ARP) SetIPv4OverEthernet() {
+	a[0], a[1] = 0, 1       // htypeEthernet
+	a[2], a[3] = 0x08, 0x00 // IPv4ProtocolNumber
+	a[4] = 6                // macSize
+	a[5] = uint8(IPv4AddressSize)
+}
+
+// HardwareAddressSender is the link address of the sender.
+// It is a view on to the ARP packet so it can be used to set the value.
+func (a ARP) HardwareAddressSender() []byte {
+	const s = 8
+	return a[s : s+6]
+}
+
+// ProtocolAddressSender is the protocol address of the sender.
+// It is a view on to the ARP packet so it can be used to set the value.
+func (a ARP) ProtocolAddressSender() []byte {
+	const s = 8 + 6
+	return a[s : s+4]
+}
+
+// HardwareAddressTarget is the link address of the target.
+// It is a view on to the ARP packet so it can be used to set the value.
+func (a ARP) HardwareAddressTarget() []byte {
+	const s = 8 + 6 + 4
+	return a[s : s+6]
+}
+
+// ProtocolAddressTarget is the protocol address of the target.
+// It is a view on to the ARP packet so it can be used to set the value.
+func (a ARP) ProtocolAddressTarget() []byte {
+	const s = 8 + 6 + 4 + 6
+	return a[s : s+4]
+}
+
+// IsValid reports whether this is an ARP packet for IPv4 over Ethernet.
+func (a ARP) IsValid() bool {
+	if len(a) < ARPSize {
+		return false
+	}
+	const htypeEthernet = 1
+	const macSize = 6
+	return a.hardwareAddressSpace() == htypeEthernet &&
+		a.protocolAddressSpace() == uint16(IPv4ProtocolNumber) &&
+		a.hardwareAddressSize() == macSize &&
+		a.protocolAddressSize() == IPv4AddressSize
+}
diff --git a/pkg/tcpip/header/checksum.go b/pkg/tcpip/header/checksum.go
new file mode 100644
index 000000000..14a4b2b44
--- /dev/null
+++ b/pkg/tcpip/header/checksum.go
@@ -0,0 +1,249 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package header provides the implementation of the encoding and decoding of
+// network protocol headers.
+package header
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+func calculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) {
+	v := initial
+
+	if odd {
+		v += uint32(buf[0])
+		buf = buf[1:]
+	}
+
+	l := len(buf)
+	odd = l&1 != 0
+	if odd {
+		l--
+		v += uint32(buf[l]) << 8
+	}
+
+	for i := 0; i < l; i += 2 {
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+	}
+
+	return ChecksumCombine(uint16(v), uint16(v>>16)), odd
+}
+
+func unrolledCalculateChecksum(buf []byte, odd bool, initial uint32) (uint16, bool) {
+	v := initial
+
+	if odd {
+		v += uint32(buf[0])
+		buf = buf[1:]
+	}
+
+	l := len(buf)
+	odd = l&1 != 0
+	if odd {
+		l--
+		v += uint32(buf[l]) << 8
+	}
+	for (l - 64) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		buf = buf[64:]
+		l = l - 64
+	}
+	if (l - 32) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		i += 16
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		buf = buf[32:]
+		l = l - 32
+	}
+	if (l - 16) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		v += (uint32(buf[i+8]) << 8) + uint32(buf[i+9])
+		v += (uint32(buf[i+10]) << 8) + uint32(buf[i+11])
+		v += (uint32(buf[i+12]) << 8) + uint32(buf[i+13])
+		v += (uint32(buf[i+14]) << 8) + uint32(buf[i+15])
+		buf = buf[16:]
+		l = l - 16
+	}
+	if (l - 8) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		v += (uint32(buf[i+4]) << 8) + uint32(buf[i+5])
+		v += (uint32(buf[i+6]) << 8) + uint32(buf[i+7])
+		buf = buf[8:]
+		l = l - 8
+	}
+	if (l - 4) >= 0 {
+		i := 0
+		v += (uint32(buf[i]) << 8) + uint32(buf[i+1])
+		v += (uint32(buf[i+2]) << 8) + uint32(buf[i+3])
+		buf = buf[4:]
+		l = l - 4
+	}
+
+	// At this point since l was even before we started unrolling
+	// there can be only two bytes left to add.
+	if l != 0 {
+		v += (uint32(buf[0]) << 8) + uint32(buf[1])
+	}
+
+	return ChecksumCombine(uint16(v), uint16(v>>16)), odd
+}
+
+// ChecksumOld calculates the checksum (as defined in RFC 1071) of the bytes in
+// the given byte array. This function uses a non-optimized implementation. It
+// is only retained for reference and to use as a benchmark/test. Most code
+// should use the header.Checksum function.
+//
+// The initial checksum must have been computed on an even number of bytes.
+func ChecksumOld(buf []byte, initial uint16) uint16 {
+	s, _ := calculateChecksum(buf, false, uint32(initial))
+	return s
+}
+
+// Checksum calculates the checksum (as defined in RFC 1071) of the bytes in the
+// given byte array. This function uses an optimized unrolled version of the
+// checksum algorithm.
+//
+// The initial checksum must have been computed on an even number of bytes.
+func Checksum(buf []byte, initial uint16) uint16 {
+	s, _ := unrolledCalculateChecksum(buf, false, uint32(initial))
+	return s
+}
+
+// ChecksumVV calculates the checksum (as defined in RFC 1071) of the bytes in
+// the given VectorisedView.
+//
+// The initial checksum must have been computed on an even number of bytes.
+func ChecksumVV(vv buffer.VectorisedView, initial uint16) uint16 {
+	return ChecksumVVWithOffset(vv, initial, 0, vv.Size())
+}
+
+// ChecksumVVWithOffset calculates the checksum (as defined in RFC 1071) of up
+// to size bytes of the given VectorisedView, starting at byte offset off.
+//
+// The initial checksum must have been computed on an even number of bytes.
+func ChecksumVVWithOffset(vv buffer.VectorisedView, initial uint16, off int, size int) uint16 {
+	odd := false
+	sum := initial
+	for _, v := range vv.Views() {
+		if len(v) == 0 {
+			continue
+		}
+
+		if off >= len(v) {
+			off -= len(v)
+			continue
+		}
+		v = v[off:]
+
+		l := len(v)
+		if l > size {
+			l = size
+		}
+		v = v[:l]
+
+		sum, odd = unrolledCalculateChecksum(v, odd, uint32(sum))
+
+		size -= len(v)
+		if size == 0 {
+			break
+		}
+		off = 0
+	}
+	return sum
+}
+
+// ChecksumCombine folds two 16-bit partial checksums into one: it adds them
+// and then adds the carry out of that 16-bit sum back in.
+//
+// Checksum a must have been computed over an even number of bytes.
+func ChecksumCombine(a, b uint16) uint16 {
+	total := uint32(a) + uint32(b)
+	return uint16(total + total>>16)
+}
+
+// PseudoHeaderChecksum calculates the pseudo-header checksum for the given
+// destination protocol and network address. Pseudo-headers are needed by
+// transport layers when calculating their own checksum.
+func PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, srcAddr tcpip.Address, dstAddr tcpip.Address, totalLen uint16) uint16 {
+	xsum := Checksum([]byte(srcAddr), 0)
+	xsum = Checksum([]byte(dstAddr), xsum)
+
+	// Add the length portion of the checksum to the pseudo-checksum.
+	tmp := make([]byte, 2)
+	binary.BigEndian.PutUint16(tmp, totalLen)
+	xsum = Checksum(tmp, xsum)
+
+	return Checksum([]byte{0, uint8(protocol)}, xsum)
+}
diff --git a/pkg/tcpip/header/checksum_test.go b/pkg/tcpip/header/checksum_test.go
new file mode 100644
index 000000000..309403482
--- /dev/null
+++ b/pkg/tcpip/header/checksum_test.go
@@ -0,0 +1,171 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package header_test contains external tests for the encoding and decoding
+// of network protocol headers implemented by package header.
+package header_test
+
+import (
+	"fmt"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+func TestChecksumVVWithOffset(t *testing.T) {
+	testCases := []struct {
+		name      string
+		vv        buffer.VectorisedView
+		off, size int
+		initial   uint16
+		want      uint16
+	}{
+		{
+			name: "empty",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+			}),
+			off:  0,
+			size: 0,
+			want: 0,
+		},
+		{
+			name: "OneView",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+			}),
+			off:  0,
+			size: 5,
+			want: 1294,
+		},
+		{
+			name: "TwoViews",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+			}),
+			off:  0,
+			size: 11,
+			want: 33819,
+		},
+		{
+			name: "TwoViewsWithOffset",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+			}),
+			off:  1,
+			size: 11,
+			want: 33819,
+		},
+		{
+			name: "ThreeViewsWithOffset",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123}),
+			}),
+			off:  7,
+			size: 11,
+			want: 33819,
+		},
+		{
+			name: "ThreeViewsWithInitial",
+			vv: buffer.NewVectorisedView(0, []buffer.View{
+				buffer.NewViewFromBytes([]byte{77, 11, 33, 0, 55, 44}),
+				buffer.NewViewFromBytes([]byte{98, 1, 9, 0, 5, 4}),
+				buffer.NewViewFromBytes([]byte{4, 3, 7, 1, 2, 123, 99}),
+			}),
+			initial: 77,
+			off:     7,
+			size:    11,
+			want:    33896,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got, want := header.ChecksumVVWithOffset(tc.vv, tc.initial, tc.off, tc.size), tc.want; got != want {
+				t.Errorf("header.ChecksumVVWithOffset(%v) = %v, want: %v", tc, got, tc.want)
+			}
+			v := tc.vv.ToView()
+			v.TrimFront(tc.off)
+			v.CapLength(tc.size)
+			if got, want := header.Checksum(v, tc.initial), tc.want; got != want {
+				t.Errorf("header.Checksum(%v) = %v, want: %v", tc, got, tc.want)
+			}
+		})
+	}
+}
+
+func TestChecksum(t *testing.T) {
+	var bufSizes = []int{0, 1, 2, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 255, 256, 257, 1023, 1024}
+	type testCase struct {
+		buf      []byte
+		initial  uint16
+		csumOrig uint16
+		csumNew  uint16
+	}
+	testCases := make([]testCase, 100000)
+	// Ensure same buffer generation for test consistency.
+	rnd := rand.New(rand.NewSource(42))
+	for i := range testCases {
+		testCases[i].buf = make([]byte, bufSizes[i%len(bufSizes)])
+		testCases[i].initial = uint16(rnd.Intn(65536))
+		rnd.Read(testCases[i].buf)
+	}
+
+	for i := range testCases {
+		testCases[i].csumOrig = header.ChecksumOld(testCases[i].buf, testCases[i].initial)
+		testCases[i].csumNew = header.Checksum(testCases[i].buf, testCases[i].initial)
+		if got, want := testCases[i].csumNew, testCases[i].csumOrig; got != want {
+			t.Fatalf("new checksum for (buf = %x, initial = %d) does not match old got: %d, want: %d", testCases[i].buf, testCases[i].initial, got, want)
+		}
+	}
+}
+
+// BenchmarkChecksum times ChecksumOld and Checksum across buffer sizes.
+func BenchmarkChecksum(b *testing.B) {
+	var bufSizes = []int{64, 128, 256, 512, 1024, 1500, 2048, 4096, 8192, 16384, 32767, 32768, 65535, 65536}
+	checkSumImpls := []struct {
+		fn   func([]byte, uint16) uint16
+		name string
+	}{
+		{header.ChecksumOld, "checksum_old"},
+		{header.Checksum, "checksum"},
+	}
+
+	for _, csumImpl := range checkSumImpls {
+		for _, bufSz := range bufSizes {
+			b.Run(fmt.Sprintf("%s_%d", csumImpl.name, bufSz), func(b *testing.B) {
+				// Reseed on every call so each (re)run hashes identical input.
+				rnd := rand.New(rand.NewSource(42))
+				tc := struct {
+					buf     []byte
+					initial uint16
+					csum    uint16
+				}{
+					buf:     make([]byte, bufSz),
+					initial: uint16(rnd.Intn(65536)),
+				}
+				rnd.Read(tc.buf)
+				b.ResetTimer()
+				for i := 0; i < b.N; i++ {
+					tc.csum = csumImpl.fn(tc.buf, tc.initial)
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/tcpip/header/eth.go b/pkg/tcpip/header/eth.go
new file mode 100644
index 000000000..b1e92d2d7
--- /dev/null
+++ b/pkg/tcpip/header/eth.go
@@ -0,0 +1,177 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	dstMAC  = 0
+	srcMAC  = 6
+	ethType = 12
+)
+
+// EthernetFields contains the fields of an ethernet frame header. It is used to
+// describe the fields of a frame that needs to be encoded.
+type EthernetFields struct {
+	// SrcAddr is the "MAC source" field of an ethernet frame header.
+	SrcAddr tcpip.LinkAddress
+
+	// DstAddr is the "MAC destination" field of an ethernet frame header.
+	DstAddr tcpip.LinkAddress
+
+	// Type is the "ethertype" field of an ethernet frame header.
+	Type tcpip.NetworkProtocolNumber
+}
+
+// Ethernet represents an ethernet frame header stored in a byte array.
+type Ethernet []byte
+
+const (
+	// EthernetMinimumSize is the minimum size of a valid ethernet frame.
+	EthernetMinimumSize = 14
+
+	// EthernetAddressSize is the size, in bytes, of an ethernet address.
+	EthernetAddressSize = 6
+
+	// unspecifiedEthernetAddress is the unspecified ethernet address
+	// (all bits set to 0).
+	unspecifiedEthernetAddress = tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00")
+
+	// unicastMulticastFlagMask is the mask of the least significant bit in
+	// the first octet (in network byte order) of an ethernet address that
+	// determines whether the ethernet address is a unicast or multicast. If
+	// the masked bit is a 1, then the address is a multicast, unicast
+	// otherwise.
+	//
+	// See the IEEE Std 802-2001 document for more details. Specifically,
+	// section 9.2.1 of http://ieee802.org/secmail/pdfocSP2xXA6d.pdf:
+	// "A 48-bit universal address consists of two parts. The first 24 bits
+	// correspond to the OUI as assigned by the IEEE, except that the
+	// assignee may set the LSB of the first octet to 1 for group addresses
+	// or set it to 0 for individual addresses."
+	unicastMulticastFlagMask = 1
+
+	// unicastMulticastFlagByteIdx is the byte that holds the
+	// unicast/multicast flag. See unicastMulticastFlagMask.
+	unicastMulticastFlagByteIdx = 0
+)
+
+const (
+	// EthernetProtocolAll is a catch-all for all protocols carried inside
+	// an ethernet frame. It is mainly used to create packet sockets that
+	// capture all traffic.
+	EthernetProtocolAll tcpip.NetworkProtocolNumber = 0x0003
+
+	// EthernetProtocolPUP is the PARC Universal Packet protocol ethertype.
+	EthernetProtocolPUP tcpip.NetworkProtocolNumber = 0x0200
+)
+
+// Ethertypes holds the protocol numbers describing the payload of an ethernet
+// frame. These types aren't necessarily supported by netstack, but can be used
+// to catch all traffic of a type via packet endpoints.
+var Ethertypes = []tcpip.NetworkProtocolNumber{
+	EthernetProtocolAll,
+	EthernetProtocolPUP,
+}
+
+// SourceAddress returns the "MAC source" field of the ethernet frame header.
+func (b Ethernet) SourceAddress() tcpip.LinkAddress {
+	return tcpip.LinkAddress(b[srcMAC:][:EthernetAddressSize])
+}
+
+// DestinationAddress returns the "MAC destination" field of the ethernet frame
+// header.
+func (b Ethernet) DestinationAddress() tcpip.LinkAddress {
+	return tcpip.LinkAddress(b[dstMAC:][:EthernetAddressSize])
+}
+
+// Type returns the "ethertype" field of the ethernet frame header.
+func (b Ethernet) Type() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(b[ethType:]))
+}
+
+// Encode encodes all the fields of the ethernet frame header.
+func (b Ethernet) Encode(e *EthernetFields) {
+	binary.BigEndian.PutUint16(b[ethType:], uint16(e.Type))
+	copy(b[srcMAC:][:EthernetAddressSize], e.SrcAddr)
+	copy(b[dstMAC:][:EthernetAddressSize], e.DstAddr)
+}
+
+// IsValidUnicastEthernetAddress returns true if addr is a valid unicast
+// ethernet address.
+func IsValidUnicastEthernetAddress(addr tcpip.LinkAddress) bool {
+	// Must be of the right length.
+	if len(addr) != EthernetAddressSize {
+		return false
+	}
+
+	// Must not be unspecified.
+	if addr == unspecifiedEthernetAddress {
+		return false
+	}
+
+	// Must not be a multicast.
+	if addr[unicastMulticastFlagByteIdx]&unicastMulticastFlagMask != 0 {
+		return false
+	}
+
+	// addr is a valid unicast ethernet address.
+	return true
+}
+
+// EthernetAddressFromMulticastIPv4Address returns a multicast Ethernet address
+// for a multicast IPv4 address.
+//
+// addr MUST be a multicast IPv4 address.
+func EthernetAddressFromMulticastIPv4Address(addr tcpip.Address) tcpip.LinkAddress {
+	var linkAddrBytes [EthernetAddressSize]byte
+	// RFC 1112 Host Extensions for IP Multicasting
+	//
+	// 6.4. Extensions to an Ethernet Local Network Module:
+	//
+	// An IP host group address is mapped to an Ethernet multicast
+	// address by placing the low-order 23-bits of the IP address
+	// into the low-order 23 bits of the Ethernet multicast address
+	// 01-00-5E-00-00-00 (hex).
+	linkAddrBytes[0] = 0x1
+	linkAddrBytes[2] = 0x5e
+	linkAddrBytes[3] = addr[1] & 0x7F
+	copy(linkAddrBytes[4:], addr[IPv4AddressSize-2:])
+	return tcpip.LinkAddress(linkAddrBytes[:])
+}
+
+// EthernetAddressFromMulticastIPv6Address returns a multicast Ethernet address
+// for a multicast IPv6 address.
+//
+// addr MUST be a multicast IPv6 address.
+func EthernetAddressFromMulticastIPv6Address(addr tcpip.Address) tcpip.LinkAddress {
+	// RFC 2464 Transmission of IPv6 Packets over Ethernet Networks
+	//
+	// 7. Address Mapping -- Multicast
+	//
+	// An IPv6 packet with a multicast destination address DST,
+	// consisting of the sixteen octets DST[1] through DST[16], is
+	// transmitted to the Ethernet multicast address whose first
+	// two octets are the value 3333 hexadecimal and whose last
+	// four octets are the last four octets of DST.
+	linkAddrBytes := []byte(addr[IPv6AddressSize-EthernetAddressSize:])
+	linkAddrBytes[0] = 0x33
+	linkAddrBytes[1] = 0x33
+	return tcpip.LinkAddress(linkAddrBytes[:])
+}
diff --git a/pkg/tcpip/header/eth_test.go b/pkg/tcpip/header/eth_test.go
new file mode 100644
index 000000000..14413f2ce
--- /dev/null
+++ b/pkg/tcpip/header/eth_test.go
@@ -0,0 +1,102 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+func TestIsValidUnicastEthernetAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.LinkAddress
+		expected bool
+	}{
+		{
+			"Nil",
+			tcpip.LinkAddress([]byte(nil)),
+			false,
+		},
+		{
+			"Empty",
+			tcpip.LinkAddress(""),
+			false,
+		},
+		{
+			"InvalidLength",
+			tcpip.LinkAddress("\x01\x02\x03"),
+			false,
+		},
+		{
+			"Unspecified",
+			unspecifiedEthernetAddress,
+			false,
+		},
+		{
+			"Multicast",
+			tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+			false,
+		},
+		{
+			"Valid",
+			tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06"),
+			true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := IsValidUnicastEthernetAddress(test.addr); got != test.expected {
+				t.Fatalf("got IsValidUnicastEthernetAddress = %t, want = %t", got, test.expected)
+			}
+		})
+	}
+}
+
+func TestEthernetAddressFromMulticastIPv4Address(t *testing.T) {
+	tests := []struct {
+		name             string
+		addr             tcpip.Address
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "IPv4 Multicast without 24th bit set",
+			addr:             "\xe0\x7e\xdc\xba",
+			expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba",
+		},
+		{
+			name:             "IPv4 Multicast with 24th bit set",
+			addr:             "\xe0\xfe\xdc\xba",
+			expectedLinkAddr: "\x01\x00\x5e\x7e\xdc\xba",
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := EthernetAddressFromMulticastIPv4Address(test.addr); got != test.expectedLinkAddr {
+				t.Fatalf("got EthernetAddressFromMulticastIPv4Address(%s) = %s, want = %s", test.addr, got, test.expectedLinkAddr)
+			}
+		})
+	}
+}
+
+func TestEthernetAddressFromMulticastIPv6Address(t *testing.T) {
+	addr := tcpip.Address("\xff\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x1a")
+	if got, want := EthernetAddressFromMulticastIPv6Address(addr), tcpip.LinkAddress("\x33\x33\x0d\x0e\x0f\x1a"); got != want {
+		t.Fatalf("got EthernetAddressFromMulticastIPv6Address(%s) = %s, want = %s", addr, got, want)
+	}
+}
diff --git a/pkg/tcpip/header/gue.go b/pkg/tcpip/header/gue.go
new file mode 100644
index 000000000..10d358c0e
--- /dev/null
+++ b/pkg/tcpip/header/gue.go
@@ -0,0 +1,73 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+const (
+	typeHLen   = 0 // Offset of the byte holding the type, control bit and header length.
+	encapProto = 1 // Offset of the encapsulated protocol byte.
+)
+
+// GUEFields contains the fields of a GUE packet. It is used to describe the
+// fields of a packet that needs to be encoded.
+type GUEFields struct {
+	// Type is the "type" field of the GUE header; Encode keeps its low 2 bits.
+	Type uint8
+
+	// Control is the "control" field of the GUE header.
+	Control bool
+
+	// HeaderLength is the total length of the GUE header, in octets. It must
+	// be at least 4 octets, and a multiple of 4 as well.
+	HeaderLength uint8
+
+	// Protocol is the "protocol" field of the GUE header. This is one of
+	// the IPPROTO_* values.
+	Protocol uint8
+}
+
+// GUE represents a Generic UDP Encapsulation header stored in a byte array. The
+// fields are described in https://tools.ietf.org/html/draft-ietf-nvo3-gue-01.
+type GUE []byte
+
+const (
+	// GUEMinimumSize is the minimum size, in bytes, of a valid GUE packet.
+	GUEMinimumSize = 4
+)
+
+// TypeAndControl returns the top 3 bits of the first header byte, which hold
+// the GUE packet type together with the control bit.
+func (b GUE) TypeAndControl() uint8 {
+	return b[typeHLen] >> 5
+}
+
+// HeaderLength returns the total GUE header length: 4 + 4*(low 5 bits of byte 0).
+func (b GUE) HeaderLength() uint8 {
+	return 4 + 4*(b[typeHLen]&0x1f)
+}
+
+// Protocol returns the protocol field (the byte at encapProto) of the GUE header.
+func (b GUE) Protocol() uint8 {
+	return b[encapProto]
+}
+
+// Encode writes Type (top 2 bits), Control (bit 5) and HeaderLength into byte 0, and Protocol into byte 1.
+func (b GUE) Encode(i *GUEFields) {
+	ctl := uint8(0)
+	if i.Control {
+		ctl = 1 << 5
+	}
+	b[typeHLen] = ctl | i.Type<<6 | ((i.HeaderLength-4)/4)&0x1f // Mask keeps hlen inside its 5 bits.
+	b[encapProto] = i.Protocol
+}
diff --git a/pkg/tcpip/header/icmpv4.go b/pkg/tcpip/header/icmpv4.go
new file mode 100644
index 000000000..7908c5744
--- /dev/null
+++ b/pkg/tcpip/header/icmpv4.go
@@ -0,0 +1,170 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// ICMPv4 represents an ICMPv4 header stored in a byte array.
+type ICMPv4 []byte
+
+const (
+	// ICMPv4PayloadOffset defines the start of ICMP payload.
+	ICMPv4PayloadOffset = 8
+
+	// ICMPv4MinimumSize is the minimum size of a valid ICMP packet.
+	ICMPv4MinimumSize = 8
+
+	// ICMPv4ProtocolNumber is the ICMP transport protocol number.
+	ICMPv4ProtocolNumber tcpip.TransportProtocolNumber = 1
+
+	// icmpv4ChecksumOffset is the offset of the checksum field
+	// in an ICMPv4 message.
+	icmpv4ChecksumOffset = 2
+
+	// icmpv4MTUOffset is the offset of the MTU field
+	// in an ICMPv4 fragmentation-needed message.
+	icmpv4MTUOffset = 6
+
+	// icmpv4IdentOffset is the offset of the ident field
+	// in an ICMPv4 echo request/reply message.
+	icmpv4IdentOffset = 4
+
+	// icmpv4SequenceOffset is the offset of the sequence field
+	// in an ICMPv4 echo request/reply message.
+	icmpv4SequenceOffset = 6
+)
+
+// ICMPv4Type is the ICMP type field described in RFC 792.
+type ICMPv4Type byte
+
+// Typical values of ICMPv4Type defined in RFC 792.
+const (
+	ICMPv4EchoReply      ICMPv4Type = 0
+	ICMPv4DstUnreachable ICMPv4Type = 3
+	ICMPv4SrcQuench      ICMPv4Type = 4
+	ICMPv4Redirect       ICMPv4Type = 5
+	ICMPv4Echo           ICMPv4Type = 8
+	ICMPv4TimeExceeded   ICMPv4Type = 11
+	ICMPv4ParamProblem   ICMPv4Type = 12
+	ICMPv4Timestamp      ICMPv4Type = 13
+	ICMPv4TimestampReply ICMPv4Type = 14
+	ICMPv4InfoRequest    ICMPv4Type = 15
+	ICMPv4InfoReply      ICMPv4Type = 16
+)
+
+// Values for ICMP code as defined in RFC 792.
+const (
+	ICMPv4TTLExceeded         = 0
+	ICMPv4PortUnreachable     = 3
+	ICMPv4FragmentationNeeded = 4
+)
+
+// Type is the ICMP type field.
+func (b ICMPv4) Type() ICMPv4Type { return ICMPv4Type(b[0]) }
+
+// SetType sets the ICMP type field.
+func (b ICMPv4) SetType(t ICMPv4Type) { b[0] = byte(t) }
+
+// Code is the ICMP code field. Its meaning depends on the value of Type.
+func (b ICMPv4) Code() byte { return b[1] }
+
+// SetCode sets the ICMP code field.
+func (b ICMPv4) SetCode(c byte) { b[1] = c }
+
+// Checksum is the ICMP checksum field.
+func (b ICMPv4) Checksum() uint16 {
+	return binary.BigEndian.Uint16(b[icmpv4ChecksumOffset:])
+}
+
+// SetChecksum sets the ICMP checksum field.
+func (b ICMPv4) SetChecksum(checksum uint16) {
+	binary.BigEndian.PutUint16(b[icmpv4ChecksumOffset:], checksum)
+}
+
+// SourcePort implements Transport.SourcePort.
+func (ICMPv4) SourcePort() uint16 {
+	return 0
+}
+
+// DestinationPort implements Transport.DestinationPort.
+func (ICMPv4) DestinationPort() uint16 {
+	return 0
+}
+
+// SetSourcePort implements Transport.SetSourcePort.
+func (ICMPv4) SetSourcePort(uint16) {
+}
+
+// SetDestinationPort implements Transport.SetDestinationPort.
+func (ICMPv4) SetDestinationPort(uint16) {
+}
+
+// Payload implements Transport.Payload.
+func (b ICMPv4) Payload() []byte {
+	return b[ICMPv4PayloadOffset:]
+}
+
+// MTU retrieves the MTU field from an ICMPv4 message.
+func (b ICMPv4) MTU() uint16 {
+	return binary.BigEndian.Uint16(b[icmpv4MTUOffset:])
+}
+
+// SetMTU sets the MTU field of an ICMPv4 message.
+func (b ICMPv4) SetMTU(mtu uint16) {
+	binary.BigEndian.PutUint16(b[icmpv4MTUOffset:], mtu)
+}
+
+// Ident retrieves the Ident field from an ICMPv4 message.
+func (b ICMPv4) Ident() uint16 {
+	return binary.BigEndian.Uint16(b[icmpv4IdentOffset:])
+}
+
+// SetIdent sets the Ident field of an ICMPv4 message.
+func (b ICMPv4) SetIdent(ident uint16) {
+	binary.BigEndian.PutUint16(b[icmpv4IdentOffset:], ident)
+}
+
+// Sequence retrieves the Sequence field from an ICMPv4 message.
+func (b ICMPv4) Sequence() uint16 {
+	return binary.BigEndian.Uint16(b[icmpv4SequenceOffset:])
+}
+
+// SetSequence sets the Sequence field of an ICMPv4 message.
+func (b ICMPv4) SetSequence(sequence uint16) {
+	binary.BigEndian.PutUint16(b[icmpv4SequenceOffset:], sequence)
+}
+
+// ICMPv4Checksum calculates the ICMP checksum over the provided ICMP header
+// and payload; h[2:4] is zeroed while summing and restored before returning.
+func ICMPv4Checksum(h ICMPv4, vv buffer.VectorisedView) uint16 {
+	// Sum the payload views first; no pseudo-header is included here.
+	xsum := uint16(0)
+	for _, v := range vv.Views() {
+		xsum = Checksum(v, xsum)
+	}
+
+	// h[2:4] is the checksum itself, set it aside to avoid checksumming the checksum.
+	h2, h3 := h[2], h[3]
+	h[2], h[3] = 0, 0
+	xsum = ^Checksum(h, xsum)
+	h[2], h[3] = h2, h3
+
+	return xsum
+}
diff --git a/pkg/tcpip/header/icmpv6.go b/pkg/tcpip/header/icmpv6.go
new file mode 100644
index 000000000..c7ee2de57
--- /dev/null
+++ b/pkg/tcpip/header/icmpv6.go
@@ -0,0 +1,221 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// ICMPv6 represents an ICMPv6 header stored in a byte array.
+type ICMPv6 []byte
+
+const (
+	// ICMPv6HeaderSize is the size of the ICMPv6 header. That is, the
+	// sum of the size of the ICMPv6 Type, Code and Checksum fields, as
+	// per RFC 4443 section 2.1. After the ICMPv6 header, the ICMPv6
+	// message body begins.
+	ICMPv6HeaderSize = 4
+
+	// ICMPv6MinimumSize is the minimum size of a valid ICMP packet.
+	ICMPv6MinimumSize = 8
+
+	// ICMPv6PayloadOffset is the offset of the payload in an
+	// ICMP packet.
+	ICMPv6PayloadOffset = 8
+
+	// ICMPv6ProtocolNumber is the ICMP transport protocol number.
+	ICMPv6ProtocolNumber tcpip.TransportProtocolNumber = 58
+
+	// ICMPv6NeighborSolicitMinimumSize is the minimum size of a
+	// neighbor solicitation packet.
+	ICMPv6NeighborSolicitMinimumSize = ICMPv6HeaderSize + NDPNSMinimumSize
+
+	// ICMPv6NeighborAdvertMinimumSize is the minimum size of a
+	// neighbor advertisement packet.
+	ICMPv6NeighborAdvertMinimumSize = ICMPv6HeaderSize + NDPNAMinimumSize
+
+	// ICMPv6NeighborAdvertSize is the size of a neighbor advertisement
+	// including the NDP Target Link Layer option for an Ethernet
+	// address.
+	ICMPv6NeighborAdvertSize = ICMPv6HeaderSize + NDPNAMinimumSize + NDPLinkLayerAddressSize
+
+	// ICMPv6EchoMinimumSize is the minimum size of a valid ICMP echo packet.
+	ICMPv6EchoMinimumSize = 8
+
+	// ICMPv6DstUnreachableMinimumSize is the minimum size of a valid ICMP
+	// destination unreachable packet.
+	ICMPv6DstUnreachableMinimumSize = ICMPv6MinimumSize
+
+	// ICMPv6PacketTooBigMinimumSize is the minimum size of a valid ICMP
+	// packet-too-big packet.
+	ICMPv6PacketTooBigMinimumSize = ICMPv6MinimumSize
+
+	// icmpv6ChecksumOffset is the offset of the checksum field
+	// in an ICMPv6 message.
+	icmpv6ChecksumOffset = 2
+
+	// icmpv6MTUOffset is the offset of the MTU field in an ICMPv6
+	// PacketTooBig message.
+	icmpv6MTUOffset = 4
+
+	// icmpv6IdentOffset is the offset of the ident field
+	// in an ICMPv6 Echo Request/Reply message.
+	icmpv6IdentOffset = 4
+
+	// icmpv6SequenceOffset is the offset of the sequence field
+	// in an ICMPv6 Echo Request/Reply message.
+	icmpv6SequenceOffset = 6
+
+	// NDPHopLimit is the expected IP hop limit value of 255 for received
+	// NDP packets, as per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1,
+	// 7.1.2 and 8.1. If the hop limit value is not 255, nodes MUST silently
+	// drop the NDP packet. All outgoing NDP packets must use this value for
+	// their IP hop limit field.
+	NDPHopLimit = 255
+)
+
+// ICMPv6Type is the ICMP type field described in RFC 4443 and friends.
+type ICMPv6Type byte
+
+// Typical values of ICMPv6Type defined in RFC 4443.
+const (
+	ICMPv6DstUnreachable ICMPv6Type = 1
+	ICMPv6PacketTooBig   ICMPv6Type = 2
+	ICMPv6TimeExceeded   ICMPv6Type = 3
+	ICMPv6ParamProblem   ICMPv6Type = 4
+	ICMPv6EchoRequest    ICMPv6Type = 128
+	ICMPv6EchoReply      ICMPv6Type = 129
+
+	// Neighbor Discovery Protocol (NDP) messages, see RFC 4861.
+
+	ICMPv6RouterSolicit   ICMPv6Type = 133
+	ICMPv6RouterAdvert    ICMPv6Type = 134
+	ICMPv6NeighborSolicit ICMPv6Type = 135
+	ICMPv6NeighborAdvert  ICMPv6Type = 136
+	ICMPv6RedirectMsg     ICMPv6Type = 137
+)
+
+// Values for ICMP code as defined in RFC 4443.
+const (
+	ICMPv6PortUnreachable = 4
+)
+
+// Type is the ICMP type field.
+func (b ICMPv6) Type() ICMPv6Type { return ICMPv6Type(b[0]) }
+
+// SetType sets the ICMP type field.
+func (b ICMPv6) SetType(t ICMPv6Type) { b[0] = byte(t) }
+
+// Code is the ICMP code field. Its meaning depends on the value of Type.
+func (b ICMPv6) Code() byte { return b[1] }
+
+// SetCode sets the ICMP code field.
+func (b ICMPv6) SetCode(c byte) { b[1] = c }
+
+// Checksum is the ICMP checksum field.
+func (b ICMPv6) Checksum() uint16 {
+	return binary.BigEndian.Uint16(b[icmpv6ChecksumOffset:])
+}
+
+// SetChecksum sets the ICMP checksum field.
+func (b ICMPv6) SetChecksum(checksum uint16) {
+	binary.BigEndian.PutUint16(b[icmpv6ChecksumOffset:], checksum)
+}
+
+// SourcePort implements Transport.SourcePort.
+func (ICMPv6) SourcePort() uint16 {
+	return 0
+}
+
+// DestinationPort implements Transport.DestinationPort.
+func (ICMPv6) DestinationPort() uint16 {
+	return 0
+}
+
+// SetSourcePort implements Transport.SetSourcePort.
+func (ICMPv6) SetSourcePort(uint16) {
+}
+
+// SetDestinationPort implements Transport.SetDestinationPort.
+func (ICMPv6) SetDestinationPort(uint16) {
+}
+
+// MTU retrieves the MTU field from an ICMPv6 message.
+func (b ICMPv6) MTU() uint32 {
+	return binary.BigEndian.Uint32(b[icmpv6MTUOffset:])
+}
+
+// SetMTU sets the MTU field of an ICMPv6 message.
+func (b ICMPv6) SetMTU(mtu uint32) {
+	binary.BigEndian.PutUint32(b[icmpv6MTUOffset:], mtu)
+}
+
+// Ident retrieves the Ident field from an ICMPv6 message.
+func (b ICMPv6) Ident() uint16 {
+	return binary.BigEndian.Uint16(b[icmpv6IdentOffset:])
+}
+
+// SetIdent sets the Ident field of an ICMPv6 message.
+func (b ICMPv6) SetIdent(ident uint16) {
+	binary.BigEndian.PutUint16(b[icmpv6IdentOffset:], ident)
+}
+
+// Sequence retrieves the Sequence field from an ICMPv6 message.
+func (b ICMPv6) Sequence() uint16 {
+	return binary.BigEndian.Uint16(b[icmpv6SequenceOffset:])
+}
+
+// SetSequence sets the Sequence field of an ICMPv6 message.
+func (b ICMPv6) SetSequence(sequence uint16) {
+	binary.BigEndian.PutUint16(b[icmpv6SequenceOffset:], sequence)
+}
+
+// NDPPayload returns the NDP payload buffer. That is, it returns the ICMPv6
+// packet's message body as defined by RFC 4443 section 2.1; the portion of the
+// ICMPv6 buffer after the first ICMPv6HeaderSize bytes.
+func (b ICMPv6) NDPPayload() []byte {
+	return b[ICMPv6HeaderSize:]
+}
+
+// Payload implements Transport.Payload.
+func (b ICMPv6) Payload() []byte {
+	return b[ICMPv6PayloadOffset:]
+}
+
+// ICMPv6Checksum calculates the ICMP checksum over the provided ICMPv6 header,
+// IPv6 src/dst addresses and the payload.
+func ICMPv6Checksum(h ICMPv6, src, dst tcpip.Address, vv buffer.VectorisedView) uint16 {
+	// Calculate the IPv6 pseudo-header upper-layer checksum.
+	xsum := Checksum([]byte(src), 0)
+	xsum = Checksum([]byte(dst), xsum)
+	var upperLayerLength [4]byte
+	binary.BigEndian.PutUint32(upperLayerLength[:], uint32(len(h)+vv.Size()))
+	xsum = Checksum(upperLayerLength[:], xsum)
+	xsum = Checksum([]byte{0, 0, 0, uint8(ICMPv6ProtocolNumber)}, xsum)
+	for _, v := range vv.Views() {
+		xsum = Checksum(v, xsum)
+	}
+
+	// h[2:4] is the checksum itself, set it aside to avoid checksumming the checksum.
+	h2, h3 := h[2], h[3]
+	h[2], h[3] = 0, 0
+	xsum = ^Checksum(h, xsum)
+	h[2], h[3] = h2, h3
+
+	return xsum
+}
diff --git a/pkg/tcpip/header/interfaces.go b/pkg/tcpip/header/interfaces.go
new file mode 100644
index 000000000..861cbbb70
--- /dev/null
+++ b/pkg/tcpip/header/interfaces.go
@@ -0,0 +1,92 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// MaxIPPacketSize is the maximum supported IP packet size, excluding
+	// jumbograms. The maximum IPv4 packet size is 64k-1 (total size must fit
+	// in 16 bits). For IPv6, the payload max size (excluding jumbograms) is
+	// 64k-1 (also needs to fit in 16 bits). So we use 64k - 1 + 2 * m, where
+	// m is the minimum IPv6 header size; we leave room for some potential
+	// IP options.
+	MaxIPPacketSize = 0xffff + 2*IPv6MinimumSize
+)
+
+// Transport offers generic methods to query and/or update the fields of the
+// header of a transport protocol buffer.
+type Transport interface {
+	// SourcePort returns the value of the "source port" field.
+	SourcePort() uint16
+
+	// DestinationPort returns the value of the "destination port" field.
+	DestinationPort() uint16
+
+	// Checksum returns the value of the "checksum" field.
+	Checksum() uint16
+
+	// SetSourcePort sets the value of the "source port" field.
+	SetSourcePort(uint16)
+
+	// SetDestinationPort sets the value of the "destination port" field.
+	SetDestinationPort(uint16)
+
+	// SetChecksum sets the value of the "checksum" field.
+	SetChecksum(uint16)
+
+	// Payload returns the data carried in the transport buffer.
+	Payload() []byte
+}
+
+// Network offers generic methods to query and/or update the fields of the
+// header of a network protocol buffer.
+type Network interface {
+	// SourceAddress returns the value of the "source address" field.
+	SourceAddress() tcpip.Address
+
+	// DestinationAddress returns the value of the "destination address"
+	// field.
+	DestinationAddress() tcpip.Address
+
+	// Checksum returns the value of the "checksum" field.
+	Checksum() uint16
+
+	// SetSourceAddress sets the value of the "source address" field.
+	SetSourceAddress(tcpip.Address)
+
+	// SetDestinationAddress sets the value of the "destination address"
+	// field.
+	SetDestinationAddress(tcpip.Address)
+
+	// SetChecksum sets the value of the "checksum" field.
+	SetChecksum(uint16)
+
+	// TransportProtocol returns the number of the transport protocol
+	// stored in the payload.
+	TransportProtocol() tcpip.TransportProtocolNumber
+
+	// Payload returns a byte slice containing the payload of the network
+	// packet.
+	Payload() []byte
+
+	// TOS returns the values of the "type of service" and "flow label" fields.
+	TOS() (uint8, uint32)
+
+	// SetTOS sets the values of the "type of service" and "flow label" fields.
+	SetTOS(t uint8, l uint32)
+}
diff --git a/pkg/tcpip/header/ipv4.go b/pkg/tcpip/header/ipv4.go
new file mode 100644
index 000000000..62ac932bb
--- /dev/null
+++ b/pkg/tcpip/header/ipv4.go
@@ -0,0 +1,312 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	versIHL = 0
+	tos     = 1
+	// IPv4TotalLenOffset is the offset of the total length field in the
+	// IPv4 header.
+	IPv4TotalLenOffset = 2
+	id                 = 4
+	flagsFO            = 6
+	ttl                = 8
+	protocol           = 9
+	checksum           = 10
+	srcAddr            = 12
+	dstAddr            = 16
+)
+
+// IPv4Fields contains the fields of an IPv4 packet. It is used to describe the
+// fields of a packet that needs to be encoded.
+type IPv4Fields struct {
+	// IHL is the "internet header length" field of an IPv4 packet. The value
+	// is in bytes.
+	IHL uint8
+
+	// TOS is the "type of service" field of an IPv4 packet.
+	TOS uint8
+
+	// TotalLength is the "total length" field of an IPv4 packet.
+	TotalLength uint16
+
+	// ID is the "identification" field of an IPv4 packet.
+	ID uint16
+
+	// Flags is the "flags" field of an IPv4 packet.
+	Flags uint8
+
+	// FragmentOffset is the "fragment offset" field of an IPv4 packet.
+	FragmentOffset uint16
+
+	// TTL is the "time to live" field of an IPv4 packet.
+	TTL uint8
+
+	// Protocol is the "protocol" field of an IPv4 packet.
+	Protocol uint8
+
+	// Checksum is the "checksum" field of an IPv4 packet.
+	Checksum uint16
+
+	// SrcAddr is the "source ip address" of an IPv4 packet.
+	SrcAddr tcpip.Address
+
+	// DstAddr is the "destination ip address" of an IPv4 packet.
+	DstAddr tcpip.Address
+}
+
+// IPv4 represents an ipv4 header stored in a byte array.
+// Most of the methods of IPv4 access the underlying slice without
+// checking the boundaries and could panic because of 'index out of range'.
+// Always call IsValid() to validate an instance of IPv4 before using other methods.
+type IPv4 []byte
+
+const (
+	// IPv4MinimumSize is the minimum size of a valid IPv4 packet.
+	IPv4MinimumSize = 20
+
+	// IPv4MaximumHeaderSize is the maximum size of an IPv4 header. Given
+	// that there are only 4 bits to represent the header length in 32-bit
+	// units, the header cannot exceed 15*4 = 60 bytes.
+	IPv4MaximumHeaderSize = 60
+
+	// MinIPFragmentPayloadSize is the minimum number of payload bytes that
+	// the first fragment must carry when an IPv4 packet is fragmented.
+	MinIPFragmentPayloadSize = 8
+
+	// IPv4AddressSize is the size, in bytes, of an IPv4 address.
+	IPv4AddressSize = 4
+
+	// IPv4ProtocolNumber is IPv4's network protocol number.
+	IPv4ProtocolNumber tcpip.NetworkProtocolNumber = 0x0800
+
+	// IPv4Version is the version of the ipv4 protocol.
+	IPv4Version = 4
+
+	// IPv4Broadcast is the broadcast address of the IPv4 protocol.
+	IPv4Broadcast tcpip.Address = "\xff\xff\xff\xff"
+
+	// IPv4Any is the non-routable IPv4 "any" meta address.
+	IPv4Any tcpip.Address = "\x00\x00\x00\x00"
+
+	// IPv4MinimumProcessableDatagramSize is the minimum size of an IP
+	// packet that every IPv4 capable host must be able to
+	// process/reassemble.
+	IPv4MinimumProcessableDatagramSize = 576
+)
+
+// Flags that may be set in an IPv4 packet.
+const (
+	IPv4FlagMoreFragments = 1 << iota
+	IPv4FlagDontFragment
+)
+
+// IPv4EmptySubnet is the empty IPv4 subnet.
+var IPv4EmptySubnet = func() tcpip.Subnet {
+	subnet, err := tcpip.NewSubnet(IPv4Any, tcpip.AddressMask(IPv4Any))
+	if err != nil {
+		panic(err)
+	}
+	return subnet
+}()
+
+// IPVersion reports the IP version stored in the high nibble of the first
+// byte of b, or -1 when b is too short to hold the version field.
+func IPVersion(b []byte) int {
+	if len(b) <= versIHL {
+		return -1
+	}
+	version := b[versIHL] >> 4
+	return int(version)
+}
+
+// HeaderLength returns the value of the "header length" field of the ipv4
+// header. The length returned is in bytes.
+func (b IPv4) HeaderLength() uint8 {
+	return (b[versIHL] & 0xf) * 4
+}
+
+// ID returns the value of the identifier field of the ipv4 header.
+func (b IPv4) ID() uint16 {
+	return binary.BigEndian.Uint16(b[id:])
+}
+
+// Protocol returns the value of the protocol field of the ipv4 header.
+func (b IPv4) Protocol() uint8 {
+	return b[protocol]
+}
+
+// Flags returns the "flags" field of the ipv4 header.
+func (b IPv4) Flags() uint8 {
+	return uint8(binary.BigEndian.Uint16(b[flagsFO:]) >> 13)
+}
+
+// More returns whether the more fragments flag is set.
+func (b IPv4) More() bool {
+	return b.Flags()&IPv4FlagMoreFragments != 0
+}
+
+// TTL returns the "TTL" field of the ipv4 header.
+func (b IPv4) TTL() uint8 {
+	return b[ttl]
+}
+
+// FragmentOffset returns the "fragment offset" field of the ipv4 header, in bytes.
+func (b IPv4) FragmentOffset() uint16 {
+	return binary.BigEndian.Uint16(b[flagsFO:]) << 3
+}
+
+// TotalLength returns the "total length" field of the ipv4 header.
+func (b IPv4) TotalLength() uint16 {
+	return binary.BigEndian.Uint16(b[IPv4TotalLenOffset:])
+}
+
+// Checksum returns the checksum field of the ipv4 header.
+func (b IPv4) Checksum() uint16 {
+	return binary.BigEndian.Uint16(b[checksum:])
+}
+
+// SourceAddress returns the "source address" field of the ipv4 header.
+func (b IPv4) SourceAddress() tcpip.Address {
+	return tcpip.Address(b[srcAddr : srcAddr+IPv4AddressSize])
+}
+
+// DestinationAddress returns the "destination address" field of the ipv4
+// header.
+func (b IPv4) DestinationAddress() tcpip.Address {
+	return tcpip.Address(b[dstAddr : dstAddr+IPv4AddressSize])
+}
+
+// TransportProtocol implements Network.TransportProtocol.
+func (b IPv4) TransportProtocol() tcpip.TransportProtocolNumber {
+	return tcpip.TransportProtocolNumber(b.Protocol())
+}
+
+// Payload implements Network.Payload; it returns PayloadLength() bytes after the header.
+func (b IPv4) Payload() []byte {
+	return b[b.HeaderLength():][:b.PayloadLength()]
+}
+
+// PayloadLength returns the ipv4 payload length: TotalLength() minus HeaderLength().
+func (b IPv4) PayloadLength() uint16 {
+	return b.TotalLength() - uint16(b.HeaderLength())
+}
+
+// TOS returns the "type of service" field of the ipv4 header; the second value is always 0.
+func (b IPv4) TOS() (uint8, uint32) {
+	return b[tos], 0
+}
+
+// SetTOS sets the "type of service" field of the ipv4 header.
+func (b IPv4) SetTOS(v uint8, _ uint32) {
+	b[tos] = v
+}
+
+// SetTotalLength sets the "total length" field of the ipv4 header.
+func (b IPv4) SetTotalLength(totalLength uint16) {
+	binary.BigEndian.PutUint16(b[IPv4TotalLenOffset:], totalLength)
+}
+
+// SetChecksum sets the checksum field of the ipv4 header.
+func (b IPv4) SetChecksum(v uint16) {
+	binary.BigEndian.PutUint16(b[checksum:], v)
+}
+
+// SetFlagsFragmentOffset sets the "flags" and "fragment offset" fields of the
+// ipv4 header; offset is in bytes and is stored in units of 8 bytes.
+func (b IPv4) SetFlagsFragmentOffset(flags uint8, offset uint16) {
+	v := (uint16(flags) << 13) | (offset >> 3)
+	binary.BigEndian.PutUint16(b[flagsFO:], v)
+}
+
+// SetID sets the identification field.
+func (b IPv4) SetID(v uint16) {
+	binary.BigEndian.PutUint16(b[id:], v)
+}
+
+// SetSourceAddress sets the "source address" field of the ipv4 header.
+func (b IPv4) SetSourceAddress(addr tcpip.Address) {
+	copy(b[srcAddr:srcAddr+IPv4AddressSize], addr)
+}
+
+// SetDestinationAddress sets the "destination address" field of the ipv4
+// header.
+func (b IPv4) SetDestinationAddress(addr tcpip.Address) {
+	copy(b[dstAddr:dstAddr+IPv4AddressSize], addr)
+}
+
+// CalculateChecksum calculates the checksum of the ipv4 header.
+func (b IPv4) CalculateChecksum() uint16 {
+	return Checksum(b[:b.HeaderLength()], 0)
+}
+
+// Encode writes every field of i into the ipv4 header; i.IHL is in bytes.
+func (b IPv4) Encode(i *IPv4Fields) {
+	b[versIHL] = (IPv4Version << 4) | ((i.IHL / 4) & 0xf)
+	b.SetTOS(i.TOS, 0)
+	b.SetTotalLength(i.TotalLength)
+	b.SetID(i.ID)
+	b.SetFlagsFragmentOffset(i.Flags, i.FragmentOffset)
+	b[ttl] = i.TTL
+	b[protocol] = i.Protocol
+	b.SetChecksum(i.Checksum)
+	b.SetSourceAddress(i.SrcAddr)
+	b.SetDestinationAddress(i.DstAddr)
+}
+
+// EncodePartial sets the total length field and then the checksum field of
+// the ipv4 header. partialChecksum must be the checksum of the header with
+// the total length and checksum fields excluded; this is useful when many
+// similar packets are produced.
+func (b IPv4) EncodePartial(partialChecksum, totalLength uint16) {
+	b.SetTotalLength(totalLength)
+	lenField := b[IPv4TotalLenOffset : IPv4TotalLenOffset+2]
+	b.SetChecksum(^Checksum(lenField, partialChecksum))
+}
+
+// IsValid reports whether b passes basic sanity checks: it is long enough to
+// hold a minimal header, its header and total lengths are mutually consistent
+// and fit within pktSize, and its version field is IPv4Version.
+func (b IPv4) IsValid(pktSize int) bool {
+	if len(b) < IPv4MinimumSize {
+		return false
+	}
+
+	headerLen, totalLen := int(b.HeaderLength()), int(b.TotalLength())
+	switch {
+	case headerLen < IPv4MinimumSize, headerLen > totalLen, totalLen > pktSize:
+		return false
+	case IPVersion(b) != IPv4Version:
+		return false
+	}
+
+	return true
+}
+
+// IsV4MulticastAddress reports whether addr is an IPv4 multicast address
+// (224.0.0.0 through 239.255.255.255), i.e. its top four bits are 1110.
+func IsV4MulticastAddress(addr tcpip.Address) bool {
+	const multicastPrefix, prefixMask = 0xe0, 0xf0
+	if len(addr) == IPv4AddressSize {
+		return addr[0]&prefixMask == multicastPrefix
+	}
+	return false
+}
diff --git a/pkg/tcpip/header/ipv6.go b/pkg/tcpip/header/ipv6.go
new file mode 100644
index 000000000..4f367fe4c
--- /dev/null
+++ b/pkg/tcpip/header/ipv6.go
@@ -0,0 +1,499 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"crypto/sha256"
+	"encoding/binary"
+	"fmt"
+	"strings"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	versTCFL = 0
+	// IPv6PayloadLenOffset is the byte offset of the PayloadLength field in
+	// the IPv6 header.
+	IPv6PayloadLenOffset = 4
+	// IPv6NextHeaderOffset is the byte offset of the NextHeader field in
+	// the IPv6 header.
+	IPv6NextHeaderOffset = 6
+	hopLimit             = 7
+	v6SrcAddr            = 8
+	v6DstAddr            = v6SrcAddr + IPv6AddressSize
+)
+
+// IPv6Fields contains the fields of an IPv6 packet. It is used to describe the
+// fields of a packet that needs to be encoded.
+type IPv6Fields struct {
+	// TrafficClass is the "traffic class" field of an IPv6 packet.
+	TrafficClass uint8
+
+	// FlowLabel is the "flow label" field; only its low 20 bits are encoded.
+	FlowLabel uint32
+
+	// PayloadLength is the "payload length" field of an IPv6 packet.
+	PayloadLength uint16
+
+	// NextHeader is the "next header" field of an IPv6 packet.
+	NextHeader uint8
+
+	// HopLimit is the "hop limit" field of an IPv6 packet.
+	HopLimit uint8
+
+	// SrcAddr is the "source ip address" of an IPv6 packet.
+	SrcAddr tcpip.Address
+
+	// DstAddr is the "destination ip address" of an IPv6 packet.
+	DstAddr tcpip.Address
+}
+
+// IPv6 represents an ipv6 header stored in a byte array.
+// Most of the methods of IPv6 access the underlying slice without
+// checking the boundaries and could panic because of 'index out of range'.
+// Always call IsValid() to validate an instance of IPv6 before using other methods.
+type IPv6 []byte
+
+const (
+	// IPv6MinimumSize is the minimum size, in bytes, of a valid IPv6 packet.
+	IPv6MinimumSize = 40
+
+	// IPv6AddressSize is the size, in bytes, of an IPv6 address.
+	IPv6AddressSize = 16
+
+	// IPv6ProtocolNumber is IPv6's network protocol number.
+	IPv6ProtocolNumber tcpip.NetworkProtocolNumber = 0x86dd
+
+	// IPv6Version is the version of the ipv6 protocol.
+	IPv6Version = 6
+
+	// IPv6AllNodesMulticastAddress is a link-local multicast group that
+	// all IPv6 nodes MUST join, as per RFC 4291, section 2.8. Packets
+	// destined to this address will reach all nodes on a link.
+	//
+	// The address is ff02::1.
+	IPv6AllNodesMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+
+	// IPv6AllRoutersMulticastAddress is a link-local multicast group that
+	// all IPv6 routers MUST join, as per RFC 4291, section 2.8. Packets
+	// destined to this address will reach all routers on a link.
+	//
+	// The address is ff02::2.
+	IPv6AllRoutersMulticastAddress tcpip.Address = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+
+	// IPv6MinimumMTU is the minimum MTU required by IPv6, per RFC 2460,
+	// section 5.
+	IPv6MinimumMTU = 1280
+
+	// IPv6Any is the non-routable IPv6 "any" meta address. It is also
+	// known as the unspecified address (all sixteen bytes are zero).
+	IPv6Any tcpip.Address = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+
+	// IIDSize is the size of an interface identifier (IID), in bytes, as
+	// defined by RFC 4291 section 2.5.1.
+	IIDSize = 8
+
+	// IIDOffsetInIPv6Address is the offset, in bytes, from the start
+	// of an IPv6 address to the beginning of the interface identifier
+	// (IID) for auto-generated addresses. That is, all bytes before
+	// the IIDOffsetInIPv6Address-th byte are the prefix bytes, and all
+	// bytes including and after the IIDOffsetInIPv6Address-th byte are
+	// for the IID.
+	IIDOffsetInIPv6Address = 8
+
+	// OpaqueIIDSecretKeyMinBytes is the recommended minimum number of bytes
+	// for the secret key used to generate an opaque interface identifier as
+	// outlined by RFC 7217.
+	OpaqueIIDSecretKeyMinBytes = 16
+
+	// ipv6MulticastAddressScopeByteIdx is the byte where the scope (scop) field
+	// is located within a multicast IPv6 address, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeByteIdx = 1
+
+	// ipv6MulticastAddressScopeMask is the mask for the scope (scop) field,
+	// within the byte holding the field, as per RFC 4291 section 2.7.
+	ipv6MulticastAddressScopeMask = 0xF
+
+	// ipv6LinkLocalMulticastScope is the value of the scope (scop) field within
+	// a multicast IPv6 address that indicates the address has link-local scope,
+	// as per RFC 4291 section 2.7.
+	ipv6LinkLocalMulticastScope = 2
+)
+
+// IPv6EmptySubnet is the empty IPv6 subnet, also known as the catch-all or
+// wildcard subnet: every IPv6 address is considered to be contained in it.
+// It is built once at package initialization, which panics on failure.
+var IPv6EmptySubnet = func() tcpip.Subnet {
+	emptySubnet, err := tcpip.NewSubnet(IPv6Any, tcpip.AddressMask(IPv6Any))
+	if err == nil {
+		return emptySubnet
+	}
+	panic(err)
+}()
+
+// IPv6LinkLocalPrefix is the prefix for IPv6 link-local addresses, as defined
+// by RFC 4291 section 2.5.6.
+//
+// The prefix is fe80::/64.
+var IPv6LinkLocalPrefix = tcpip.AddressWithPrefix{
+	Address:   "\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+	PrefixLen: 64,
+}
+
+// PayloadLength returns the big-endian 16-bit "payload length" field of the
+// ipv6 header.
+func (b IPv6) PayloadLength() uint16 {
+	return binary.BigEndian.Uint16(b[IPv6PayloadLenOffset:])
+}
+
+// HopLimit returns the single-byte "hop limit" field of the ipv6 header.
+func (b IPv6) HopLimit() uint8 {
+	return b[hopLimit]
+}
+
+// NextHeader returns the single-byte "next header" field of the ipv6 header.
+func (b IPv6) NextHeader() uint8 {
+	return b[IPv6NextHeaderOffset]
+}
+
+// TransportProtocol implements Network.TransportProtocol by converting NextHeader().
+func (b IPv6) TransportProtocol() tcpip.TransportProtocolNumber {
+	return tcpip.TransportProtocolNumber(b.NextHeader())
+}
+
+// Payload implements Network.Payload: the PayloadLength() bytes after the first IPv6MinimumSize bytes.
+func (b IPv6) Payload() []byte {
+	return b[IPv6MinimumSize:][:b.PayloadLength()]
+}
+
+// SourceAddress returns the IPv6AddressSize-byte "source address" field of the ipv6 header.
+func (b IPv6) SourceAddress() tcpip.Address {
+	return tcpip.Address(b[v6SrcAddr:][:IPv6AddressSize])
+}
+
+// DestinationAddress returns the IPv6AddressSize-byte "destination address"
+// field of the ipv6 header.
+func (b IPv6) DestinationAddress() tcpip.Address {
+	return tcpip.Address(b[v6DstAddr:][:IPv6AddressSize])
+}
+
+// Checksum implements Network.Checksum. IPv6 headers carry no checksum, so
+// this always returns 0.
+func (IPv6) Checksum() uint16 {
+	return 0
+}
+
+// TOS returns the 8-bit "traffic class" and 20-bit "flow label" fields of the ipv6 header.
+func (b IPv6) TOS() (uint8, uint32) {
+	v := binary.BigEndian.Uint32(b[versTCFL:])
+	return uint8(v >> 20), v & 0xfffff
+}
+
+// SetTOS sets the "traffic class" and "flow label" fields of the ipv6 header.
+func (b IPv6) SetTOS(t uint8, l uint32) {
+	vtf := (6 << 28) | (uint32(t) << 20) | (l & 0xfffff)
+	binary.BigEndian.PutUint32(b[versTCFL:], vtf)
+}
+
+// SetPayloadLength writes the "payload length" field of the ipv6 header in big-endian order.
+func (b IPv6) SetPayloadLength(payloadLength uint16) {
+	binary.BigEndian.PutUint16(b[IPv6PayloadLenOffset:], payloadLength)
+}
+
+// SetSourceAddress copies at most IPv6AddressSize bytes of addr into the "source address" field.
+func (b IPv6) SetSourceAddress(addr tcpip.Address) {
+	copy(b[v6SrcAddr:][:IPv6AddressSize], addr)
+}
+
+// SetDestinationAddress copies at most IPv6AddressSize bytes of addr into the
+// "destination address" field of the ipv6 header.
+func (b IPv6) SetDestinationAddress(addr tcpip.Address) {
+	copy(b[v6DstAddr:][:IPv6AddressSize], addr)
+}
+
+// SetNextHeader sets the single-byte "next header" field of the ipv6 header to v.
+func (b IPv6) SetNextHeader(v uint8) {
+	b[IPv6NextHeaderOffset] = v
+}
+
+// SetChecksum implements Network.SetChecksum. IPv6 headers carry no checksum,
+// so this is a no-op.
+func (IPv6) SetChecksum(uint16) {
+}
+
+// Encode writes all fields of i into the ipv6 header b; SetTOS also writes the version.
+func (b IPv6) Encode(i *IPv6Fields) {
+	b.SetTOS(i.TrafficClass, i.FlowLabel)
+	b.SetPayloadLength(i.PayloadLength)
+	b[IPv6NextHeaderOffset] = i.NextHeader
+	b[hopLimit] = i.HopLimit
+	b.SetSourceAddress(i.SrcAddr)
+	b.SetDestinationAddress(i.DstAddr)
+}
+
+// IsValid reports whether b passes basic sanity checks for a packet of pktSize bytes.
+func (b IPv6) IsValid(pktSize int) bool {
+	// The minimum header must be present before any field is read.
+	if len(b) < IPv6MinimumSize {
+		return false
+	}
+
+	// The advertised payload must fit in what is left of the packet once
+	// the minimum header has been accounted for.
+	maxPayload := pktSize - IPv6MinimumSize
+	if int(b.PayloadLength()) > maxPayload {
+		return false
+	}
+
+	// Lastly, the version nibble must identify IPv6.
+	return IPVersion(b) == IPv6Version
+}
+
+// IsV4MappedAddress determines if the provided address is an IPv4 mapped
+// address by checking if its prefix is 0:0:0:0:0:ffff::/96.
+func IsV4MappedAddress(addr tcpip.Address) bool {
+	if len(addr) != IPv6AddressSize {
+		return false
+	}
+
+	return strings.HasPrefix(string(addr), "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff")
+}
+
+// IsV6MulticastAddress reports whether addr is a full-length IPv6 address
+// whose first byte is 0xff, which marks it as multicast.
+func IsV6MulticastAddress(addr tcpip.Address) bool {
+	if len(addr) == IPv6AddressSize {
+		return addr[0] == 0xff
+	}
+	return false
+}
+
+// IsV6UnicastAddress reports whether addr is a valid, specified IPv6 unicast
+// address. It returns true only when addr is exactly IPv6AddressSize bytes
+// long, differs from the unspecified address and does not start with the
+// multicast marker byte 0xff.
+func IsV6UnicastAddress(addr tcpip.Address) bool {
+	switch {
+	case len(addr) != IPv6AddressSize:
+		return false
+	case addr == IPv6Any:
+		// The unspecified address is not unicast.
+		return false
+	case addr[0] == 0xff:
+		// A leading 0xff byte marks a multicast address.
+		return false
+	}
+	return true
+}
+
+// SolicitedNodeAddr computes the solicited-node multicast address. This is
+// used for NDP. Described in RFC 4291. The argument must be a full-length IPv6
+// address.
+func SolicitedNodeAddr(addr tcpip.Address) tcpip.Address {
+	const solicitedNodeMulticastPrefix = "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff"
+	return solicitedNodeMulticastPrefix + addr[len(addr)-3:]
+}
+
+// EthernetAdddressToModifiedEUI64IntoBuf populates buf with a modified EUI-64
+// from a 48-bit Ethernet/MAC address, as per RFC 4291 section 2.5.1.
+//
+// buf MUST be at least 8 bytes and linkAddr at least 6 bytes, else it panics.
+func EthernetAdddressToModifiedEUI64IntoBuf(linkAddr tcpip.LinkAddress, buf []byte) {
+	buf[0] = linkAddr[0] ^ 2
+	buf[1] = linkAddr[1]
+	buf[2] = linkAddr[2]
+	buf[3] = 0xFF
+	buf[4] = 0xFE
+	buf[5] = linkAddr[3]
+	buf[6] = linkAddr[4]
+	buf[7] = linkAddr[5]
+}
+
+// EthernetAddressToModifiedEUI64 computes a modified EUI-64 from a 48-bit
+// Ethernet/MAC address, as per RFC 4291 section 2.5.1.
+func EthernetAddressToModifiedEUI64(linkAddr tcpip.LinkAddress) [IIDSize]byte {
+	var buf [IIDSize]byte
+	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, buf[:])
+	return buf
+}
+
+// LinkLocalAddr computes the default IPv6 link-local address from a link-layer
+// (MAC) address.
+func LinkLocalAddr(linkAddr tcpip.LinkAddress) tcpip.Address {
+	// Convert a 48-bit MAC to a modified EUI-64 and then prepend the
+	// link-local header, FE80::.
+	//
+	// The conversion is very nearly:
+	//	aa:bb:cc:dd:ee:ff => FE80::Aabb:ccFF:FEdd:eeff
+	// Note the capital A. The conversion aa->Aa involves a bit flip.
+	lladdrb := [IPv6AddressSize]byte{
+		0: 0xFE,
+		1: 0x80,
+	}
+	EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, lladdrb[IIDOffsetInIPv6Address:])
+	return tcpip.Address(lladdrb[:])
+}
+
+// IsV6LinkLocalAddress reports whether addr is a full-length IPv6 address
+// within the link-local range fe80::/10.
+func IsV6LinkLocalAddress(addr tcpip.Address) bool {
+	if len(addr) == IPv6AddressSize {
+		return addr[0] == 0xfe && addr[1]&0xc0 == 0x80
+	}
+	return false
+}
+
+// IsV6LinkLocalMulticastAddress determines if the provided address is an IPv6
+// multicast address whose scope (scop) field indicates link-local scope.
+func IsV6LinkLocalMulticastAddress(addr tcpip.Address) bool {
+	return IsV6MulticastAddress(addr) && addr[ipv6MulticastAddressScopeByteIdx]&ipv6MulticastAddressScopeMask == ipv6LinkLocalMulticastScope
+}
+
+// IsV6UniqueLocalAddress reports whether addr is a full-length IPv6
+// unique-local address, i.e. one within the prefix FC00::/7.
+func IsV6UniqueLocalAddress(addr tcpip.Address) bool {
+	// RFC 4193 section 3.1 assigns the prefix FC00::/7 to unique local
+	// addresses, so only the top seven bits of the first byte matter.
+	if len(addr) == IPv6AddressSize {
+		return addr[0]&0xfe == 0xfc
+	}
+	return false
+}
+
+// AppendOpaqueInterfaceIdentifier appends a 64 bit opaque interface identifier
+// (IID), generated as outlined by RFC 7217, to buf and returns the result.
+//
+// The opaque IID is taken from the cryptographic hash of the concatenation of
+// the prefix, the NIC's name, the DAD counter (DAD retry counter) and the
+// secret key. The secret key SHOULD be at least OpaqueIIDSecretKeyMinBytes
+// bytes and MUST be initialized to a pseudo-random number. See RFC 4086 for
+// randomness requirements for security.
+//
+// If buf has enough spare capacity for the IID (IIDSize bytes), no new
+// underlying array is allocated for the returned buffer.
+func AppendOpaqueInterfaceIdentifier(buf []byte, prefix tcpip.Subnet, nicName string, dadCounter uint8, secretKey []byte) []byte {
+	// RFC 7217 section 5 allows the opaque identifier to be a cryptographic
+	// hash over the concatenated parameters. The optional Network_ID field is
+	// left out.
+	hasher := sha256.New()
+	for _, part := range [][]byte{
+		[]byte(prefix.ID()[:IIDOffsetInIPv6Address]),
+		[]byte(nicName),
+		{dadCounter},
+		secretKey,
+	} {
+		hasher.Write(part) // Writes to a hash never return an error.
+	}
+	var digest [sha256.Size]byte
+	return append(buf, hasher.Sum(digest[:0])[:IIDSize]...)
+}
+
+// LinkLocalAddrWithOpaqueIID computes the default IPv6 link-local address with
+// an opaque IID.
+func LinkLocalAddrWithOpaqueIID(nicName string, dadCounter uint8, secretKey []byte) tcpip.Address {
+	lladdrb := [IPv6AddressSize]byte{
+		0: 0xFE,
+		1: 0x80,
+	}
+
+	return tcpip.Address(AppendOpaqueInterfaceIdentifier(lladdrb[:IIDOffsetInIPv6Address], IPv6LinkLocalPrefix.Subnet(), nicName, dadCounter, secretKey))
+}
+
+// IPv6AddressScope is the scope of an IPv6 address, as reported by ScopeForIPv6Address.
+type IPv6AddressScope int
+
+const (
+	// LinkLocalScope indicates a link-local (unicast or multicast) address.
+	LinkLocalScope IPv6AddressScope = iota
+
+	// UniqueLocalScope indicates a unique-local address.
+	UniqueLocalScope
+
+	// GlobalScope indicates a global address.
+	GlobalScope
+)
+
+// ScopeForIPv6Address classifies addr by scope; it fails with ErrBadAddress if addr is not IPv6AddressSize bytes.
+func ScopeForIPv6Address(addr tcpip.Address) (IPv6AddressScope, *tcpip.Error) {
+	if len(addr) != IPv6AddressSize {
+		return GlobalScope, tcpip.ErrBadAddress
+	}
+
+	// Link-local scope covers both the multicast and the unicast link-local
+	// ranges; the multicast check is made first.
+	if IsV6LinkLocalMulticastAddress(addr) || IsV6LinkLocalAddress(addr) {
+		return LinkLocalScope, nil
+	}
+
+	// Unique-local addresses are the only other non-global scope reported.
+	if IsV6UniqueLocalAddress(addr) {
+		return UniqueLocalScope, nil
+	}
+
+	// Everything else is treated as global.
+	return GlobalScope, nil
+}
+
+// InitialTempIID writes the initial temporary IID history value, used when
+// generating temporary SLAAC addresses, into initialTempIIDHistory.
+//
+// Panics if initialTempIIDHistory is not at least IIDSize bytes.
+func InitialTempIID(initialTempIIDHistory []byte, seed []byte, nicID tcpip.NICID) {
+	var nicIDBytes [4]byte
+	binary.BigEndian.PutUint32(nicIDBytes[:], uint32(nicID))
+	// Hash the seed followed by the big-endian NIC ID. Writes to a hash
+	// never return an error.
+	hasher := sha256.New()
+	hasher.Write(seed)
+	hasher.Write(nicIDBytes[:])
+	var digest [sha256.Size]byte
+	tail := hasher.Sum(digest[:0])[sha256.Size-IIDSize:]
+
+	if copied := copy(initialTempIIDHistory, tail); copied != IIDSize {
+		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", copied, IIDSize))
+	}
+}
+
+// GenerateTempIPv6SLAACAddr returns a temporary SLAAC IPv6 address derived
+// from the given stable/permanent SLAAC address.
+//
+// As a side effect, tempIIDHistory is overwritten with the history value to
+// use the next time a temporary IID is generated.
+//
+// Panics if tempIIDHistory is shorter than IIDSize bytes or stableAddr is not a full-length IPv6 address.
+func GenerateTempIPv6SLAACAddr(tempIIDHistory []byte, stableAddr tcpip.Address) tcpip.AddressWithPrefix {
+	addr := []byte(stableAddr)
+	iid := addr[IIDOffsetInIPv6Address:]
+	hasher := sha256.New()
+	hasher.Write(tempIIDHistory)
+	hasher.Write(iid)
+	var digest [sha256.Size]byte
+	sum := hasher.Sum(digest[:0])
+
+	// Keep the rightmost 64 bits of the digest as the next history value.
+	if copied := copy(tempIIDHistory, sum[sha256.Size-IIDSize:]); copied != IIDSize {
+		panic(fmt.Sprintf("copied %d bytes, expected %d bytes", copied, IIDSize))
+	}
+	// Use the leftmost 64 bits of the digest as the new IID.
+	if copied := copy(iid, sum); copied != IIDSize {
+		panic(fmt.Sprintf("copied %d IID bytes, expected %d bytes", copied, IIDSize))
+	}
+
+	return tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(addr),
+		PrefixLen: IIDOffsetInIPv6Address * 8,
+	}
+}
diff --git a/pkg/tcpip/header/ipv6_extension_headers.go b/pkg/tcpip/header/ipv6_extension_headers.go
new file mode 100644
index 000000000..3499d8399
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_extension_headers.go
@@ -0,0 +1,551 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// IPv6ExtensionHeaderIdentifier is an IPv6 extension header identifier, as found in a Next Header field.
+type IPv6ExtensionHeaderIdentifier uint8
+
+const (
+	// IPv6HopByHopOptionsExtHdrIdentifier is the header identifier of a Hop by
+	// Hop Options extension header, as per RFC 8200 section 4.3.
+	IPv6HopByHopOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 0
+
+	// IPv6RoutingExtHdrIdentifier is the header identifier of a Routing extension
+	// header, as per RFC 8200 section 4.4.
+	IPv6RoutingExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 43
+
+	// IPv6FragmentExtHdrIdentifier is the header identifier of a Fragment
+	// extension header, as per RFC 8200 section 4.5.
+	IPv6FragmentExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 44
+
+	// IPv6DestinationOptionsExtHdrIdentifier is the header identifier of a
+	// Destination Options extension header, as per RFC 8200 section 4.6.
+	IPv6DestinationOptionsExtHdrIdentifier IPv6ExtensionHeaderIdentifier = 60
+
+	// IPv6NoNextHeaderIdentifier is the header identifier used to signify the end
+	// of an IPv6 payload, as per RFC 8200 section 4.7.
+	IPv6NoNextHeaderIdentifier IPv6ExtensionHeaderIdentifier = 59
+)
+
+const (
+	// ipv6UnknownExtHdrOptionActionMask is the mask of the action to take when
+	// a node encounters an unrecognized option.
+	ipv6UnknownExtHdrOptionActionMask = 192
+
+	// ipv6UnknownExtHdrOptionActionShift is the least significant bits to discard
+	// from the action value for an unrecognized option identifier.
+	ipv6UnknownExtHdrOptionActionShift = 6
+
+	// ipv6RoutingExtHdrSegmentsLeftIdx is the index to the Segments Left field
+	// within an IPv6RoutingExtHdr.
+	ipv6RoutingExtHdrSegmentsLeftIdx = 1
+
+	// IPv6FragmentExtHdrLength is the length of an IPv6 Fragment extension
+	// header, in bytes.
+	IPv6FragmentExtHdrLength = 8
+
+	// ipv6FragmentExtHdrFragmentOffsetOffset is the offset to the start of the
+	// Fragment Offset field within an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrFragmentOffsetOffset = 0
+
+	// ipv6FragmentExtHdrFragmentOffsetShift is the least significant bits to
+	// discard from the Fragment Offset.
+	ipv6FragmentExtHdrFragmentOffsetShift = 3
+
+	// ipv6FragmentExtHdrFlagsIdx is the index to the flags field within an
+	// IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrFlagsIdx = 1
+
+	// ipv6FragmentExtHdrMFlagMask is the mask of the More (M) flag within the
+	// flags field of an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrMFlagMask = 1
+
+	// ipv6FragmentExtHdrIdentificationOffset is the offset to the Identification
+	// field within an IPv6FragmentExtHdr.
+	ipv6FragmentExtHdrIdentificationOffset = 2
+
+	// ipv6ExtHdrLenBytesPerUnit is the unit size of an extension header's length
+	// field. That is, given a Length field of 2, the extension header expects
+	// 16 bytes following the first 8 bytes (see ipv6ExtHdrLenBytesExcluded for
+	// details about the first 8 bytes' exclusion from the Length field).
+	ipv6ExtHdrLenBytesPerUnit = 8
+
+	// ipv6ExtHdrLenBytesExcluded is the number of bytes excluded from an
+	// extension header's Length field following the Length field.
+	//
+	// The Length field excludes the first 8 bytes, but the Next Header and Length
+	// field take up the first 2 of the 8 bytes so we expect (at minimum) 6 bytes
+	// after the Length field.
+	//
+	// This ensures that every extension header is at least 8 bytes.
+	ipv6ExtHdrLenBytesExcluded = 6
+
+	// IPv6FragmentExtHdrFragmentOffsetBytesPerUnit is the unit size of a Fragment
+	// extension header's Fragment Offset field. That is, given a Fragment Offset
+	// of 2, the extension header is indicating that the fragment's payload
+	// starts at the 16th byte in the reassembled packet.
+	IPv6FragmentExtHdrFragmentOffsetBytesPerUnit = 8
+)
+
+// IPv6PayloadHeader is implemented by the various headers that can be found
+// in an IPv6 payload.
+//
+// These headers are either IPv6 extension headers or upper layer data.
+type IPv6PayloadHeader interface {
+	isIPv6PayloadHeader()
+}
+
+// IPv6RawPayloadHeader is the remainder of an IPv6 payload after an iterator
+// encounters a Next Header field it does not recognize as an IPv6 extension
+// header.
+type IPv6RawPayloadHeader struct {
+	Identifier IPv6ExtensionHeaderIdentifier
+	Buf        buffer.VectorisedView
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6RawPayloadHeader) isIPv6PayloadHeader() {}
+
+// ipv6OptionsExtHdr is the options data of an options-bearing IPv6 extension header.
+type ipv6OptionsExtHdr []byte
+
+// Iter returns an iterator positioned at the first option held in b.
+func (b ipv6OptionsExtHdr) Iter() IPv6OptionsExtHdrOptionsIterator {
+	var iter IPv6OptionsExtHdrOptionsIterator
+	iter.reader.Reset(b)
+	return iter
+}
+
+// IPv6OptionsExtHdrOptionsIterator is an iterator over IPv6 extension header
+// options.
+//
+// Note, between when an IPv6OptionsExtHdrOptionsIterator is obtained and last
+// used, the underlying buffer must not change. Changing it may cause
+// undefined and unexpected behaviour. It is fine to obtain an
+// IPv6OptionsExtHdrOptionsIterator, iterate over the first few options then
+// modify the backing payload so long as the IPv6OptionsExtHdrOptionsIterator
+// obtained before modification is no longer used.
+type IPv6OptionsExtHdrOptionsIterator struct {
+	reader bytes.Reader
+}
+
+// IPv6OptionUnknownAction is the action a node must take for an option it does
+// not recognize (RFC 8200 section 4.2); it is the option identifier's top two bits.
+type IPv6OptionUnknownAction int
+
+const (
+	// IPv6OptionUnknownActionSkip indicates that the unrecognized option must
+	// be skipped and the node should continue processing the header.
+	IPv6OptionUnknownActionSkip IPv6OptionUnknownAction = 0
+
+	// IPv6OptionUnknownActionDiscard indicates that the packet must be silently
+	// discarded.
+	IPv6OptionUnknownActionDiscard IPv6OptionUnknownAction = 1
+
+	// IPv6OptionUnknownActionDiscardSendICMP indicates that the packet must be
+	// discarded and the node must send an ICMP Parameter Problem, Code 2, message
+	// to the packet's source, regardless of whether or not the packet's
+	// Destination was a multicast address.
+	IPv6OptionUnknownActionDiscardSendICMP IPv6OptionUnknownAction = 2
+
+	// IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest indicates that the
+	// packet must be discarded and the node must send an ICMP Parameter Problem,
+	// Code 2, message to the packet's source only if the packet's Destination was
+	// not a multicast address.
+	IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest IPv6OptionUnknownAction = 3
+)
+
+// IPv6ExtHdrOption is implemented by the various IPv6 extension header options.
+type IPv6ExtHdrOption interface {
+	// UnknownAction returns the action to take if the option is not
+	// recognized by the processing node.
+	UnknownAction() IPv6OptionUnknownAction
+
+	// isIPv6ExtHdrOption is used to "lock" this interface so it is not
+	// implemented by other packages.
+	isIPv6ExtHdrOption()
+}
+
+// IPv6ExtHdrOptionIndentifier is an IPv6 extension header option identifier.
+type IPv6ExtHdrOptionIndentifier uint8
+
+const (
+	// ipv6Pad1ExtHdrOptionIdentifier is the identifier for a padding option that
+	// provides 1 byte padding, as outlined in RFC 8200 section 4.2.
+	ipv6Pad1ExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 0
+
+	// ipv6PadNExtHdrOptionIdentifier is the identifier for a padding option that
+	// provides variable length byte padding, as outlined in RFC 8200 section 4.2.
+	ipv6PadNExtHdrOptionIdentifier IPv6ExtHdrOptionIndentifier = 1
+)
+
+// IPv6UnknownExtHdrOption holds the identifier and data for an IPv6 extension
+// header option that is unknown by the parsing utilities.
+type IPv6UnknownExtHdrOption struct {
+	Identifier IPv6ExtHdrOptionIndentifier
+	Data       []byte
+}
+
+// UnknownAction implements IPv6ExtHdrOption.UnknownAction.
+func (o *IPv6UnknownExtHdrOption) UnknownAction() IPv6OptionUnknownAction {
+	return IPv6OptionUnknownAction((o.Identifier & ipv6UnknownExtHdrOptionActionMask) >> ipv6UnknownExtHdrOptionActionShift)
+}
+
+// isIPv6ExtHdrOption implements IPv6ExtHdrOption.isIPv6ExtHdrOption.
+func (*IPv6UnknownExtHdrOption) isIPv6ExtHdrOption() {}
+
+// Next returns the next option in the options data.
+//
+// If the next item is not a known extension header option,
+// IPv6UnknownExtHdrOption will be returned with the option identifier and data.
+//
+// The return is of the format (option, done, error). done will be true when
+// Next is unable to return anything because the iterator has reached the end of
+// the options data, or an error occurred.
+func (i *IPv6OptionsExtHdrOptionsIterator) Next() (IPv6ExtHdrOption, bool, error) {
+	for {
+		temp, err := i.reader.ReadByte()
+		if err != nil {
+			// If we can't read the first byte of a new option, then we know the
+			// options buffer has been exhausted and we are done iterating.
+			return nil, true, nil
+		}
+		id := IPv6ExtHdrOptionIndentifier(temp)
+
+		// If the option identifier indicates the option is a Pad1 option, then we
+		// know the option does not have Length and Data fields. End processing of
+		// the Pad1 option and continue processing the buffer as a new option.
+		if id == ipv6Pad1ExtHdrOptionIdentifier {
+			continue
+		}
+
+		length, err := i.reader.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				// ReadByte should only ever return nil or io.EOF.
+				panic(fmt.Sprintf("unexpected error when reading the option's Length field for option with id = %d: %s", id, err))
+			}
+
+			// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once
+			// we start parsing an option; we expect the reader to contain enough
+			// bytes for the whole option.
+			return nil, true, fmt.Errorf("error when reading the option's Length field for option with id = %d: %w", id, io.ErrUnexpectedEOF)
+		}
+
+		// Special-case the variable length padding option to avoid a copy.
+		if id == ipv6PadNExtHdrOptionIdentifier {
+			// Do we have enough bytes in the reader for the PadN option?
+			if n := i.reader.Len(); n < int(length) {
+				// Reset the reader to effectively consume the remaining buffer.
+				i.reader.Reset(nil)
+
+				// We return the same error as if we failed to read a non-padding option
+				// so consumers of this iterator don't need to differentiate between
+				// padding and non-padding options.
+				return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, io.ErrUnexpectedEOF)
+			}
+
+			if _, err := i.reader.Seek(int64(length), io.SeekCurrent); err != nil {
+				panic(fmt.Sprintf("error when skipping PadN (N = %d) option's data bytes: %s", length, err))
+			}
+
+			// End processing of the PadN option and continue processing the buffer as
+			// a new option.
+			continue
+		}
+
+		// The option data is copied out of the reader into a fresh slice, named
+		// data so that it does not shadow the imported bytes package.
+		data := make([]byte, length)
+		if n, err := io.ReadFull(&i.reader, data); err != nil {
+			// io.ReadFull may return io.EOF if i.reader has been exhausted. We use
+			// io.ErrUnexpectedEOF instead, as io.EOF is unexpected given Length.
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+
+			return nil, true, fmt.Errorf("read %d out of %d option data bytes for option with id = %d: %w", n, length, id, err)
+		}
+
+		return &IPv6UnknownExtHdrOption{Identifier: id, Data: data}, false, nil
+	}
+}
+
+// IPv6HopByHopOptionsExtHdr is a buffer holding the options of a Hop By Hop
+// Options extension header.
+type IPv6HopByHopOptionsExtHdr struct {
+	ipv6OptionsExtHdr
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6HopByHopOptionsExtHdr) isIPv6PayloadHeader() {}
+
+// IPv6DestinationOptionsExtHdr is a buffer holding the options of a
+// Destination Options extension header.
+type IPv6DestinationOptionsExtHdr struct {
+	ipv6OptionsExtHdr
+}
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6DestinationOptionsExtHdr) isIPv6PayloadHeader() {}
+
+// IPv6RoutingExtHdr is a buffer holding the Routing extension header specific
+// data as outlined in RFC 8200 section 4.4.
+type IPv6RoutingExtHdr []byte
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6RoutingExtHdr) isIPv6PayloadHeader() {}
+
+// SegmentsLeft returns the Segments Left field, the byte at index 1 of b.
+func (b IPv6RoutingExtHdr) SegmentsLeft() uint8 {
+	return b[ipv6RoutingExtHdrSegmentsLeftIdx]
+}
+
+// IPv6FragmentExtHdr is a buffer holding the Fragment extension header specific
+// data as outlined in RFC 8200 section 4.5.
+//
+// Note, the buffer does not include the Next Header and Reserved fields, so it is 6 bytes.
+type IPv6FragmentExtHdr [6]byte
+
+// isIPv6PayloadHeader implements IPv6PayloadHeader.isIPv6PayloadHeader.
+func (IPv6FragmentExtHdr) isIPv6PayloadHeader() {}
+
+// FragmentOffset returns the Fragment Offset field.
+//
+// This value indicates where the buffer following the Fragment extension header
+// starts in the target (reassembled) packet, in units of 8 bytes.
+func (b IPv6FragmentExtHdr) FragmentOffset() uint16 {
+	return binary.BigEndian.Uint16(b[ipv6FragmentExtHdrFragmentOffsetOffset:]) >> ipv6FragmentExtHdrFragmentOffsetShift
+}
+
+// More returns the More (M) flag.
+//
+// This indicates whether any fragments are expected to follow b.
+func (b IPv6FragmentExtHdr) More() bool {
+	return b[ipv6FragmentExtHdrFlagsIdx]&ipv6FragmentExtHdrMFlagMask != 0
+}
+
+// ID returns the Identification field.
+//
+// This value is used to uniquely identify the packet, between a
+// source and destination.
+func (b IPv6FragmentExtHdr) ID() uint32 {
+	return binary.BigEndian.Uint32(b[ipv6FragmentExtHdrIdentificationOffset:])
+}
+
+// IsAtomic reports whether the fragment header describes an atomic fragment,
+// meaning a fragment that by itself carries all the data required to
+// reassemble a full packet: no more fragments follow and its offset is zero.
+func (b IPv6FragmentExtHdr) IsAtomic() bool {
+	return !(b.More() || b.FragmentOffset() != 0)
+}
+
+// IPv6PayloadIterator is an iterator over the contents of an IPv6 payload.
+//
+// The IPv6 payload may contain IPv6 extension headers before any upper layer
+// data.
+//
+// Note, between when an IPv6PayloadIterator is obtained and last used, the
+// payload must not change. Changing it may cause undefined and
+// unexpected behaviour. It is fine to obtain an IPv6PayloadIterator, iterate
+// over the first few headers then modify the backing payload so long as the
+// IPv6PayloadIterator obtained before modification is no longer used.
+type IPv6PayloadIterator struct {
+	// The identifier of the next header to parse.
+	nextHdrIdentifier IPv6ExtensionHeaderIdentifier
+
+	// reader is a buffered reader over payload.
+	reader  bufio.Reader
+	payload buffer.VectorisedView
+
+	// Indicates to the iterator that it should return the remaining payload as a
+	// raw payload on the next call to Next.
+	forceRaw bool
+}
+
+// MakeIPv6PayloadIterator returns an iterator over the IPv6 payload containing
+// extension headers, or a raw payload if the payload cannot be parsed.
+func MakeIPv6PayloadIterator(nextHdrIdentifier IPv6ExtensionHeaderIdentifier, payload buffer.VectorisedView) IPv6PayloadIterator {
+	readers := payload.Readers()
+	readerPs := make([]io.Reader, 0, len(readers))
+	for i := range readers {
+		readerPs = append(readerPs, &readers[i]) // pointer to the slice element itself, not to a loop copy
+	}
+
+	return IPv6PayloadIterator{
+		nextHdrIdentifier: nextHdrIdentifier,
+		payload:           payload.Clone(nil), // iterator-owned clone; trimmed as headers are consumed
+		// We need a buffer of size 1 for calls to bufio.Reader.ReadByte.
+		reader: *bufio.NewReaderSize(io.MultiReader(readerPs...), 1),
+	}
+}
+
+// AsRawHeader returns the remaining payload of i as a raw header and
+// optionally consumes the iterator.
+//
+// If consume is true, calls to Next after calling AsRawHeader on i will
+// indicate that the iterator is done.
+func (i *IPv6PayloadIterator) AsRawHeader(consume bool) IPv6RawPayloadHeader {
+	identifier := i.nextHdrIdentifier
+
+	var buf buffer.VectorisedView
+	if consume {
+		// i is reset just below, so its payload can be handed over without cloning.
+		buf = i.payload
+
+		// Mark i as done.
+		*i = IPv6PayloadIterator{
+			nextHdrIdentifier: IPv6NoNextHeaderIdentifier,
+		}
+	} else {
+		buf = i.payload.Clone(nil) // i stays in use, so return a clone instead of i's own payload
+	}
+
+	return IPv6RawPayloadHeader{Identifier: identifier, Buf: buf}
+}
+
+// Next returns the next item in the payload.
+//
+// If the next item is not a known IPv6 extension header, IPv6RawPayloadHeader
+// will be returned with the remaining bytes and next header identifier.
+//
+// The return is of the format (header, done, error). done will be true when
+// Next is unable to return anything because the iterator has reached the end of
+// the payload, or an error occurred.
+func (i *IPv6PayloadIterator) Next() (IPv6PayloadHeader, bool, error) {
+	// We could be forced to return i as a raw header when the previous header was
+	// a fragment extension header as the data following the fragment extension
+	// header may not be complete.
+	if i.forceRaw {
+		return i.AsRawHeader(true /* consume */), false, nil
+	}
+
+	// Is the header we are parsing a known extension header?
+	switch i.nextHdrIdentifier {
+	case IPv6HopByHopOptionsExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil
+	case IPv6RoutingExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6RoutingExtHdr(bytes), false, nil
+	case IPv6FragmentExtHdrIdentifier:
+		var data [6]byte
+		// We ignore the returned bytes because we know the fragment extension
+		// header specific data will fit in data.
+		nextHdrIdentifier, _, err := i.nextHeaderData(true /* fragmentHdr */, data[:])
+		if err != nil {
+			return nil, true, err
+		}
+
+		fragmentExtHdr := IPv6FragmentExtHdr(data)
+
+		// If the packet is not the first fragment, do not attempt to parse anything
+		// after the fragment extension header as the payload following the fragment
+		// extension header should not contain any headers; the first fragment must
+		// hold all the headers up to and including any upper layer headers, as per
+		// RFC 8200 section 4.5.
+		if fragmentExtHdr.FragmentOffset() != 0 {
+			i.forceRaw = true
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return fragmentExtHdr, false, nil
+	case IPv6DestinationOptionsExtHdrIdentifier:
+		nextHdrIdentifier, bytes, err := i.nextHeaderData(false /* fragmentHdr */, nil)
+		if err != nil {
+			return nil, true, err
+		}
+
+		i.nextHdrIdentifier = nextHdrIdentifier
+		return IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: bytes}, false, nil
+	case IPv6NoNextHeaderIdentifier:
+		// This indicates the end of the IPv6 payload.
+		return nil, true, nil
+
+	default:
+		// The header we are parsing is not a known extension header. Return the
+		// raw payload.
+		return i.AsRawHeader(true /* consume */), false, nil
+	}
+}
+
+// nextHeaderData returns the extension header's Next Header field and raw data.
+//
+// fragmentHdr indicates that the extension header being parsed is the Fragment
+// extension header so the Length field should be ignored as it is Reserved
+// for the Fragment extension header.
+//
+// If bytes is not nil, the extension header specific data is read into bytes.
+// If bytes is provided but len(bytes) is smaller than the size of that data,
+// nextHeaderData will panic.
+func (i *IPv6PayloadIterator) nextHeaderData(fragmentHdr bool, bytes []byte) (IPv6ExtensionHeaderIdentifier, []byte, error) {
+	// ReadByte yields exactly one byte or a non-nil error (io.EOF once the
+	// reader is exhausted), so there is no byte count to check. i.payload is
+	// trimmed in step with every read so that it mirrors what is left in the
+	// reader.
+	nextHdrIdentifier, err := i.reader.ReadByte()
+	i.payload.TrimFront(1)
+	if err != nil {
+		return 0, nil, fmt.Errorf("error when reading the Next Header field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+	}
+
+	var length uint8
+	length, err = i.reader.ReadByte()
+	i.payload.TrimFront(1)
+	if err != nil {
+		if fragmentHdr {
+			return 0, nil, fmt.Errorf("error when reading the Reserved field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+		}
+
+		return 0, nil, fmt.Errorf("error when reading the Length field for extension header with id = %d: %w", i.nextHdrIdentifier, err)
+	}
+	if fragmentHdr {
+		length = 0 // the byte just read is Reserved here, not a length
+	}
+
+	bytesLen := int(length)*ipv6ExtHdrLenBytesPerUnit + ipv6ExtHdrLenBytesExcluded
+	if bytes == nil {
+		bytes = make([]byte, bytesLen)
+	} else if n := len(bytes); n < bytesLen {
+		panic(fmt.Sprintf("bytes only has space for %d bytes but need space for %d bytes (length = %d) for extension header with id = %d", n, bytesLen, length, i.nextHdrIdentifier))
+	}
+
+	n, err := io.ReadFull(&i.reader, bytes)
+	i.payload.TrimFront(n)
+	if err != nil {
+		return 0, nil, fmt.Errorf("read %d out of %d extension header data bytes (length = %d) for header with id = %d: %w", n, bytesLen, length, i.nextHdrIdentifier, err)
+	}
+
+	return IPv6ExtensionHeaderIdentifier(nextHdrIdentifier), bytes, nil
+}
diff --git a/pkg/tcpip/header/ipv6_extension_headers_test.go b/pkg/tcpip/header/ipv6_extension_headers_test.go
new file mode 100644
index 000000000..ab20c5f37
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_extension_headers_test.go
@@ -0,0 +1,992 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// Equal returns true if a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold the same Identifier value and
+// contain the same bytes in Buf, even if the bytes are split across views
+// differently.
+//
+// Needed to use cmp.Equal on IPv6RawPayloadHeader as it contains unexported
+// fields.
+func (a IPv6RawPayloadHeader) Equal(b IPv6RawPayloadHeader) bool {
+	return a.Identifier == b.Identifier && bytes.Equal(a.Buf.ToView(), b.Buf.ToView())
+}
+
+// Equal returns true if a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold equivalent ipv6OptionsExtHdrs.
+//
+// Needed to use cmp.Equal on IPv6HopByHopOptionsExtHdr as it contains
+// unexported fields.
+func (a IPv6HopByHopOptionsExtHdr) Equal(b IPv6HopByHopOptionsExtHdr) bool {
+	return bytes.Equal(a.ipv6OptionsExtHdr, b.ipv6OptionsExtHdr)
+}
+
+// Equal returns true if a and b are equivalent.
+//
+// Note, Equal will return true if a and b hold equivalent ipv6OptionsExtHdrs.
+//
+// Needed to use cmp.Equal on IPv6DestinationOptionsExtHdr as it contains
+// unexported fields.
+func (a IPv6DestinationOptionsExtHdr) Equal(b IPv6DestinationOptionsExtHdr) bool {
+	return bytes.Equal(a.ipv6OptionsExtHdr, b.ipv6OptionsExtHdr)
+}
+
+func TestIPv6UnknownExtHdrOption(t *testing.T) {
+	tests := []struct {
+		name                  string
+		identifier            IPv6ExtHdrOptionIndentifier
+		expectedUnknownAction IPv6OptionUnknownAction
+	}{
+		{
+			name:                  "Skip with zero LSBs",
+			identifier:            0, // 0b00 in the two highest-order bits
+			expectedUnknownAction: IPv6OptionUnknownActionSkip,
+		},
+		{
+			name:                  "Discard with zero LSBs",
+			identifier:            64, // 0b01 in the two highest-order bits
+			expectedUnknownAction: IPv6OptionUnknownActionDiscard,
+		},
+		{
+			name:                  "Discard and ICMP with zero LSBs",
+			identifier:            128, // 0b10 in the two highest-order bits
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMP,
+		},
+		{
+			name:                  "Discard and ICMP for non multicast destination with zero LSBs",
+			identifier:            192, // 0b11 in the two highest-order bits
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest,
+		},
+		{
+			name:                  "Skip with non-zero LSBs",
+			identifier:            63, // 0b00 prefix with all six lower bits set
+			expectedUnknownAction: IPv6OptionUnknownActionSkip,
+		},
+		{
+			name:                  "Discard with non-zero LSBs",
+			identifier:            127, // 0b01 prefix with all six lower bits set
+			expectedUnknownAction: IPv6OptionUnknownActionDiscard,
+		},
+		{
+			name:                  "Discard and ICMP with non-zero LSBs",
+			identifier:            191, // 0b10 prefix with all six lower bits set
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMP,
+		},
+		{
+			name:                  "Discard and ICMP for non multicast destination with non-zero LSBs",
+			identifier:            255, // 0b11 prefix with all six lower bits set
+			expectedUnknownAction: IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opt := &IPv6UnknownExtHdrOption{Identifier: test.identifier, Data: []byte{1, 2, 3, 4}}
+			if a := opt.UnknownAction(); a != test.expectedUnknownAction {
+				t.Fatalf("got UnknownAction() = %d, want = %d", a, test.expectedUnknownAction)
+			}
+		})
+	}
+
+}
+
+func TestIPv6OptionsExtHdrIterErr(t *testing.T) {
+	tests := []struct {
+		name  string
+		bytes []byte
+		err   error // error iteration must end with; nil means it must finish cleanly
+	}{
+		{
+			name:  "Single unknown with zero length",
+			bytes: []byte{255, 0},
+		},
+		{
+			name:  "Single unknown with non-zero length",
+			bytes: []byte{255, 3, 1, 2, 3},
+		},
+		{
+			name: "Two options",
+			bytes: []byte{
+				255, 0,
+				254, 1, 1,
+			},
+		},
+		{
+			name: "Three options",
+			bytes: []byte{
+				255, 0,
+				254, 1, 1,
+				253, 4, 2, 3, 4, 5,
+			},
+		},
+		{
+			name:  "Single unknown only identifier",
+			bytes: []byte{255},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Single unknown too small with length = 1",
+			bytes: []byte{255, 1},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Single unknown too small with length = 2",
+			bytes: []byte{255, 2, 1},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown only identifier",
+			bytes: []byte{
+				255, 0,
+				254,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown missing data",
+			bytes: []byte{
+				255, 0,
+				254, 1,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "Valid first with second unknown too small",
+			bytes: []byte{
+				255, 0,
+				254, 2, 1,
+			},
+			err: io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "One Pad1",
+			bytes: []byte{0},
+		},
+		{
+			name:  "Multiple Pad1",
+			bytes: []byte{0, 0, 0},
+		},
+		{
+			name: "Multiple PadN",
+			bytes: []byte{
+				// Pad3
+				1, 1, 1,
+
+				// Pad5
+				1, 3, 1, 2, 3,
+			},
+		},
+		{
+			name:  "Pad5 too small middle of data buffer",
+			bytes: []byte{1, 3, 1, 2},
+			err:   io.ErrUnexpectedEOF,
+		},
+		{
+			name:  "Pad5 no data",
+			bytes: []byte{1, 3},
+			err:   io.ErrUnexpectedEOF,
+		},
+	}
+
+	check := func(t *testing.T, it IPv6OptionsExtHdrOptionsIterator, expectedErr error) {
+		for i := 0; ; i++ {
+			_, done, err := it.Next()
+			if err != nil {
+				// If we encountered a non-nil error while iterating, make sure it
+				// is the same error as expectedErr.
+				if !errors.Is(err, expectedErr) {
+					t.Fatalf("got %d-th Next() = %v, want = %v", i, err, expectedErr)
+				}
+
+				return
+			}
+			if done {
+				// If we are done (without an error), make sure that we did not expect
+				// an error.
+				if expectedErr != nil {
+					t.Fatalf("expected error when iterating; want = %s", expectedErr)
+				}
+
+				return
+			}
+		}
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Run("Hop By Hop", func(t *testing.T) {
+				extHdr := IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				check(t, extHdr.Iter(), test.err)
+			})
+
+			t.Run("Destination", func(t *testing.T) {
+				extHdr := IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				check(t, extHdr.Iter(), test.err)
+			})
+		})
+	}
+}
+
+func TestIPv6OptionsExtHdrIter(t *testing.T) {
+	tests := []struct {
+		name     string
+		bytes    []byte
+		expected []IPv6ExtHdrOption // options Next must yield, in order; nil means none
+	}{
+		{
+			name:  "Single unknown with zero length",
+			bytes: []byte{255, 0},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{}},
+			},
+		},
+		{
+			name:  "Single unknown with non-zero length",
+			bytes: []byte{255, 3, 1, 2, 3},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{1, 2, 3}},
+			},
+		},
+		{
+			name:  "Single Pad1",
+			bytes: []byte{0}, // padding only: the iterator is expected to yield no option
+		},
+		{
+			name:  "Two Pad1",
+			bytes: []byte{0, 0},
+		},
+		{
+			name:  "Single Pad3",
+			bytes: []byte{1, 1, 1},
+		},
+		{
+			name:  "Single Pad5",
+			bytes: []byte{1, 3, 1, 2, 3},
+		},
+		{
+			name: "Multiple Pad",
+			bytes: []byte{
+				// Pad1
+				0,
+
+				// Pad2
+				1, 0,
+
+				// Pad3
+				1, 1, 1,
+
+				// Pad4
+				1, 2, 1, 2,
+
+				// Pad5
+				1, 3, 1, 2, 3,
+			},
+		},
+		{
+			name: "Multiple options",
+			bytes: []byte{
+				// Pad1
+				0,
+
+				// Unknown
+				255, 0,
+
+				// Pad2
+				1, 0,
+
+				// Unknown
+				254, 1, 1,
+
+				// Pad3
+				1, 1, 1,
+
+				// Unknown
+				253, 4, 2, 3, 4, 5,
+
+				// Pad4
+				1, 2, 1, 2,
+			},
+			expected: []IPv6ExtHdrOption{
+				&IPv6UnknownExtHdrOption{Identifier: 255, Data: []byte{}},
+				&IPv6UnknownExtHdrOption{Identifier: 254, Data: []byte{1}},
+				&IPv6UnknownExtHdrOption{Identifier: 253, Data: []byte{2, 3, 4, 5}},
+			},
+		},
+	}
+
+	checkIter := func(t *testing.T, it IPv6OptionsExtHdrOptionsIterator, expected []IPv6ExtHdrOption) {
+		for i, e := range expected {
+			opt, done, err := it.Next()
+			if err != nil {
+				t.Errorf("(i=%d) Next(): %s", i, err)
+			}
+			if done {
+				t.Errorf("(i=%d) unexpectedly done iterating", i)
+			}
+			if diff := cmp.Diff(e, opt); diff != "" {
+				t.Errorf("(i=%d) got option mismatch (-want +got):\n%s", i, diff)
+			}
+
+			if t.Failed() {
+				t.FailNow()
+			}
+		}
+
+		opt, done, err := it.Next() // one extra call: the iterator must now report done
+		if err != nil {
+			t.Errorf("(last) Next(): %s", err)
+		}
+		if !done {
+			t.Errorf("(last) iterator unexpectedly not done")
+		}
+		if opt != nil {
+			t.Errorf("(last) got Next() = %T, want = nil", opt)
+		}
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			t.Run("Hop By Hop", func(t *testing.T) {
+				extHdr := IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				checkIter(t, extHdr.Iter(), test.expected)
+			})
+
+			t.Run("Destination", func(t *testing.T) {
+				extHdr := IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: test.bytes}
+				checkIter(t, extHdr.Iter(), test.expected)
+			})
+		})
+	}
+}
+
+func TestIPv6RoutingExtHdr(t *testing.T) {
+	tests := []struct {
+		name         string
+		bytes        []byte
+		segmentsLeft uint8
+	}{
+		{
+			name:         "Zeroes",
+			bytes:        []byte{0, 0, 0, 0, 0, 0},
+			segmentsLeft: 0,
+		},
+		{
+			name:         "Ones",
+			bytes:        []byte{1, 1, 1, 1, 1, 1},
+			segmentsLeft: 1,
+		},
+		{
+			name:         "Mixed",
+			bytes:        []byte{1, 2, 3, 4, 5, 6},
+			segmentsLeft: 2, // the only case with distinct bytes: expects the second byte
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			extHdr := IPv6RoutingExtHdr(test.bytes)
+			if got := extHdr.SegmentsLeft(); got != test.segmentsLeft {
+				t.Errorf("got SegmentsLeft() = %d, want = %d", got, test.segmentsLeft)
+			}
+		})
+	}
+}
+
+func TestIPv6FragmentExtHdr(t *testing.T) {
+	tests := []struct {
+		name           string
+		bytes          [6]byte
+		fragmentOffset uint16
+		more           bool
+		id             uint32
+	}{
+		{
+			name:           "Zeroes",
+			bytes:          [6]byte{0, 0, 0, 0, 0, 0},
+			fragmentOffset: 0,
+			more:           false,
+			id:             0,
+		},
+		{
+			name:           "Ones",
+			bytes:          [6]byte{0, 9, 0, 0, 0, 1}, // 0x0009: offset = 9>>3 = 1, lowest bit (M) set
+			fragmentOffset: 1,
+			more:           true,
+			id:             1,
+		},
+		{
+			name:           "Mixed",
+			bytes:          [6]byte{68, 9, 128, 4, 2, 1}, // 0x4409>>3 = 2177, M set; ID = 0x80040201
+			fragmentOffset: 2177,
+			more:           true,
+			id:             2147746305,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			extHdr := IPv6FragmentExtHdr(test.bytes)
+			if got := extHdr.FragmentOffset(); got != test.fragmentOffset {
+				t.Errorf("got FragmentOffset() = %d, want = %d", got, test.fragmentOffset)
+			}
+			if got := extHdr.More(); got != test.more {
+				t.Errorf("got More() = %t, want = %t", got, test.more)
+			}
+			if got := extHdr.ID(); got != test.id {
+				t.Errorf("got ID() = %d, want = %d", got, test.id)
+			}
+		})
+	}
+}
+
+func makeVectorisedViewFromByteBuffers(bs ...[]byte) buffer.VectorisedView {
+	size := 0 // running total of bytes across all of bs
+	var vs []buffer.View
+
+	for _, b := range bs {
+		vs = append(vs, buffer.View(b)) // one view per input slice, in argument order
+		size += len(b)
+	}
+
+	return buffer.NewVectorisedView(size, vs)
+}
+
+func TestIPv6ExtHdrIterErr(t *testing.T) {
+	tests := []struct {
+		name         string
+		firstNextHdr IPv6ExtensionHeaderIdentifier
+		payload      buffer.VectorisedView
+		err          error // error iteration must end with; nil means it must finish cleanly
+	}{
+		{
+			name:         "Upper layer only without data",
+			firstNextHdr: 255,
+		},
+		{
+			name:         "Upper layer only with data",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
+		},
+		{
+			name:         "No next header",
+			firstNextHdr: IPv6NoNextHeaderIdentifier,
+		},
+		{
+			name:         "No next header with data",
+			firstNextHdr: IPv6NoNextHeaderIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{1, 2, 3, 4}),
+		},
+		{
+			name:         "Valid single hop by hop",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3, 4}),
+		},
+		{
+			name:         "Hop by hop too small",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid single fragment",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 68, 9, 128, 4, 2, 1}),
+		},
+		{
+			name:         "Fragment too small",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 68, 9, 128, 4, 2}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid single destination",
+			firstNextHdr: IPv6DestinationOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3, 4}),
+		},
+		{
+			name:         "Destination too small",
+			firstNextHdr: IPv6DestinationOptionsExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 4, 1, 2, 3}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid single routing",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2, 3, 4, 5, 6}),
+		},
+		{
+			name:         "Valid single routing across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2}, []byte{3, 4, 5, 6}),
+		},
+		{
+			name:         "Routing too small with zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 0, 1, 2, 3, 4, 5}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Valid routing with non-zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8}),
+		},
+		{
+			name:         "Valid routing with non-zero length field across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6}, []byte{1, 2, 3, 4, 5, 6, 7, 8}),
+		},
+		{
+			name:         "Routing too small with non-zero length field",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Routing too small with non-zero length field across views",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload:      makeVectorisedViewFromByteBuffers([]byte{255, 1, 1, 2, 3, 4, 5, 6}, []byte{1, 2, 3, 4, 5, 6, 7}),
+			err:          io.ErrUnexpectedEOF,
+		},
+		{
+			name:         "Mixed",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3, 4,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+		},
+		{
+			name:         "Mixed without upper layer data",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3, 4,
+			}),
+		},
+		{
+			name:         "Mixed without upper layer data but last ext hdr too small",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// (Atomic) Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 255, 4, 1, 2, 3,
+			}),
+			err: io.ErrUnexpectedEOF,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			it := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload)
+
+			for i := 0; ; i++ {
+				_, done, err := it.Next()
+				if err != nil {
+					// If we encountered a non-nil error while iterating, make sure it
+					// is the same error as test.err.
+					if !errors.Is(err, test.err) {
+						t.Fatalf("got %d-th Next() = %v, want = %v", i, err, test.err)
+					}
+
+					return
+				}
+				if done {
+					// If we are done (without an error), make sure that we did not expect
+					// an error.
+					if test.err != nil {
+						t.Fatalf("expected error when iterating; want = %s", test.err)
+					}
+
+					return
+				}
+			}
+		})
+	}
+}
+
+func TestIPv6ExtHdrIter(t *testing.T) {
+	routingExtHdrWithUpperLayerData := buffer.View([]byte{255, 0, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4})
+	upperLayerData := buffer.View([]byte{1, 2, 3, 4})
+	tests := []struct {
+		name         string
+		firstNextHdr IPv6ExtensionHeaderIdentifier
+		payload      buffer.VectorisedView
+		expected     []IPv6PayloadHeader
+	}{
+		// With a non-atomic fragment that is not the first fragment, the payload
+		// after the fragment will not be parsed because the payload is expected to
+		// only hold upper layer data.
+		{
+			name:         "hopbyhop - fragment (not first) - routing - upper",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Fragment extension header.
+				//
+				// More = 1, Fragment Offset = 2177, ID = 2147746305
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+
+				// Routing extension header.
+				//
+				// Even though we have a routing ext header here, it should be
+				// interpreted as raw bytes as only the first fragment is expected
+				// to hold headers.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6FragmentExtHdr([6]byte{68, 9, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6RoutingExtHdrIdentifier,
+					Buf:        routingExtHdrWithUpperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "hopbyhop - fragment (first) - routing - upper",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Fragment extension header.
+				//
+				// More = 1, Fragment Offset = 0, ID = 2147746305
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 0, 1, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2, 3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6FragmentExtHdr([6]byte{0, 1, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "fragment - routing - upper (across views)",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 68, 9, 128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2}, []byte{3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{68, 9, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6RoutingExtHdrIdentifier,
+					Buf:        routingExtHdrWithUpperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+
+		// If we have an atomic fragment, the payload following the fragment
+		// extension header should be parsed normally.
+		{
+			name:         "atomic fragment - routing - destination - upper",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6, 128, 4, 2, 1,
+
+				// Routing extension header.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Destination Options extension header.
+				255, 0, 1, 4, 1, 2, 3, 4,
+
+				// Upper layer data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+		{
+			name:         "atomic fragment - routing - upper (across views)",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6RoutingExtHdrIdentifier), 255, 0, 6}, []byte{128, 4, 2, 1,
+
+				// Routing extension header.
+				255, 0, 1, 2}, []byte{3, 4, 5, 6,
+
+				// Upper layer data.
+				1, 2}, []byte{3, 4}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6RawPayloadHeader{
+					Identifier: 255,
+					Buf:        makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+				},
+			},
+		},
+		{
+			name:         "atomic fragment - destination - no next header",
+			firstNextHdr: IPv6FragmentExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Fragment extension header.
+				//
+				// Res (Reserved) bits are 1 which should not affect anything.
+				uint8(IPv6DestinationOptionsExtHdrIdentifier), 0, 0, 6, 128, 4, 2, 1,
+
+				// Destination Options extension header.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+				IPv6DestinationOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+			},
+		},
+		{
+			name:         "routing - atomic fragment - no next header",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6NoNextHeaderIdentifier), 0, 0, 6, 128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+			},
+		},
+		{
+			name:         "routing - atomic fragment - no next header (across views)",
+			firstNextHdr: IPv6RoutingExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Reserved bits are 1 which should not affect anything.
+				uint8(IPv6NoNextHeaderIdentifier), 255, 0, 6}, []byte{128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{0, 6, 128, 4, 2, 1}),
+			},
+		},
+		{
+			name:         "hopbyhop - routing - fragment - no next header",
+			firstNextHdr: IPv6HopByHopOptionsExtHdrIdentifier,
+			payload: makeVectorisedViewFromByteBuffers([]byte{
+				// Hop By Hop Options extension header.
+				uint8(IPv6RoutingExtHdrIdentifier), 0, 1, 4, 1, 2, 3, 4,
+
+				// Routing extension header.
+				uint8(IPv6FragmentExtHdrIdentifier), 0, 1, 2, 3, 4, 5, 6,
+
+				// Fragment extension header.
+				//
+				// Fragment Offset = 32; Res = 3; M = 0 (the low three bits of the field are 6).
+				uint8(IPv6NoNextHeaderIdentifier), 0, 1, 6, 128, 4, 2, 1,
+
+				// Random data.
+				1, 2, 3, 4,
+			}),
+			expected: []IPv6PayloadHeader{
+				IPv6HopByHopOptionsExtHdr{ipv6OptionsExtHdr: []byte{1, 4, 1, 2, 3, 4}},
+				IPv6RoutingExtHdr([]byte{1, 2, 3, 4, 5, 6}),
+				IPv6FragmentExtHdr([6]byte{1, 6, 128, 4, 2, 1}),
+				IPv6RawPayloadHeader{
+					Identifier: IPv6NoNextHeaderIdentifier,
+					Buf:        upperLayerData.ToVectorisedView(),
+				},
+			},
+		},
+
+		// Test the raw payload for common transport layer protocol numbers.
+		{
+			name:         "TCP raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(TCPProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(TCPProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "UDP raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(UDPProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(UDPProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "ICMPv4 raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(ICMPv4ProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(ICMPv4ProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "ICMPv6 raw payload",
+			firstNextHdr: IPv6ExtensionHeaderIdentifier(ICMPv6ProtocolNumber),
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: IPv6ExtensionHeaderIdentifier(ICMPv6ProtocolNumber),
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "Unknwon next header raw payload",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: 255,
+				Buf:        upperLayerData.ToVectorisedView(),
+			}},
+		},
+		{
+			name:         "Unknwon next header raw payload (across views)",
+			firstNextHdr: 255,
+			payload:      makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+			expected: []IPv6PayloadHeader{IPv6RawPayloadHeader{
+				Identifier: 255,
+				Buf:        makeVectorisedViewFromByteBuffers(upperLayerData[:2], upperLayerData[2:]),
+			}},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			it := MakeIPv6PayloadIterator(test.firstNextHdr, test.payload)
+
+			for i, e := range test.expected {
+				extHdr, done, err := it.Next()
+				if err != nil {
+					t.Errorf("(i=%d) Next(): %s", i, err)
+				}
+				if done {
+					t.Errorf("(i=%d) unexpectedly done iterating", i)
+				}
+				if diff := cmp.Diff(e, extHdr); diff != "" {
+					t.Errorf("(i=%d) got ext hdr mismatch (-want +got):\n%s", i, diff)
+				}
+
+				if t.Failed() {
+					t.FailNow()
+				}
+			}
+
+			extHdr, done, err := it.Next()
+			if err != nil {
+				t.Errorf("(last) Next(): %s", err)
+			}
+			if !done {
+				t.Errorf("(last) iterator unexpectedly not done")
+			}
+			if extHdr != nil {
+				t.Errorf("(last) got Next() = %T, want = nil", extHdr)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/header/ipv6_fragment.go b/pkg/tcpip/header/ipv6_fragment.go
new file mode 100644
index 000000000..018555a26
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_fragment.go
@@ -0,0 +1,146 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	nextHdrFrag = 0 // offset of the next header byte
+	fragOff     = 2 // offset of the 16-bit word holding the fragment offset
+	more        = 3 // offset of the byte whose lowest bit is the "more" flag
+	idV6        = 4 // offset of the 32-bit identification field
+)
+
+// IPv6FragmentFields holds the values of the fields of an IPv6 fragment header.
+// It is the input consumed by IPv6Fragment.Encode.
+type IPv6FragmentFields struct {
+	// NextHeader is the "next header" field of an IPv6 fragment.
+	NextHeader uint8
+
+	// FragmentOffset is the "fragment offset" field; Encode stores it shifted left by 3.
+	FragmentOffset uint16
+
+	// M is the "more" flag; Encode stores it in the lowest bit of byte 3.
+	M bool
+
+	// Identification is the "identification" field of an IPv6 fragment.
+	Identification uint32
+}
+
+// IPv6Fragment represents an IPv6 fragment header stored in a byte slice.
+// Most of the methods of IPv6Fragment access the underlying slice without
+// checking the boundaries and could panic because of 'index out of range'.
+// Always call IsValid() to validate an instance of IPv6Fragment before using other methods.
+type IPv6Fragment []byte
+
+const (
+	// IPv6FragmentHeader is the number used to specify that the next
+	// header is a fragment header, per RFC 2460.
+	IPv6FragmentHeader = 44
+
+	// IPv6FragmentHeaderSize is the size, in bytes, of the fragment header.
+	IPv6FragmentHeaderSize = 8
+)
+
+// Encode writes the fields of i into b. The byte at offset 1 is not written.
+func (b IPv6Fragment) Encode(i *IPv6FragmentFields) {
+	b[nextHdrFrag] = i.NextHeader
+	binary.BigEndian.PutUint16(b[fragOff:], i.FragmentOffset<<3) // low 3 bits are zero here
+	if i.M {
+		b[more] |= 1 // b[more] is the low byte of the word written above
+	}
+	binary.BigEndian.PutUint32(b[idV6:], i.Identification)
+}
+
+// IsValid reports whether b is at least IPv6FragmentHeaderSize bytes long.
+func (b IPv6Fragment) IsValid() bool {
+	return len(b) >= IPv6FragmentHeaderSize
+}
+
+// NextHeader returns the "next header" field, which is the first byte of b.
+func (b IPv6Fragment) NextHeader() uint8 {
+	return b[nextHdrFrag]
+}
+
+// FragmentOffset returns the "fragment offset" field: the upper 13 bits of the word at fragOff.
+func (b IPv6Fragment) FragmentOffset() uint16 {
+	return binary.BigEndian.Uint16(b[fragOff:]) >> 3
+}
+
+// More reports whether the "more" flag (lowest bit of byte 3) is set.
+func (b IPv6Fragment) More() bool {
+	return b[more]&1 != 0
+}
+
+// Payload implements Network.Payload. It returns the bytes following the fragment header.
+func (b IPv6Fragment) Payload() []byte {
+	return b[IPv6FragmentHeaderSize:]
+}
+
+// ID returns the 32-bit big-endian identification field of the ipv6 fragment.
+func (b IPv6Fragment) ID() uint32 {
+	return binary.BigEndian.Uint32(b[idV6:])
+}
+
+// TransportProtocol implements Network.TransportProtocol using the next header value.
+func (b IPv6Fragment) TransportProtocol() tcpip.TransportProtocolNumber {
+	return tcpip.TransportProtocolNumber(b.NextHeader())
+}
+
+// The functions below have been added only to satisfy the Network interface.
+
+// Checksum is not supported by IPv6Fragment; it always panics.
+func (b IPv6Fragment) Checksum() uint16 {
+	panic("not supported")
+}
+
+// SourceAddress is not supported by IPv6Fragment; it always panics.
+func (b IPv6Fragment) SourceAddress() tcpip.Address {
+	panic("not supported")
+}
+
+// DestinationAddress is not supported by IPv6Fragment; it always panics.
+func (b IPv6Fragment) DestinationAddress() tcpip.Address {
+	panic("not supported")
+}
+
+// SetSourceAddress is not supported by IPv6Fragment; it always panics.
+func (b IPv6Fragment) SetSourceAddress(tcpip.Address) {
+	panic("not supported")
+}
+
+// SetDestinationAddress is not supported by IPv6Fragment; it always panics.
+func (b IPv6Fragment) SetDestinationAddress(tcpip.Address) {
+	panic("not supported")
+}
+
+// SetChecksum is not supported by IPv6Fragment; it always panics.
+func (b IPv6Fragment) SetChecksum(uint16) {
+	panic("not supported")
+}
+
+// TOS is not supported by IPv6Fragment; it always panics.
+func (b IPv6Fragment) TOS() (uint8, uint32) {
+	panic("not supported")
+}
+
+// SetTOS is not supported by IPv6Fragment; it always panics and ignores t and l.
+func (b IPv6Fragment) SetTOS(t uint8, l uint32) {
+	panic("not supported")
+}
diff --git a/pkg/tcpip/header/ipv6_test.go b/pkg/tcpip/header/ipv6_test.go
new file mode 100644
index 000000000..426a873b1
--- /dev/null
+++ b/pkg/tcpip/header/ipv6_test.go
@@ -0,0 +1,417 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+	"bytes"
+	"crypto/sha256"
+	"fmt"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const (
+	linkAddr               = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkLocalAddr          = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") // fe80::1
+	linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") // ff02::1
+	uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") // fc00::1
+	uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02") // fd00::2
+	globalAddr             = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01") // a000::1
+)
+
+func TestEthernetAdddressToModifiedEUI64(t *testing.T) {
+	wantIID := [header.IIDSize]byte{0, 2, 3, 255, 254, 4, 5, 6}
+
+	if diff := cmp.Diff(wantIID, header.EthernetAddressToModifiedEUI64(linkAddr)); diff != "" {
+		t.Errorf("EthernetAddressToModifiedEUI64(%s) mismatch (-want +got):\n%s", linkAddr, diff)
+	}
+
+	var iid [header.IIDSize]byte
+	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, iid[:])
+	if diff := cmp.Diff(wantIID, iid); diff != "" {
+		t.Errorf("EthernetAddressToModifiedEUI64IntoBuf(%s, _) mismatch (-want +got):\n%s", linkAddr, diff)
+	}
+}
+
+func TestLinkLocalAddr(t *testing.T) {
+	if got, want := header.LinkLocalAddr(linkAddr), tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x02\x03\xff\xfe\x04\x05\x06"); got != want {
+		t.Errorf("got LinkLocalAddr(%s) = %s, want = %s", linkAddr, got, want)
+	}
+}
+
+func TestAppendOpaqueInterfaceIdentifier(t *testing.T) {
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte
+	if n, err := rand.Read(secretKeyBuf[:]); err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	} else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want {
+		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n)
+	}
+
+	tests := []struct {
+		name       string
+		prefix     tcpip.Subnet
+		nicName    string
+		dadCounter uint8
+		secretKey  []byte
+	}{
+		{
+			name:       "SecretKey of minimum size",
+			prefix:     header.IPv6LinkLocalPrefix.Subnet(),
+			nicName:    "eth0",
+			dadCounter: 0,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes],
+		},
+		{
+			name: "SecretKey of less than minimum size",
+			prefix: func() tcpip.Subnet {
+				addrWithPrefix := tcpip.AddressWithPrefix{
+					Address:   "\x01\x02\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+					PrefixLen: header.IIDOffsetInIPv6Address * 8,
+				}
+				return addrWithPrefix.Subnet()
+			}(),
+			nicName:    "eth10",
+			dadCounter: 1,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2],
+		},
+		{
+			name: "SecretKey of more than minimum size",
+			prefix: func() tcpip.Subnet {
+				addrWithPrefix := tcpip.AddressWithPrefix{
+					Address:   "\x01\x02\x03\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+					PrefixLen: header.IIDOffsetInIPv6Address * 8,
+				}
+				return addrWithPrefix.Subnet()
+			}(),
+			nicName:    "eth11",
+			dadCounter: 2,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
+		},
+		{
+			name: "Nil SecretKey and empty nicName",
+			prefix: func() tcpip.Subnet {
+				addrWithPrefix := tcpip.AddressWithPrefix{
+					Address:   "\x01\x02\x03\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+					PrefixLen: header.IIDOffsetInIPv6Address * 8,
+				}
+				return addrWithPrefix.Subnet()
+			}(),
+			nicName:    "",
+			dadCounter: 3,
+			secretKey:  nil,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			// Recompute the expected IID with SHA-256 over the leading prefix bytes,
+			// NIC name, DAD counter and secret key (writing a nil key adds nothing).
+			hasher := sha256.New()
+			hasher.Write([]byte(tc.prefix.ID()[:header.IIDOffsetInIPv6Address]))
+			hasher.Write([]byte(tc.nicName))
+			hasher.Write([]byte{tc.dadCounter})
+			hasher.Write(tc.secretKey)
+			var digest [sha256.Size]byte
+			hasher.Sum(digest[:0])
+			want := digest[:header.IIDSize]
+
+			// With a nil destination the function must hand back a freshly
+			// allocated buffer holding the IID.
+			if got := header.AppendOpaqueInterfaceIdentifier(nil, tc.prefix, tc.nicName, tc.dadCounter, tc.secretKey); !bytes.Equal(got, want) {
+				t.Errorf("got AppendOpaqueInterfaceIdentifier(nil, %s, %s, %d, %x) = %x, want = %x", tc.prefix, tc.nicName, tc.dadCounter, tc.secretKey, got, want)
+			}
+
+			// With a destination that has room for the IID, the IID must be
+			// written into that destination's backing array.
+			var iidBuf [header.IIDSize]byte
+			if got := header.AppendOpaqueInterfaceIdentifier(iidBuf[:0], tc.prefix, tc.nicName, tc.dadCounter, tc.secretKey); !bytes.Equal(got, want) {
+				t.Errorf("got AppendOpaqueInterfaceIdentifier(iidBuf[:0], %s, %s, %d, %x) = %x, want = %x", tc.prefix, tc.nicName, tc.dadCounter, tc.secretKey, got, want)
+			}
+			if got := iidBuf[:]; !bytes.Equal(got, want) {
+				t.Errorf("got iidBuf = %x, want = %x", got, want)
+			}
+		})
+	}
+}
+
+func TestLinkLocalAddrWithOpaqueIID(t *testing.T) {
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes * 2]byte
+	if n, err := rand.Read(secretKeyBuf[:]); err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	} else if want := header.OpaqueIIDSecretKeyMinBytes * 2; n != want {
+		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", want, n)
+	}
+
+	// Every case uses the link-local prefix, so it is not part of the table.
+	prefix := header.IPv6LinkLocalPrefix.Subnet()
+
+	tests := []struct {
+		name       string
+		nicName    string
+		dadCounter uint8
+		secretKey  []byte
+	}{
+		{
+			name:       "SecretKey of minimum size",
+			nicName:    "eth0",
+			dadCounter: 0,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes],
+		},
+		{
+			name:       "SecretKey of less than minimum size",
+			nicName:    "eth10",
+			dadCounter: 1,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes/2],
+		},
+		{
+			name:       "SecretKey of more than minimum size",
+			nicName:    "eth11",
+			dadCounter: 2,
+			secretKey:  secretKeyBuf[:header.OpaqueIIDSecretKeyMinBytes*2],
+		},
+		{
+			name:       "Nil SecretKey and empty nicName",
+			nicName:    "",
+			dadCounter: 3,
+			secretKey:  nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			addrBytes := [header.IPv6AddressSize]byte{
+				0: 0xFE,
+				1: 0x80,
+			}
+
+			want := tcpip.Address(header.AppendOpaqueInterfaceIdentifier(
+				addrBytes[:header.IIDOffsetInIPv6Address],
+				prefix,
+				test.nicName,
+				test.dadCounter,
+				test.secretKey,
+			))
+
+			if got := header.LinkLocalAddrWithOpaqueIID(test.nicName, test.dadCounter, test.secretKey); got != want {
+				t.Errorf("got LinkLocalAddrWithOpaqueIID(%s, %d, %x) = %s, want = %s", test.nicName, test.dadCounter, test.secretKey, got, want)
+			}
+		})
+	}
+}
+
+func TestIsV6UniqueLocalAddress(t *testing.T) {
+	tests := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Unique 1",
+			addr:     uniqueLocalAddr1,
+			expected: true,
+		},
+		{
+			name:     "Valid Unique 2",
+			addr:     uniqueLocalAddr2,
+			expected: true,
+		},
+		{
+			name:     "Link Local",
+			addr:     linkLocalAddr,
+			expected: false,
+		},
+		{
+			name:     "Global",
+			addr:     globalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4",
+			addr:     "\x01\x02\x03\x04",
+			expected: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			if got := header.IsV6UniqueLocalAddress(test.addr); got != test.expected {
+				t.Errorf("got header.IsV6UniqueLocalAddress(%s) = %t, want = %t", test.addr, got, test.expected)
+			}
+		})
+	}
+}
+
+func TestIsV6LinkLocalMulticastAddress(t *testing.T) {
+	cases := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: true,
+		},
+		{
+			name:     "Valid Link Local Multicast with flags",
+			addr:     "\xff\xf2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+			expected: true,
+		},
+		{
+			name:     "Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Multicast",
+			addr:     "\xe0\x00\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalMulticastAddress(tc.addr); got != tc.expected {
+				t.Errorf("got header.IsV6LinkLocalMulticastAddress(%s) = %t, want = %t", tc.addr, got, tc.expected)
+			}
+		})
+	}
+}
+
+func TestIsV6LinkLocalAddress(t *testing.T) {
+	cases := []struct {
+		name     string
+		addr     tcpip.Address
+		expected bool
+	}{
+		{
+			name:     "Valid Link Local Unicast",
+			addr:     linkLocalAddr,
+			expected: true,
+		},
+		{
+			name:     "Link Local Multicast",
+			addr:     linkLocalMulticastAddr,
+			expected: false,
+		},
+		{
+			name:     "Unique Local",
+			addr:     uniqueLocalAddr1,
+			expected: false,
+		},
+		{
+			name:     "Global",
+			addr:     globalAddr,
+			expected: false,
+		},
+		{
+			name:     "IPv4 Link Local",
+			addr:     "\xa9\xfe\x00\x01",
+			expected: false,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if got := header.IsV6LinkLocalAddress(tc.addr); got != tc.expected {
+				t.Errorf("got header.IsV6LinkLocalAddress(%s) = %t, want = %t", tc.addr, got, tc.expected)
+			}
+		})
+	}
+}
+
+func TestScopeForIPv6Address(t *testing.T) {
+	tests := []struct {
+		name  string
+		addr  tcpip.Address
+		scope header.IPv6AddressScope
+		err   *tcpip.Error
+	}{
+		{
+			name:  "Unique Local",
+			addr:  uniqueLocalAddr1,
+			scope: header.UniqueLocalScope,
+			err:   nil,
+		},
+		{
+			name:  "Link Local Unicast",
+			addr:  linkLocalAddr,
+			scope: header.LinkLocalScope,
+			err:   nil,
+		},
+		{
+			name:  "Link Local Multicast",
+			addr:  linkLocalMulticastAddr,
+			scope: header.LinkLocalScope,
+			err:   nil,
+		},
+		{
+			name:  "Global",
+			addr:  globalAddr,
+			scope: header.GlobalScope,
+			err:   nil,
+		},
+		{
+			name:  "IPv4",
+			addr:  "\x01\x02\x03\x04",
+			scope: header.GlobalScope,
+			err:   tcpip.ErrBadAddress,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got, err := header.ScopeForIPv6Address(test.addr)
+			if err != test.err {
+				t.Errorf("got header.ScopeForIPv6Address(%s) = (_, %v), want = (_, %v)", test.addr, err, test.err)
+			}
+			if got != test.scope {
+				t.Errorf("got header.ScopeForIPv6Address(%s) = (%d, _), want = (%d, _)", test.addr, got, test.scope)
+			}
+		})
+	}
+}
+
+func TestSolicitedNodeAddr(t *testing.T) {
+	cases := []struct {
+		addr tcpip.Address
+		want tcpip.Address
+	}{
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\xa0",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0",
+		},
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x0e\x0f\xa0",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x0e\x0f\xa0",
+		},
+		{
+			addr: "\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\xdd\x01\x02\x03",
+			want: "\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\xff\x01\x02\x03",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(fmt.Sprintf("%s", tc.addr), func(t *testing.T) {
+			if got := header.SolicitedNodeAddr(tc.addr); got != tc.want {
+				t.Fatalf("got header.SolicitedNodeAddr(%s) = %s, want = %s", tc.addr, got, tc.want)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/header/ipversion_test.go b/pkg/tcpip/header/ipversion_test.go
new file mode 100644
index 000000000..b5540bf66
--- /dev/null
+++ b/pkg/tcpip/header/ipversion_test.go
@@ -0,0 +1,67 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+func TestIPv4(t *testing.T) {
+	hdr := header.IPv4(make([]byte, header.IPv4MinimumSize))
+	hdr.Encode(&header.IPv4Fields{})
+
+	const want = header.IPv4Version
+	if got := header.IPVersion(hdr); got != want {
+		t.Fatalf("Bad version, want %v, got %v", want, got)
+	}
+}
+
+func TestIPv6(t *testing.T) {
+	hdr := header.IPv6(make([]byte, header.IPv6MinimumSize))
+	hdr.Encode(&header.IPv6Fields{})
+
+	const want = header.IPv6Version
+	if got := header.IPVersion(hdr); got != want {
+		t.Fatalf("Bad version, want %v, got %v", want, got)
+	}
+}
+
+func TestOtherVersion(t *testing.T) {
+	const want = header.IPv4Version + header.IPv6Version
+	buf := make([]byte, 1)
+	buf[0] = want << 4
+
+	if got := header.IPVersion(buf); got != want {
+		t.Fatalf("Bad version, want %v, got %v", want, got)
+	}
+}
+
+func TestTooShort(t *testing.T) {
+	buf := make([]byte, 1)
+	buf[0] = (header.IPv4Version + header.IPv6Version) << 4
+
+	// An empty, non-nil slice has no version byte to read.
+	const want = -1
+	if got := header.IPVersion(buf[:0]); got != want {
+		t.Fatalf("Bad version, want %v, got %v", want, got)
+	}
+
+	// A nil slice must be reported the same way.
+	if got := header.IPVersion(nil); got != want {
+		t.Fatalf("Bad version, want %v, got %v", want, got)
+	}
+}
diff --git a/pkg/tcpip/header/ndp_neighbor_advert.go b/pkg/tcpip/header/ndp_neighbor_advert.go
new file mode 100644
index 000000000..505c92668
--- /dev/null
+++ b/pkg/tcpip/header/ndp_neighbor_advert.go
@@ -0,0 +1,110 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+// NDPNeighborAdvert is the byte-slice form of an NDP Neighbor Advertisement
+// message. It holds only the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.4 for more details.
+type NDPNeighborAdvert []byte
+
+const (
+	// NDPNAMinimumSize is the minimum size of a valid NDP Neighbor
+	// Advertisement message (body of an ICMPv6 packet).
+	NDPNAMinimumSize = 20
+
+	// ndpNATargetAddressOffset is the start of the Target Address
+	// field within an NDPNeighborAdvert.
+	ndpNATargetAddressOffset = 4
+
+	// ndpNAOptionsOffset is the start of the NDP options in an
+	// NDPNeighborAdvert.
+	ndpNAOptionsOffset = ndpNATargetAddressOffset + IPv6AddressSize
+
+	// ndpNAFlagsOffset is the offset of the flags byte within an
+	// NDPNeighborAdvert.
+	ndpNAFlagsOffset = 0
+
+	// ndpNARouterFlagMask is the mask of the Router Flag field in
+	// the flags byte within an NDPNeighborAdvert.
+	ndpNARouterFlagMask = (1 << 7)
+
+	// ndpNASolicitedFlagMask is the mask of the Solicited Flag field in
+	// the flags byte within an NDPNeighborAdvert.
+	ndpNASolicitedFlagMask = (1 << 6)
+
+	// ndpNAOverrideFlagMask is the mask of the Override Flag field in
+	// the flags byte within an NDPNeighborAdvert.
+	ndpNAOverrideFlagMask = (1 << 5)
+)
+
+// TargetAddress returns the IPv6AddressSize bytes of the Target Address field as an address.
+func (b NDPNeighborAdvert) TargetAddress() tcpip.Address {
+	return tcpip.Address(b[ndpNATargetAddressOffset:][:IPv6AddressSize])
+}
+
+// SetTargetAddress copies addr into the Target Address field (at most IPv6AddressSize bytes).
+func (b NDPNeighborAdvert) SetTargetAddress(addr tcpip.Address) {
+	copy(b[ndpNATargetAddressOffset:][:IPv6AddressSize], addr)
+}
+
+// RouterFlag reports whether the Router Flag bit is set in the flags byte.
+func (b NDPNeighborAdvert) RouterFlag() bool {
+	return b[ndpNAFlagsOffset]&ndpNARouterFlagMask != 0
+}
+
+// SetRouterFlag sets the Router Flag bit when f is true and clears it otherwise.
+func (b NDPNeighborAdvert) SetRouterFlag(f bool) {
+	if f {
+		b[ndpNAFlagsOffset] |= ndpNARouterFlagMask
+	} else {
+		b[ndpNAFlagsOffset] &^= ndpNARouterFlagMask
+	}
+}
+
+// SolicitedFlag reports whether the Solicited Flag bit is set in the flags byte.
+func (b NDPNeighborAdvert) SolicitedFlag() bool {
+	return b[ndpNAFlagsOffset]&ndpNASolicitedFlagMask != 0
+}
+
+// SetSolicitedFlag sets the Solicited Flag bit when f is true and clears it otherwise.
+func (b NDPNeighborAdvert) SetSolicitedFlag(f bool) {
+	if f {
+		b[ndpNAFlagsOffset] |= ndpNASolicitedFlagMask
+	} else {
+		b[ndpNAFlagsOffset] &^= ndpNASolicitedFlagMask
+	}
+}
+
+// OverrideFlag reports whether the Override Flag bit is set in the flags byte.
+func (b NDPNeighborAdvert) OverrideFlag() bool {
+	return b[ndpNAFlagsOffset]&ndpNAOverrideFlagMask != 0
+}
+
+// SetOverrideFlag sets the Override Flag bit when f is true and clears it otherwise.
+func (b NDPNeighborAdvert) SetOverrideFlag(f bool) {
+	if f {
+		b[ndpNAFlagsOffset] |= ndpNAOverrideFlagMask
+	} else {
+		b[ndpNAFlagsOffset] &^= ndpNAOverrideFlagMask
+	}
+}
+
+// Options returns an NDPOptions of the options body.
+func (b NDPNeighborAdvert) Options() NDPOptions {
+	return NDPOptions(b[ndpNAOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_neighbor_solicit.go b/pkg/tcpip/header/ndp_neighbor_solicit.go
new file mode 100644
index 000000000..3a1b8e139
--- /dev/null
+++ b/pkg/tcpip/header/ndp_neighbor_solicit.go
@@ -0,0 +1,52 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+// NDPNeighborSolicit is the byte-slice form of an NDP Neighbor Solicitation
+// message. It holds only the body of an ICMPv6 packet.
+//
+// See RFC 4861 section 4.3 for more details.
+type NDPNeighborSolicit []byte
+
+const (
+	// NDPNSMinimumSize is the minimum size, in bytes, of a valid NDP Neighbor
+	// Solicitation message (body of an ICMPv6 packet).
+	NDPNSMinimumSize = 20
+
+	// ndpNSTargetAddessOffset is the byte offset at which the Target Address
+	// field starts within an NDPNeighborSolicit.
+	ndpNSTargetAddessOffset = 4
+
+	// ndpNSOptionsOffset is the byte offset at which the NDP options start in
+	// an NDPNeighborSolicit, immediately after the Target Address field.
+	ndpNSOptionsOffset = ndpNSTargetAddessOffset + IPv6AddressSize
+)
+
+// TargetAddress returns the IPv6AddressSize bytes of the Target Address field as an address.
+func (b NDPNeighborSolicit) TargetAddress() tcpip.Address {
+	return tcpip.Address(b[ndpNSTargetAddessOffset:][:IPv6AddressSize])
+}
+
+// SetTargetAddress copies addr into the Target Address field (at most IPv6AddressSize bytes).
+func (b NDPNeighborSolicit) SetTargetAddress(addr tcpip.Address) {
+	copy(b[ndpNSTargetAddessOffset:][:IPv6AddressSize], addr)
+}
+
+// Options returns an NDPOptions of the options body.
+func (b NDPNeighborSolicit) Options() NDPOptions {
+	return NDPOptions(b[ndpNSOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_options.go b/pkg/tcpip/header/ndp_options.go
new file mode 100644
index 000000000..5d3975c56
--- /dev/null
+++ b/pkg/tcpip/header/ndp_options.go
@@ -0,0 +1,899 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// NDPOptionIdentifier is an NDP option type identifier.
+type NDPOptionIdentifier uint8
+
+const (
+	// NDPSourceLinkLayerAddressOptionType is the type of the Source Link Layer
+	// Address option, as per RFC 4861 section 4.6.1.
+	NDPSourceLinkLayerAddressOptionType NDPOptionIdentifier = 1
+
+	// NDPTargetLinkLayerAddressOptionType is the type of the Target Link Layer
+	// Address option, as per RFC 4861 section 4.6.1.
+	NDPTargetLinkLayerAddressOptionType NDPOptionIdentifier = 2
+
+	// NDPPrefixInformationType is the type of the Prefix Information
+	// option, as per RFC 4861 section 4.6.2.
+	NDPPrefixInformationType NDPOptionIdentifier = 3
+
+	// NDPRecursiveDNSServerOptionType is the type of the Recursive DNS
+	// Server option, as per RFC 8106 section 5.1.
+	NDPRecursiveDNSServerOptionType NDPOptionIdentifier = 25
+
+	// NDPDNSSearchListOptionType is the type of the DNS Search List option,
+	// as per RFC 8106 section 5.2. It is typed like its sibling identifiers.
+	NDPDNSSearchListOptionType NDPOptionIdentifier = 31
+)
+
+const (
+	// NDPLinkLayerAddressSize is the size of a Source or Target Link Layer
+	// Address option for an Ethernet address.
+	NDPLinkLayerAddressSize = 8
+
+	// ndpPrefixInformationLength is the expected length, in bytes, of the
+	// body of an NDP Prefix Information option, as per RFC 4861 section
+	// 4.6.2 which specifies that the Length field is 4. Given this, the
+	// expected length, in bytes, is 30 because 4 * lengthByteUnits (8) - 2
+	// (Type & Length) = 30.
+	ndpPrefixInformationLength = 30
+
+	// ndpPrefixInformationPrefixLengthOffset is the offset of the Prefix
+	// Length field within an NDPPrefixInformation.
+	ndpPrefixInformationPrefixLengthOffset = 0
+
+	// ndpPrefixInformationFlagsOffset is the offset of the flags byte
+	// within an NDPPrefixInformation.
+	ndpPrefixInformationFlagsOffset = 1
+
+	// ndpPrefixInformationOnLinkFlagMask is the mask of the On-Link Flag
+	// field in the flags byte within an NDPPrefixInformation.
+	ndpPrefixInformationOnLinkFlagMask = (1 << 7)
+
+	// ndpPrefixInformationAutoAddrConfFlagMask is the mask of the
+	// Autonomous Address-Configuration flag field in the flags byte within
+	// an NDPPrefixInformation.
+	ndpPrefixInformationAutoAddrConfFlagMask = (1 << 6)
+
+	// ndpPrefixInformationReserved1FlagsMask is the mask of the Reserved1
+	// field in the flags byte within an NDPPrefixInformation.
+	ndpPrefixInformationReserved1FlagsMask = 63
+
+	// ndpPrefixInformationValidLifetimeOffset is the start of the 4-byte
+	// Valid Lifetime field within an NDPPrefixInformation.
+	ndpPrefixInformationValidLifetimeOffset = 2
+
+	// ndpPrefixInformationPreferredLifetimeOffset is the start of the
+	// 4-byte Preferred Lifetime field within an NDPPrefixInformation.
+	ndpPrefixInformationPreferredLifetimeOffset = 6
+
+	// ndpPrefixInformationReserved2Offset is the start of the 4-byte
+	// Reserved2 field within an NDPPrefixInformation.
+	ndpPrefixInformationReserved2Offset = 10
+
+	// ndpPrefixInformationReserved2Length is the length of the Reserved2
+	// field.
+	//
+	// It is 4 bytes.
+	ndpPrefixInformationReserved2Length = 4
+
+	// ndpPrefixInformationPrefixOffset is the start of the Prefix field
+	// within an NDPPrefixInformation.
+	ndpPrefixInformationPrefixOffset = 14
+
+	// ndpRecursiveDNSServerLifetimeOffset is the start of the 4-byte
+	// Lifetime field within an NDPRecursiveDNSServer.
+	ndpRecursiveDNSServerLifetimeOffset = 2
+
+	// ndpRecursiveDNSServerAddressesOffset is the start of the addresses
+	// for IPv6 Recursive DNS Servers within an NDPRecursiveDNSServer.
+	ndpRecursiveDNSServerAddressesOffset = 6
+
+	// minNDPRecursiveDNSServerBodySize is the minimum NDP Recursive DNS Server
+	// option's body size when it contains at least one IPv6 address, as per
+	// RFC 8106 section 5.3.1.
+	minNDPRecursiveDNSServerBodySize = 22
+
+	// ndpDNSSearchListLifetimeOffset is the start of the 4-byte
+	// Lifetime field within an NDPDNSSearchList.
+	ndpDNSSearchListLifetimeOffset = 2
+
+	// ndpDNSSearchListDomainNamesOffset is the start of the DNS search list
+	// domain names within an NDPDNSSearchList.
+	ndpDNSSearchListDomainNamesOffset = 6
+
+	// minNDPDNSSearchListBodySize is the minimum NDP DNS Search List option's
+	// body size when it contains at least one domain name, as per RFC 8106
+	// section 5.3.1.
+	minNDPDNSSearchListBodySize = 14
+
+	// maxDomainNameLabelLength is the maximum length of a domain name
+	// label, as per RFC 1035 section 3.1.
+	maxDomainNameLabelLength = 63
+
+	// maxDomainNameLength is the maximum length of a domain name, including
+	// label AND label length octet, as per RFC 1035 section 3.1.
+	maxDomainNameLength = 255
+
+	// lengthByteUnits is the multiplier factor for the Length field of an
+	// NDP option. That is, the length field for NDP options is in units of
+	// 8 octets, as per RFC 4861 section 4.6.
+	lengthByteUnits = 8
+)
+
+var (
+	// NDPInfiniteLifetime is the value that represents infinity for the
+	// 4-byte lifetime fields found in various NDP options. Its value is
+	// (2^32 - 1)s = 4294967295s, i.e. math.MaxUint32 seconds.
+	//
+	// This is a variable instead of a constant so that tests can change
+	// this value to a smaller value. It should only be modified by tests.
+	NDPInfiniteLifetime = time.Second * math.MaxUint32
+)
+
+// NDPOptionIterator is an iterator of NDPOption.
+//
+// Note, between when an NDPOptionIterator is obtained and last used, no changes
+// to the NDPOptions may happen. Doing so may cause undefined and unexpected
+// behaviour. It is fine to obtain an NDPOptionIterator, iterate over the first
+// few NDPOption then modify the backing NDPOptions so long as the
+// NDPOptionIterator obtained before modification is no longer used.
+type NDPOptionIterator struct {
+	opts *bytes.Buffer // Unread option bytes; Next consumes them front to back.
+}
+
+// Potential errors when iterating over an NDPOptions.
+var (
+	ErrNDPOptMalformedBody   = errors.New("NDP option has a malformed body")   // Option body failed validation.
+	ErrNDPOptMalformedHeader = errors.New("NDP option has a malformed header") // E.g. a zero Length field.
+)
+
+// Next returns the next recognized option in the backing NDPOptions, skipping
+// unrecognized ones. done is true once the options are exhausted or on error.
+//
+// The return can be read as option, done, error. Note, option should only be
+// used if done is false and error is nil.
+func (i *NDPOptionIterator) Next() (NDPOption, bool, error) {
+	for {
+		// Do we still have elements to look at?
+		if i.opts.Len() == 0 {
+			return nil, true, nil
+		}
+
+		// Get the Type field.
+		temp, err := i.opts.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				// ReadByte should only ever return nil or io.EOF.
+				panic(fmt.Sprintf("unexpected error when reading the option's Type field: %s", err))
+			}
+
+			// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected once
+			// we start parsing an option; we expect the buffer to contain enough
+			// bytes for the whole option.
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Type field: %w", io.ErrUnexpectedEOF)
+		}
+		kind := NDPOptionIdentifier(temp)
+
+		// Get the Length field.
+		length, err := i.opts.ReadByte()
+		if err != nil {
+			if err != io.EOF {
+				panic(fmt.Sprintf("unexpected error when reading the option's Length field for %s: %s", kind, err))
+			}
+
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Length field for %s: %w", kind, io.ErrUnexpectedEOF)
+		}
+
+		// This would indicate an erroneous NDP option as the Length field should
+		// never be 0.
+		if length == 0 {
+			return nil, true, fmt.Errorf("zero valued Length field for %s: %w", kind, ErrNDPOptMalformedHeader)
+		}
+
+		// Get the body. Length counts the Type and Length bytes just read.
+		numBytes := int(length) * lengthByteUnits
+		numBodyBytes := numBytes - 2
+		body := i.opts.Next(numBodyBytes)
+		if len(body) < numBodyBytes {
+			return nil, true, fmt.Errorf("unexpectedly exhausted buffer when reading the option's Body for %s: %w", kind, io.ErrUnexpectedEOF)
+		}
+
+		switch kind {
+		case NDPSourceLinkLayerAddressOptionType:
+			return NDPSourceLinkLayerAddressOption(body), false, nil
+
+		case NDPTargetLinkLayerAddressOptionType:
+			return NDPTargetLinkLayerAddressOption(body), false, nil
+
+		case NDPPrefixInformationType:
+			// Make sure the length of a Prefix Information option
+			// body is ndpPrefixInformationLength, as per RFC 4861
+			// section 4.6.2.
+			if numBodyBytes != ndpPrefixInformationLength {
+				return nil, true, fmt.Errorf("got %d bytes for NDP Prefix Information option's body, expected %d bytes: %w", numBodyBytes, ndpPrefixInformationLength, ErrNDPOptMalformedBody)
+			}
+
+			return NDPPrefixInformation(body), false, nil
+
+		case NDPRecursiveDNSServerOptionType:
+			opt := NDPRecursiveDNSServer(body)
+			if err := opt.checkAddresses(); err != nil {
+				return nil, true, err
+			}
+
+			return opt, false, nil
+
+		case NDPDNSSearchListOptionType:
+			opt := NDPDNSSearchList(body)
+			if err := opt.checkDomainNames(); err != nil {
+				return nil, true, err
+			}
+
+			return opt, false, nil
+
+		default:
+			// We do not yet recognize the option, just skip for
+			// now. This is okay because RFC 4861 allows us to
+			// skip/ignore any unrecognized options. However,
+			// we MUST recognize all the options in RFC 4861.
+			//
+			// TODO(b/141487990): Handle all NDP options as defined
+			//                    by RFC 4861.
+		}
+	}
+}
+
+// NDPOptions is a buffer of serialized NDP options, as defined by RFC 4861 section 4.6.
+type NDPOptions []byte
+
+// Iter returns an NDPOptionIterator positioned at the start of b.
+//
+// When check is true, b is first walked to completion with a separate, throwaway
+// iterator; the first error that walk reports (if any) is returned.
+//
+// The returned iterator is unconsumed either way. See NDPOptionIterator.
+func (b NDPOptions) Iter(check bool) (NDPOptionIterator, error) {
+	iter := NDPOptionIterator{opts: bytes.NewBuffer(b)}
+	if !check {
+		return iter, nil
+	}
+
+	// Validate with a second iterator so that iter is left untouched.
+	probe := NDPOptionIterator{opts: bytes.NewBuffer(b)}
+	for {
+		_, done, err := probe.Next()
+		if err != nil {
+			return iter, err
+		}
+
+		if done {
+			return iter, nil
+		}
+	}
+}
+
+// Serialize writes every option in s into b, back to back, and returns the
+// total number of bytes written.
+//
+// Each option is emitted as a Type byte, a Length byte (in lengthByteUnits
+// units), the option body, and zero padding up to the padded length. Options
+// whose padded length is 0 are skipped.
+//
+// Serialize may panic if b is too small to hold all the options in s; see
+// NDPOptionsSerializer.Length for the required size.
+func (b NDPOptions) Serialize(s NDPOptionsSerializer) int {
+	written := 0
+
+	for _, opt := range s {
+		size := paddedLength(opt)
+
+		if size == 0 {
+			// Nothing to emit for this option.
+			continue
+		}
+
+		// Header: Type followed by Length. The conversion cannot truncate
+		// because paddedLength returns 0 for anything above 255 units.
+		b[0] = byte(opt.Type())
+		b[1] = uint8(size / lengthByteUnits)
+
+		// Body, then clear whatever is left of the padded region.
+		bodyEnd := 2 + opt.serializeInto(b[2:])
+		for pad := bodyEnd; pad < size; pad++ {
+			b[pad] = 0
+		}
+
+		// Advance past the option that was just written.
+		written += size
+		b = b[size:]
+	}
+
+	return written
+}
+
+// NDPOption is the set of functions to be implemented by all NDP option types.
+type NDPOption interface {
+	fmt.Stringer
+
+	// Type returns the type of the receiver.
+	Type() NDPOptionIdentifier
+
+	// Length returns the length of the body of the receiver, in bytes.
+	Length() int
+
+	// serializeInto serializes the body of the receiver into the provided
+	// byte buffer; the Type and Length bytes are written by the caller.
+	//
+	// Note, the caller MUST provide a byte buffer with size of at least
+	// Length. Implementers of this function may assume that the byte buffer
+	// is of sufficient size. serializeInto MAY panic if the provided byte
+	// buffer is not of sufficient size.
+	//
+	// serializeInto will return the number of bytes that was used to
+	// serialize the receiver. Implementers must only use the number of
+	// bytes required to serialize the receiver. Callers MAY provide a
+	// larger buffer than required to serialize into.
+	serializeInto([]byte) int
+}
+
+// paddedLength returns the length of o, in bytes, with any padding bytes, if
+// required.
+func paddedLength(o NDPOption) int {
+	l := o.Length()
+
+	if l == 0 {
+		return 0
+	}
+
+	// Length excludes the 2 Type and Length bytes.
+	l += 2
+
+	// Add extra bytes if needed to make sure the option is
+	// lengthByteUnits-byte aligned. We do this by adding lengthByteUnits-1
+	// to l and then stripping off the last few LSBits from l. This will
+	// make sure that l is rounded up to the nearest unit of
+	// lengthByteUnits. This works since lengthByteUnits is a power of 2
+	// (= 8).
+	mask := lengthByteUnits - 1
+	l += mask
+	l &^= mask
+
+	if l/lengthByteUnits > 255 {
+		// Should never happen because an option can only have a max
+		// value of 255 for its Length field, so just return 0 so this
+		// option does not get serialized.
+		//
+		// Returning 0 here will make sure that this option does not get
+		// serialized when NDPOptions.Serialize is called with the
+		// NDPOptionsSerializer that holds this option, effectively
+		// skipping this option during serialization. Also note that
+		// a value of zero for the Length field in an NDP option is
+		// invalid so this is another sign to the caller that this NDP
+		// option is malformed, as per RFC 4861 section 4.6.
+		return 0
+	}
+
+	return l
+}
+
+// NDPOptionsSerializer is an ordered list of NDP options to be serialized.
+type NDPOptionsSerializer []NDPOption
+
+// Length returns the total number of bytes required to serialize.
+func (b NDPOptionsSerializer) Length() int {
+	l := 0
+
+	for _, o := range b {
+		l += paddedLength(o)
+	}
+
+	return l
+}
+
+// NDPSourceLinkLayerAddressOption is the NDP Source Link Layer Option
+// as defined by RFC 4861 section 4.6.1.
+//
+// It is the option body: the X bytes following the NDP option's Type and Length
+// fields, where X is the value in Length multiplied by lengthByteUnits, minus 2.
+type NDPSourceLinkLayerAddressOption tcpip.LinkAddress
+
+// Type implements NDPOption.Type; it reports the Source Link Layer Address type.
+func (o NDPSourceLinkLayerAddressOption) Type() NDPOptionIdentifier {
+	return NDPSourceLinkLayerAddressOptionType
+}
+
+// Length implements NDPOption.Length; it is the byte length of the held address.
+func (o NDPSourceLinkLayerAddressOption) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto by copying o into b.
+func (o NDPSourceLinkLayerAddressOption) serializeInto(b []byte) int {
+	return copy(b, o)
+}
+
+// String implements fmt.Stringer.String, printing o's type and link address.
+func (o NDPSourceLinkLayerAddressOption) String() string {
+	return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o))
+}
+
+// EthernetAddress returns the leading EthernetAddressSize bytes of the option
+// body as an ethernet (MAC) address.
+//
+// Any bytes beyond EthernetAddressSize are ignored, as they are not needed
+// for an Ethernet address. A body that is too short yields an empty address.
+func (o NDPSourceLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
+	if len(o) < EthernetAddressSize {
+		return tcpip.LinkAddress([]byte(nil))
+	}
+
+	return tcpip.LinkAddress(o[:EthernetAddressSize])
+}
+
+// NDPTargetLinkLayerAddressOption is the NDP Target Link Layer Option
+// as defined by RFC 4861 section 4.6.1.
+//
+// It is the option body: the X bytes following the NDP option's Type and Length
+// fields, where X is the value in Length multiplied by lengthByteUnits, minus 2.
+type NDPTargetLinkLayerAddressOption tcpip.LinkAddress
+
+// Type implements NDPOption.Type; it reports the Target Link Layer Address type.
+func (o NDPTargetLinkLayerAddressOption) Type() NDPOptionIdentifier {
+	return NDPTargetLinkLayerAddressOptionType
+}
+
+// Length implements NDPOption.Length; it is the byte length of the held address.
+func (o NDPTargetLinkLayerAddressOption) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto by copying o into b.
+func (o NDPTargetLinkLayerAddressOption) serializeInto(b []byte) int {
+	return copy(b, o)
+}
+
+// String implements fmt.Stringer.String, printing o's type and link address.
+func (o NDPTargetLinkLayerAddressOption) String() string {
+	return fmt.Sprintf("%T(%s)", o, tcpip.LinkAddress(o))
+}
+
+// EthernetAddress returns the leading EthernetAddressSize bytes of the option
+// body as an ethernet (MAC) address.
+//
+// Any bytes beyond EthernetAddressSize are ignored, as they are not needed
+// for an Ethernet address. A body that is too short yields an empty address.
+func (o NDPTargetLinkLayerAddressOption) EthernetAddress() tcpip.LinkAddress {
+	if len(o) < EthernetAddressSize {
+		return tcpip.LinkAddress([]byte(nil))
+	}
+
+	return tcpip.LinkAddress(o[:EthernetAddressSize])
+}
+
+// NDPPrefixInformation is the body of the NDP Prefix Information option as
+// defined by RFC 4861 section 4.6.2.
+//
+// The length, in bytes, of a valid NDP Prefix Information option body MUST be
+// ndpPrefixInformationLength bytes. The accessors index o without checking it.
+type NDPPrefixInformation []byte
+
+// Type implements NDPOption.Type; it reports the Prefix Information type.
+func (o NDPPrefixInformation) Type() NDPOptionIdentifier {
+	return NDPPrefixInformationType
+}
+
+// Length implements NDPOption.Length; the result is fixed, regardless of len(o).
+func (o NDPPrefixInformation) Length() int {
+	return ndpPrefixInformationLength
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPPrefixInformation) serializeInto(b []byte) int {
+	n := copy(b, o)
+
+	// Reserved1 shares the flags byte with the O and A flags; clear only it.
+	b[ndpPrefixInformationFlagsOffset] &^= ndpPrefixInformationReserved1FlagsMask
+
+	// Reserved2 is a run of whole bytes; clear each of them in turn.
+	rsvd := b[ndpPrefixInformationReserved2Offset:][:ndpPrefixInformationReserved2Length]
+	for idx := 0; idx < len(rsvd); idx++ {
+		rsvd[idx] = 0
+	}
+
+	return n
+}
+
+// String implements fmt.Stringer.String.
+func (o NDPPrefixInformation) String() string {
+	return fmt.Sprintf("%T(O=%t, A=%t, PL=%s, VL=%s, Prefix=%s)",
+		o,
+		o.OnLinkFlag(),
+		o.AutonomousAddressConfigurationFlag(),
+		o.PreferredLifetime(),
+		o.ValidLifetime(),
+		o.Subnet())
+}
+
+// PrefixLength returns the Prefix Length field: the number of leading bits in
+// the Prefix that are valid.
+//
+// Valid values are in the range [0, 128], but o may not always contain valid
+// values. It is up to the caller to validate the Prefix Information option.
+func (o NDPPrefixInformation) PrefixLength() uint8 {
+	return o[ndpPrefixInformationPrefixLengthOffset]
+}
+
+// OnLinkFlag returns true if the prefix is considered on-link. On-link means
+// that a forwarding node is not needed to send packets to other nodes on the
+// same prefix.
+//
+// Note, when this function returns false, no statement is made about the
+// on-link property of a prefix. That is, if OnLinkFlag returns false, the
+// caller MUST NOT conclude that the prefix is off-link and MUST NOT update any
+// previously stored state for this prefix about its on-link status.
+func (o NDPPrefixInformation) OnLinkFlag() bool {
+	return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationOnLinkFlagMask != 0
+}
+
+// AutonomousAddressConfigurationFlag returns true if the A flag is set, i.e. the
+// prefix can be used for Stateless Address Auto-Configuration (RFC 4862).
+func (o NDPPrefixInformation) AutonomousAddressConfigurationFlag() bool {
+	return o[ndpPrefixInformationFlagsOffset]&ndpPrefixInformationAutoAddrConfFlagMask != 0
+}
+
+// ValidLifetime returns the length of time that the prefix is valid for the
+// purpose of on-link determination, relative to the send time of the packet
+// that carried the Prefix Information option.
+//
+// The wire field holds seconds, as per RFC 4861 section 4.6.2. A value of 0
+// implies the prefix should not be considered as on-link, and a value of
+// infinity/forever is represented by NDPInfiniteLifetime.
+func (o NDPPrefixInformation) ValidLifetime() time.Duration {
+	secs := binary.BigEndian.Uint32(o[ndpPrefixInformationValidLifetimeOffset:])
+	return time.Second * time.Duration(secs)
+}
+
+// PreferredLifetime returns how long an address generated from the prefix via
+// Stateless Address Auto-Configuration remains preferred, relative to the send
+// time of the packet that carried the Prefix Information option.
+//
+// The wire field holds seconds, as per RFC 4861 section 4.6.2. A value of 0
+// implies that addresses generated from the prefix should no longer remain
+// preferred, and a value of infinity is represented by NDPInfiniteLifetime.
+//
+// Also note that the value of this field MUST NOT exceed the Valid Lifetime
+// field to avoid preferring addresses that are no longer valid, for the
+// purpose of Stateless Address Auto-Configuration. That is not enforced
+// here; it is up to the caller to check.
+func (o NDPPrefixInformation) PreferredLifetime() time.Duration {
+	secs := binary.BigEndian.Uint32(o[ndpPrefixInformationPreferredLifetimeOffset:])
+	return time.Second * time.Duration(secs)
+}
+
+// Prefix returns the IPv6AddressSize bytes of the Prefix field: an IPv6 address
+// or a prefix of one. The Prefix Length field (see
+// NDPPrefixInformation.PrefixLength) holds the number of valid leading bits.
+//
+// Hosts SHOULD ignore an NDP Prefix Information option where the Prefix field
+// holds the link-local prefix (fe80::).
+func (o NDPPrefixInformation) Prefix() tcpip.Address {
+	return tcpip.Address(o[ndpPrefixInformationPrefixOffset:][:IPv6AddressSize])
+}
+
+// Subnet returns the Prefix field and Prefix Length field represented in a
+// tcpip.Subnet.
+func (o NDPPrefixInformation) Subnet() tcpip.Subnet {
+	addrWithPrefix := tcpip.AddressWithPrefix{
+		Address:   o.Prefix(),
+		PrefixLen: int(o.PrefixLength()),
+	}
+	return addrWithPrefix.Subnet()
+}
+
+// NDPRecursiveDNSServer is the body of the NDP Recursive DNS Server option, as
+// defined by RFC 8106 section 5.1.
+//
+// To make sure that the option meets its minimum length and does not end in the
+// middle of a DNS server's IPv6 address, the length of a valid
+// NDPRecursiveDNSServer must meet the following constraint:
+//   (Length - ndpRecursiveDNSServerAddressesOffset) % IPv6AddressSize == 0
+type NDPRecursiveDNSServer []byte
+
+// Type returns the identifier of the NDP Recursive DNS Server option.
+//
+// Type implements NDPOption.Type.
+func (NDPRecursiveDNSServer) Type() NDPOptionIdentifier {
+	return NDPRecursiveDNSServerOptionType
+}
+
+// Length implements NDPOption.Length; it is the byte length of the option body.
+func (o NDPRecursiveDNSServer) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPRecursiveDNSServer) serializeInto(b []byte) int {
+	n := copy(b, o)
+
+	// The bytes ahead of the Lifetime field are reserved; clear them.
+	for idx := 0; idx < ndpRecursiveDNSServerLifetimeOffset; idx++ {
+		b[idx] = 0
+	}
+
+	return n
+}
+
+// String implements fmt.Stringer.String.
+func (o NDPRecursiveDNSServer) String() string {
+	lifetime := o.Lifetime()
+	servers, err := o.Addresses()
+	if err == nil {
+		return fmt.Sprintf("%T(%s valid for %s)", o, servers, lifetime)
+	}
+	return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lifetime, err)
+}
+
+// Lifetime returns the length of time that the DNS server addresses
+// in this option may be used for name resolution.
+//
+// The wire field holds seconds, as per RFC 8106 section 5.1. A value of 0
+// implies the addresses should no longer be used, and a value of
+// infinity/forever is represented by NDPInfiniteLifetime.
+//
+// Lifetime may panic if o is too short to hold the Lifetime field.
+func (o NDPRecursiveDNSServer) Lifetime() time.Duration {
+	secs := binary.BigEndian.Uint32(o[ndpRecursiveDNSServerLifetimeOffset:])
+	return time.Second * time.Duration(secs)
+}
+
+// Addresses returns the recursive DNS server IPv6 addresses that may be used
+// for name resolution; they MAY be link-local. addrs is read only after the
+// iteration finishes: Go does not order a variable read against a call.
+func (o NDPRecursiveDNSServer) Addresses() ([]tcpip.Address, error) {
+	var addrs []tcpip.Address
+	err := o.iterAddresses(func(addr tcpip.Address) { addrs = append(addrs, addr) })
+	return addrs, err
+}
+
+// checkAddresses validates the addresses in an NDP Recursive DNS Server option
+// by iterating over them without a callback; it returns the first error found.
+func (o NDPRecursiveDNSServer) checkAddresses() error {
+	return o.iterAddresses(nil)
+}
+
+// iterAddresses walks the addresses in an NDP Recursive DNS Server option,
+// validating each as a unicast IPv6 address and, when fn is non-nil, passing
+// it to fn. It stops at the first error.
+//
+// Note, the addresses MAY be link-local addresses.
+func (o NDPRecursiveDNSServer) iterAddresses(fn func(tcpip.Address)) error {
+	if len(o) < minNDPRecursiveDNSServerBodySize {
+		return fmt.Errorf("got %d bytes for NDP Recursive DNS Server option's body, expected at least %d bytes: %w", len(o), minNDPRecursiveDNSServerBodySize, io.ErrUnexpectedEOF)
+	}
+
+	// Everything after the fixed fields must be a whole number of addresses.
+	addrBytes := o[ndpRecursiveDNSServerAddressesOffset:]
+	if len(addrBytes)%IPv6AddressSize != 0 {
+		return fmt.Errorf("NDP Recursive DNS Server option's body ends in the middle of an IPv6 address (addresses body size = %d bytes): %w", len(addrBytes), ErrNDPOptMalformedBody)
+	}
+
+	for idx := 0; idx*IPv6AddressSize < len(addrBytes); idx++ {
+		start := idx * IPv6AddressSize
+		addr := tcpip.Address(addrBytes[start:][:IPv6AddressSize])
+		if !IsV6UnicastAddress(addr) {
+			return fmt.Errorf("%d-th address (%s) in NDP Recursive DNS Server option is not a valid unicast IPv6 address: %w", idx, addr, ErrNDPOptMalformedBody)
+		}
+
+		if fn != nil {
+			fn(addr)
+		}
+	}
+
+	return nil
+}
+
+// NDPDNSSearchList is the body of the NDP DNS Search List option, as defined
+// by RFC 8106 section 5.2.
+type NDPDNSSearchList []byte
+
+// Type implements NDPOption.Type; it reports the DNS Search List option type.
+func (o NDPDNSSearchList) Type() NDPOptionIdentifier {
+	return NDPDNSSearchListOptionType
+}
+
+// Length implements NDPOption.Length; it is the byte length of the option body.
+func (o NDPDNSSearchList) Length() int {
+	return len(o)
+}
+
+// serializeInto implements NDPOption.serializeInto.
+func (o NDPDNSSearchList) serializeInto(b []byte) int {
+	n := copy(b, o)
+
+	// The bytes ahead of the Lifetime field are reserved; clear them.
+	for idx := 0; idx < ndpDNSSearchListLifetimeOffset; idx++ {
+		b[idx] = 0
+	}
+
+	return n
+}
+
+// String implements fmt.Stringer.String.
+func (o NDPDNSSearchList) String() string {
+	lifetime := o.Lifetime()
+	names, err := o.DomainNames()
+	if err == nil {
+		return fmt.Sprintf("%T(%s valid for %s)", o, names, lifetime)
+	}
+	return fmt.Sprintf("%T([] valid for %s; err = %s)", o, lifetime, err)
+}
+
+// Lifetime returns the length of time that the DNS search list of domain names
+// in this option may be used for name resolution.
+//
+// The wire field holds seconds, as per RFC 8106 section 5.2. A value of 0 means
+// the names should no longer be used; infinity is NDPInfiniteLifetime.
+func (o NDPDNSSearchList) Lifetime() time.Duration {
+	secs := binary.BigEndian.Uint32(o[ndpDNSSearchListLifetimeOffset:])
+	return time.Second * time.Duration(secs)
+}
+
+// DomainNames parses the backing buffer as outlined by RFC 1035 section 3.1
+// and returns the DNS search list, with all domain names in lower case. The
+// list is read only after parsing: Go does not order a variable read vs a call.
+func (o NDPDNSSearchList) DomainNames() ([]string, error) {
+	var domainNames []string
+	err := o.iterDomainNames(func(domainName string) { domainNames = append(domainNames, domainName) })
+	return domainNames, err
+}
+
+// checkDomainNames validates the domain names in an NDP DNS Search List option
+// by iterating over them without a callback; it returns the first error found.
+func (o NDPDNSSearchList) checkDomainNames() error {
+	return o.iterDomainNames(nil)
+}
+
+// iterDomainNames parses and validates the domain names in an NDP DNS Search
+// List option and, when fn is non-nil, calls fn with each non-empty name.
+func (o NDPDNSSearchList) iterDomainNames(fn func(string)) error {
+	if l := len(o); l < minNDPDNSSearchListBodySize {
+		return fmt.Errorf("got %d bytes for NDP DNS Search List option's body, expected at least %d bytes: %w", l, minNDPDNSSearchListBodySize, io.ErrUnexpectedEOF)
+	}
+
+	var searchList bytes.Reader
+	searchList.Reset(o[ndpDNSSearchListDomainNamesOffset:])
+
+	var scratch [maxDomainNameLength]byte
+	domainName := bytes.NewBuffer(scratch[:0])
+
+	// Parse the domain names, as per RFC 1035 section 3.1.
+	for searchList.Len() != 0 {
+		domainName.Reset()
+
+		// Parse a label within a domain name, as per RFC 1035 section 3.1.
+		for {
+			// The first byte is the label length.
+			labelLenByte, err := searchList.ReadByte()
+			if err != nil {
+				if err != io.EOF {
+					// ReadByte should only ever return nil or io.EOF.
+					panic(fmt.Sprintf("unexpected error when reading a label's length: %s", err))
+				}
+
+				// We use io.ErrUnexpectedEOF as exhausting the buffer is unexpected
+				// once we start parsing a domain name; we expect the buffer to contain
+				// enough bytes for the whole domain name.
+				return fmt.Errorf("unexpected exhausted buffer while parsing a new label for a domain from NDP Search List option: %w", io.ErrUnexpectedEOF)
+			}
+			labelLen := int(labelLenByte)
+
+			// A zero-length label implies the end of a domain name.
+			if labelLen == 0 {
+				// If the domain name is empty or we have no callback function, do
+				// nothing further with the current domain name.
+				if domainName.Len() == 0 || fn == nil {
+					break
+				}
+
+				// Ignore the trailing period in the parsed domain name.
+				domainName.Truncate(domainName.Len() - 1)
+				fn(domainName.String())
+				break
+			}
+
+			// The label's length must not exceed the maximum length for a label.
+			if labelLen > maxDomainNameLabelLength {
+				return fmt.Errorf("label length of %d bytes is greater than the max label length of %d bytes for an NDP Search List option: %w", labelLen, maxDomainNameLabelLength, ErrNDPOptMalformedBody)
+			}
+
+			// The label (and trailing period) must not make the domain name too long.
+			if labelLen+1 > domainName.Cap()-domainName.Len() {
+				return fmt.Errorf("label would make an NDP Search List option's domain name longer than the max domain name length of %d bytes: %w", maxDomainNameLength, ErrNDPOptMalformedBody)
+			}
+
+			// Copy the label and add a trailing period.
+			for i := 0; i < labelLen; i++ {
+				b, err := searchList.ReadByte()
+				if err != nil {
+					if err != io.EOF {
+						panic(fmt.Sprintf("unexpected error when reading domain name's label: %s", err))
+					}
+
+					return fmt.Errorf("read %d out of %d bytes for a domain name's label from NDP Search List option: %w", i, labelLen, io.ErrUnexpectedEOF)
+				}
+
+				// As per RFC 1035 section 2.3.1:
+				//  1) the label must only contain ASCII letters, digits and
+				//     hyphens
+				//  2) the first character in a label must be a letter
+				//  3) the last character in a label must be a letter or digit
+
+				if !isLetter(b) {
+					if i == 0 {
+						return fmt.Errorf("first character of a domain name's label in an NDP Search List option must be a letter, got character code = %d: %w", b, ErrNDPOptMalformedBody)
+					}
+
+					if b == '-' {
+						if i == labelLen-1 {
+							return fmt.Errorf("last character of a domain name's label in an NDP Search List option must not be a hyphen (-): %w", ErrNDPOptMalformedBody)
+						}
+					} else if !isDigit(b) {
+						return fmt.Errorf("domain name's label in an NDP Search List option may only contain letters, digits and hyphens, got character code = %d: %w", b, ErrNDPOptMalformedBody)
+					}
+				}
+
+				// If b is an upper case character, make it lower case.
+				if isUpperLetter(b) {
+					b = b - 'A' + 'a'
+				}
+
+				if err := domainName.WriteByte(b); err != nil {
+					panic(fmt.Sprintf("unexpected error writing label to domain name buffer: %s", err))
+				}
+			}
+			if err := domainName.WriteByte('.'); err != nil {
+				panic(fmt.Sprintf("unexpected error writing trailing period to domain name buffer: %s", err))
+			}
+		}
+	}
+
+	return nil
+}
+
+func isLetter(b byte) bool {
+	return isUpperLetter(b) || ('a' <= b && b <= 'z')
+}
+
+func isUpperLetter(b byte) bool {
+	return 'A' <= b && b <= 'Z'
+}
+
+func isDigit(b byte) bool {
+	return '0' <= b && b <= '9'
+}
diff --git a/pkg/tcpip/header/ndp_router_advert.go b/pkg/tcpip/header/ndp_router_advert.go
new file mode 100644
index 000000000..bf7610863
--- /dev/null
+++ b/pkg/tcpip/header/ndp_router_advert.go
@@ -0,0 +1,112 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+	"time"
+)
+
+// NDPRouterAdvert is the body of an ICMPv6 packet that carries an NDP Router
+// Advertisement message; the ICMPv6 header itself is not part of the slice.
+//
+// See RFC 4861 section 4.2 for the wire format.
+type NDPRouterAdvert []byte
+
+const (
+	// NDPRAMinimumSize is the smallest valid size, in bytes, of an NDP Router
+	// Advertisement message (the body of an ICMPv6 packet).
+	NDPRAMinimumSize = 12
+
+	// ndpRACurrHopLimitOffset is the offset of the 1-byte Curr Hop Limit
+	// field within an NDPRouterAdvert.
+	ndpRACurrHopLimitOffset = 0
+
+	// ndpRAFlagsOffset is the offset of the byte holding the NDP RA
+	// bit-fields/flags within an NDPRouterAdvert.
+	ndpRAFlagsOffset = 1
+
+	// ndpRAManagedAddrConfFlagMask selects the Managed Address
+	// Configuration flag, the most significant bit of the flags byte of an
+	// NDPRouterAdvert.
+	ndpRAManagedAddrConfFlagMask = 1 << 7
+
+	// ndpRAOtherConfFlagMask selects the Other Configuration flag, the
+	// second most significant bit of the flags byte of an NDPRouterAdvert.
+	ndpRAOtherConfFlagMask = 1 << 6
+
+	// ndpRARouterLifetimeOffset is the offset of the first byte of the
+	// 2-byte Router Lifetime field within an NDPRouterAdvert.
+	ndpRARouterLifetimeOffset = 2
+
+	// ndpRAReachableTimeOffset is the offset of the first byte of the
+	// 4-byte Reachable Time field within an NDPRouterAdvert.
+	ndpRAReachableTimeOffset = 4
+
+	// ndpRARetransTimerOffset is the offset of the first byte of the
+	// 4-byte Retrans Timer field within an NDPRouterAdvert.
+	ndpRARetransTimerOffset = 8
+
+	// ndpRAOptionsOffset is the offset at which the NDP options begin in
+	// an NDPRouterAdvert.
+	ndpRAOptionsOffset = 12
+)
+
+// CurrHopLimit returns the Curr Hop Limit field, the first byte of the body.
+func (b NDPRouterAdvert) CurrHopLimit() uint8 {
+	return b[ndpRACurrHopLimitOffset]
+}
+
+// ManagedAddrConfFlag reports whether the Managed Address Configuration flag is set.
+func (b NDPRouterAdvert) ManagedAddrConfFlag() bool {
+	flags := b[ndpRAFlagsOffset]
+	return flags&ndpRAManagedAddrConfFlagMask != 0
+}
+
+// OtherConfFlag reports whether the Other Configuration flag is set.
+func (b NDPRouterAdvert) OtherConfFlag() bool {
+	return b[ndpRAFlagsOffset]&ndpRAOtherConfFlagMask == ndpRAOtherConfFlagMask
+}
+
+// RouterLifetime returns how long the sender may be used as a default router.
+// A value of 0 means the source of the Router Advertisement is not a default
+// router and SHOULD NOT appear on the default router list; it says nothing
+// about the other information contained in the Router Advertisement.
+func (b NDPRouterAdvert) RouterLifetime() time.Duration {
+	// The field is the time in seconds, as per RFC 4861 section 4.2.
+	secs := binary.BigEndian.Uint16(b[ndpRARouterLifetimeOffset:])
+	return time.Duration(secs) * time.Second
+}
+
+// ReachableTime returns how long a node assumes a neighbor stays reachable
+// after a reachability confirmation; 0 means the source left it unspecified.
+// The field holds milliseconds, as per RFC 4861 section 4.2.
+func (b NDPRouterAdvert) ReachableTime() time.Duration {
+	ms := binary.BigEndian.Uint32(b[ndpRAReachableTimeOffset:])
+	return time.Duration(ms) * time.Millisecond
+}
+
+// RetransTimer returns the interval between retransmitted Neighbor
+// Solicitation messages; 0 means the source left it unspecified.
+// The field holds milliseconds, as per RFC 4861 section 4.2.
+func (b NDPRouterAdvert) RetransTimer() time.Duration {
+	ms := binary.BigEndian.Uint32(b[ndpRARetransTimerOffset:])
+	return time.Duration(ms) * time.Millisecond
+}
+
+// Options returns the options body of the message as an NDPOptions.
+func (b NDPRouterAdvert) Options() NDPOptions {
+	return NDPOptions(b[ndpRAOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_router_solicit.go b/pkg/tcpip/header/ndp_router_solicit.go
new file mode 100644
index 000000000..9e67ba95d
--- /dev/null
+++ b/pkg/tcpip/header/ndp_router_solicit.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+// NDPRouterSolicit is the body of an ICMPv6 packet that carries an NDP Router
+// Solicitation message; the ICMPv6 header itself is not part of the slice.
+//
+// See RFC 4861 section 4.1 for the wire format.
+type NDPRouterSolicit []byte
+
+const (
+	// NDPRSMinimumSize is the smallest valid size, in bytes, of an NDP Router
+	// Solicitation message (the body of an ICMPv6 packet).
+	NDPRSMinimumSize = 4
+
+	// ndpRSOptionsOffset is the offset at which the NDP options begin in
+	// an NDPRouterSolicit.
+	ndpRSOptionsOffset = 4
+)
+
+// Options returns the options body of the message as an NDPOptions.
+func (b NDPRouterSolicit) Options() NDPOptions {
+	return NDPOptions(b[ndpRSOptionsOffset:])
+}
diff --git a/pkg/tcpip/header/ndp_test.go b/pkg/tcpip/header/ndp_test.go
new file mode 100644
index 000000000..dc4591253
--- /dev/null
+++ b/pkg/tcpip/header/ndp_test.go
@@ -0,0 +1,1521 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"regexp"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// TestNDPNeighborSolicit checks NDPNeighborSolicit's Target Address accessors.
+func TestNDPNeighborSolicit(t *testing.T) {
+	buf := []byte{
+		0, 0, 0, 0,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+	}
+
+	// The getter must decode the address already present in the buffer.
+	ns := NDPNeighborSolicit(buf)
+	initial := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
+	if got := ns.TargetAddress(); got != initial {
+		t.Errorf("got ns.TargetAddress = %s, want %s", got, initial)
+	}
+
+	// The setter must be observable through the getter.
+	updated := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
+	ns.SetTargetAddress(updated)
+	if got := ns.TargetAddress(); got != updated {
+		t.Errorf("got ns.TargetAddress = %s, want %s", got, updated)
+	}
+	// The setter must also have written through to the backing buffer.
+	if got := tcpip.Address(buf[ndpNSTargetAddessOffset:][:IPv6AddressSize]); got != updated {
+		t.Errorf("got targetaddress buffer = %s, want %s", got, updated)
+	}
+}
+
+// TestNDPNeighborAdvert checks NDPNeighborAdvert's address and flag accessors.
+func TestNDPNeighborAdvert(t *testing.T) {
+	buf := []byte{
+		160, 0, 0, 0,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+	}
+
+	// The getter must decode the address already present in the buffer.
+	na := NDPNeighborAdvert(buf)
+	initial := tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10")
+	if got := na.TargetAddress(); got != initial {
+		t.Errorf("got TargetAddress = %s, want %s", got, initial)
+	}
+
+	// The initial flags byte is expected to have the Router flag set.
+	if !na.RouterFlag() {
+		t.Errorf("got RouterFlag = false, want = true")
+	}
+
+	// The initial flags byte is expected to have the Solicited flag clear.
+	if na.SolicitedFlag() {
+		t.Errorf("got SolicitedFlag = true, want = false")
+	}
+
+	// The initial flags byte is expected to have the Override flag set.
+	if !na.OverrideFlag() {
+		t.Errorf("got OverrideFlag = false, want = true")
+	}
+
+	// The address setter must be observable through the getter.
+	updated := tcpip.Address("\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x11")
+	na.SetTargetAddress(updated)
+	if got := na.TargetAddress(); got != updated {
+		t.Errorf("got TargetAddress = %s, want %s", got, updated)
+	}
+	// The setter must also have written through to the backing buffer.
+	if got := tcpip.Address(buf[ndpNATargetAddressOffset:][:IPv6AddressSize]); got != updated {
+		t.Errorf("got targetaddress buffer = %s, want %s", got, updated)
+	}
+
+	// Clearing the Router flag must be observable through the getter.
+	na.SetRouterFlag(false)
+	if na.RouterFlag() {
+		t.Errorf("got RouterFlag = true, want = false")
+	}
+
+	// Setting the Solicited flag must be observable through the getter.
+	na.SetSolicitedFlag(true)
+	if !na.SolicitedFlag() {
+		t.Errorf("got SolicitedFlag = false, want = true")
+	}
+
+	// Clearing the Override flag must be observable through the getter.
+	na.SetOverrideFlag(false)
+	if na.OverrideFlag() {
+		t.Errorf("got OverrideFlag = true, want = false")
+	}
+
+	// The backing buffer's flags byte must reflect all three flag updates.
+	if got := buf[ndpNAFlagsOffset]; got != 64 {
+		t.Errorf("got flags byte = %d, want = 64", got)
+	}
+}
+
+// TestNDPRouterAdvert tests the getters of NDPRouterAdvert on an option-less message.
+func TestNDPRouterAdvert(t *testing.T) {
+	b := []byte{
+		64, 128, 1, 2,
+		3, 4, 5, 6,
+		7, 8, 9, 10,
+	}
+	ra := NDPRouterAdvert(b)
+
+	if got := ra.CurrHopLimit(); got != 64 {
+		t.Errorf("got ra.CurrHopLimit = %d, want = 64", got)
+	}
+	if got := ra.ManagedAddrConfFlag(); !got {
+		t.Errorf("got ManagedAddrConfFlag = false, want = true")
+	}
+	if got := ra.OtherConfFlag(); got {
+		t.Errorf("got OtherConfFlag = true, want = false")
+	}
+
+	if got, want := ra.RouterLifetime(), time.Second*258; got != want {
+		t.Errorf("got ra.RouterLifetime = %s, want = %s", got, want)
+	}
+	if got, want := ra.ReachableTime(), time.Millisecond*50595078; got != want {
+		t.Errorf("got ra.ReachableTime = %s, want = %s", got, want)
+	}
+	if got, want := ra.RetransTimer(), time.Millisecond*117967114; got != want {
+		t.Errorf("got ra.RetransTimer = %s, want = %s", got, want)
+	}
+
+	if got := len(ra.Options()); got != 0 {
+		t.Errorf("got len(ra.Options()) = %d, want = 0", got)
+	}
+}
+
+// TestNDPSourceLinkLayerAddressOptionEthernetAddress verifies the Ethernet
+// address extracted from an NDPSourceLinkLayerAddressOption body.
+func TestNDPSourceLinkLayerAddressOptionEthernetAddress(t *testing.T) {
+	cases := []struct {
+		name     string
+		buf      []byte
+		expected tcpip.LinkAddress
+	}{
+		{
+			name:     "ValidMAC",
+			buf:      []byte{1, 2, 3, 4, 5, 6},
+			expected: tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+		{
+			name:     "SLLBodyTooShort",
+			buf:      []byte{1, 2, 3, 4, 5},
+			expected: tcpip.LinkAddress([]byte(nil)),
+		},
+		{
+			name:     "SLLBodyLargerThanNeeded",
+			buf:      []byte{1, 2, 3, 4, 5, 6, 7, 8},
+			expected: tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			sll := NDPSourceLinkLayerAddressOption(tc.buf)
+			if got := sll.EthernetAddress(); got != tc.expected {
+				t.Errorf("got sll.EthernetAddress = %s, want = %s", got, tc.expected)
+			}
+		})
+	}
+}
+
+// TestNDPSourceLinkLayerAddressOptionSerialize verifies the serialized form of
+// an NDPSourceLinkLayerAddressOption and that it can be iterated back.
+func TestNDPSourceLinkLayerAddressOptionSerialize(t *testing.T) {
+	cases := []struct {
+		name        string
+		buf         []byte
+		expectedBuf []byte
+		addr        tcpip.LinkAddress
+	}{
+		{
+			name:        "Ethernet",
+			buf:         make([]byte, 8),
+			expectedBuf: []byte{1, 1, 1, 2, 3, 4, 5, 6},
+			addr:        "\x01\x02\x03\x04\x05\x06",
+		},
+		{
+			name:        "Padding",
+			buf:         []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+			expectedBuf: []byte{1, 2, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0},
+			addr:        "\x01\x02\x03\x04\x05\x06\x07\x08",
+		},
+		{
+			name:        "Empty",
+			buf:         nil,
+			expectedBuf: nil,
+			addr:        "",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			options := NDPOptions(tc.buf)
+			serializer := NDPOptionsSerializer{
+				NDPSourceLinkLayerAddressOption(tc.addr),
+			}
+			if got, want := int(serializer.Length()), len(tc.expectedBuf); got != want {
+				t.Fatalf("got Length = %d, want = %d", got, want)
+			}
+			options.Serialize(serializer)
+			if !bytes.Equal(tc.buf, tc.expectedBuf) {
+				t.Fatalf("got b = %d, want = %d", tc.buf, tc.expectedBuf)
+			}
+
+			iter, err := options.Iter(true)
+			if err != nil {
+				t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+			}
+
+			if len(tc.expectedBuf) > 0 {
+				opt, done, err := iter.Next()
+				if err != nil {
+					t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+				}
+				if done {
+					t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+				}
+				if got := opt.Type(); got != NDPSourceLinkLayerAddressOptionType {
+					t.Fatalf("got Type = %d, want = %d", got, NDPSourceLinkLayerAddressOptionType)
+				}
+				sll := opt.(NDPSourceLinkLayerAddressOption)
+				if got, want := []byte(sll), tc.expectedBuf[2:]; !bytes.Equal(got, want) {
+					t.Fatalf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+				}
+
+				if got, want := sll.EthernetAddress(), tcpip.LinkAddress(tc.expectedBuf[2:][:EthernetAddressSize]); got != want {
+					t.Errorf("got sll.EthernetAddress = %s, want = %s", got, want)
+				}
+			}
+
+			// Once the single option is consumed the iterator must be done.
+			last, done, err := iter.Next()
+			if err != nil {
+				t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if !done {
+				t.Error("got Next = (_, false, _), want = (_, true, _)")
+			}
+			if last != nil {
+				t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", last)
+			}
+		})
+	}
+}
+
+// TestNDPTargetLinkLayerAddressOptionEthernetAddress verifies the Ethernet
+// address extracted from an NDPTargetLinkLayerAddressOption body.
+func TestNDPTargetLinkLayerAddressOptionEthernetAddress(t *testing.T) {
+	cases := []struct {
+		name     string
+		buf      []byte
+		expected tcpip.LinkAddress
+	}{
+		{
+			name:     "ValidMAC",
+			buf:      []byte{1, 2, 3, 4, 5, 6},
+			expected: tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+		{
+			name:     "TLLBodyTooShort",
+			buf:      []byte{1, 2, 3, 4, 5},
+			expected: tcpip.LinkAddress([]byte(nil)),
+		},
+		{
+			name:     "TLLBodyLargerThanNeeded",
+			buf:      []byte{1, 2, 3, 4, 5, 6, 7, 8},
+			expected: tcpip.LinkAddress("\x01\x02\x03\x04\x05\x06"),
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			tll := NDPTargetLinkLayerAddressOption(tc.buf)
+			if got := tll.EthernetAddress(); got != tc.expected {
+				t.Errorf("got tll.EthernetAddress = %s, want = %s", got, tc.expected)
+			}
+		})
+	}
+}
+
+// TestNDPTargetLinkLayerAddressOptionSerialize verifies the serialized form of
+// an NDPTargetLinkLayerAddressOption and that it can be iterated back.
+func TestNDPTargetLinkLayerAddressOptionSerialize(t *testing.T) {
+	cases := []struct {
+		name        string
+		buf         []byte
+		expectedBuf []byte
+		addr        tcpip.LinkAddress
+	}{
+		{
+			name:        "Ethernet",
+			buf:         make([]byte, 8),
+			expectedBuf: []byte{2, 1, 1, 2, 3, 4, 5, 6},
+			addr:        "\x01\x02\x03\x04\x05\x06",
+		},
+		{
+			name:        "Padding",
+			buf:         []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+			expectedBuf: []byte{2, 2, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0},
+			addr:        "\x01\x02\x03\x04\x05\x06\x07\x08",
+		},
+		{
+			name:        "Empty",
+			buf:         nil,
+			expectedBuf: nil,
+			addr:        "",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			options := NDPOptions(tc.buf)
+			serializer := NDPOptionsSerializer{
+				NDPTargetLinkLayerAddressOption(tc.addr),
+			}
+			if got, want := int(serializer.Length()), len(tc.expectedBuf); got != want {
+				t.Fatalf("got Length = %d, want = %d", got, want)
+			}
+			options.Serialize(serializer)
+			if !bytes.Equal(tc.buf, tc.expectedBuf) {
+				t.Fatalf("got b = %d, want = %d", tc.buf, tc.expectedBuf)
+			}
+
+			iter, err := options.Iter(true)
+			if err != nil {
+				t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+			}
+
+			if len(tc.expectedBuf) > 0 {
+				opt, done, err := iter.Next()
+				if err != nil {
+					t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+				}
+				if done {
+					t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+				}
+				if got := opt.Type(); got != NDPTargetLinkLayerAddressOptionType {
+					t.Fatalf("got Type = %d, want = %d", got, NDPTargetLinkLayerAddressOptionType)
+				}
+				tll := opt.(NDPTargetLinkLayerAddressOption)
+				if got, want := []byte(tll), tc.expectedBuf[2:]; !bytes.Equal(got, want) {
+					t.Fatalf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+				}
+
+				if got, want := tll.EthernetAddress(), tcpip.LinkAddress(tc.expectedBuf[2:][:EthernetAddressSize]); got != want {
+					t.Errorf("got tll.EthernetAddress = %s, want = %s", got, want)
+				}
+			}
+
+			// Once the single option is consumed the iterator must be done.
+			last, done, err := iter.Next()
+			if err != nil {
+				t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if !done {
+				t.Error("got Next = (_, false, _), want = (_, true, _)")
+			}
+			if last != nil {
+				t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", last)
+			}
+		})
+	}
+}
+
+// TestNDPPrefixInformationOption checks the serialized form of an
+// NDPPrefixInformation and the getters of the option read back from it.
+func TestNDPPrefixInformationOption(t *testing.T) {
+	body := []byte{
+		43, 127,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		5, 5, 5, 5,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+		17, 18, 19, 20,
+		21, 22, 23, 24,
+	}
+
+	serialized := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	options := NDPOptions(serialized)
+	serializer := NDPOptionsSerializer{
+		NDPPrefixInformation(body),
+	}
+	options.Serialize(serializer)
+	expected := []byte{
+		3, 4, 43, 64,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		0, 0, 0, 0,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+		17, 18, 19, 20,
+		21, 22, 23, 24,
+	}
+	if !bytes.Equal(serialized, expected) {
+		t.Fatalf("got targetBuf = %x, want = %x", serialized, expected)
+	}
+
+	iter, err := options.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	opt, done, err := iter.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := opt.Type(); got != NDPPrefixInformationType {
+		t.Errorf("got Type = %d, want = %d", got, NDPPrefixInformationType)
+	}
+
+	pi := opt.(NDPPrefixInformation)
+
+	if got := pi.Type(); got != 3 {
+		t.Errorf("got Type = %d, want = 3", got)
+	}
+
+	if got := pi.Length(); got != 30 {
+		t.Errorf("got Length = %d, want = 30", got)
+	}
+
+	if got := pi.PrefixLength(); got != 43 {
+		t.Errorf("got PrefixLength = %d, want = 43", got)
+	}
+
+	if pi.OnLinkFlag() {
+		t.Error("got OnLinkFlag = true, want = false")
+	}
+
+	if !pi.AutonomousAddressConfigurationFlag() {
+		t.Error("got AutonomousAddressConfigurationFlag = false, want = true")
+	}
+
+	if got, want := pi.ValidLifetime(), 16909060*time.Second; got != want {
+		t.Errorf("got ValidLifetime = %d, want = %d", got, want)
+	}
+
+	if got, want := pi.PreferredLifetime(), 84281096*time.Second; got != want {
+		t.Errorf("got PreferredLifetime = %d, want = %d", got, want)
+	}
+
+	if got, want := pi.Prefix(), tcpip.Address("\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18"); got != want {
+		t.Errorf("got Prefix = %s, want = %s", got, want)
+	}
+
+	// Once the single option is consumed the iterator must be done.
+	opt, done, err = iter.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if opt != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", opt)
+	}
+}
+
+// TestNDPRecursiveDNSServerOptionSerialize tests serializing an
+// NDPRecursiveDNSServer and reading the serialized option back.
+func TestNDPRecursiveDNSServerOptionSerialize(t *testing.T) {
+	b := []byte{
+		9, 8,
+		1, 2, 4, 8,
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	expected := []byte{
+		25, 3, 0, 0,
+		1, 2, 4, 8,
+		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	}
+	opts := NDPOptions(targetBuf)
+	serializer := NDPOptionsSerializer{
+		NDPRecursiveDNSServer(b),
+	}
+	if got, want := opts.Serialize(serializer), len(expected); got != want {
+		t.Errorf("got Serialize = %d, want = %d", got, want)
+	}
+	if !bytes.Equal(targetBuf, expected) {
+		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected)
+	}
+
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+	}
+
+	opt, ok := next.(NDPRecursiveDNSServer)
+	if !ok {
+		t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+	}
+	if got := opt.Type(); got != 25 {
+		t.Errorf("got Type = %d, want = 25", got)
+	}
+	if got := opt.Length(); got != 22 {
+		t.Errorf("got Length = %d, want = 22", got)
+	}
+	if got, want := opt.Lifetime(), 16909320*time.Second; got != want {
+		t.Errorf("got Lifetime = %s, want = %s", got, want)
+	}
+	want := []tcpip.Address{"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"}
+	addrs, err := opt.Addresses()
+	if err != nil {
+		t.Errorf("opt.Addresses() = %s", err)
+	}
+	if diff := cmp.Diff(want, addrs); diff != "" {
+		t.Errorf("mismatched addresses (-want +got):\n%s", diff)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
+
+// TestNDPRecursiveDNSServerOption tests the getters of NDPRecursiveDNSServer
+// for options carrying one, two and three addresses.
+func TestNDPRecursiveDNSServerOption(t *testing.T) {
+	tests := []struct {
+		name     string
+		buf      []byte
+		lifetime time.Duration
+		addrs    []tcpip.Address
+	}{
+		{
+			"Valid1Addr",
+			[]byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+			},
+			0,
+			[]tcpip.Address{"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"},
+		},
+		{
+			"Valid2Addr",
+			[]byte{
+				25, 5, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+			},
+		},
+		{
+			"Valid3Addr",
+			[]byte{
+				25, 7, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16,
+				17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17,
+			},
+			0,
+			[]tcpip.Address{
+				"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x10",
+				"\x11\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x11",
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := NDPOptions(test.buf)
+			it, err := opts.Iter(true)
+			if err != nil {
+				t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+			}
+
+			// Iterator should get our option.
+			next, done, err := it.Next()
+			if err != nil {
+				t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if done {
+				t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+			}
+			if got := next.Type(); got != NDPRecursiveDNSServerOptionType {
+				t.Fatalf("got Type = %d, want = %d", got, NDPRecursiveDNSServerOptionType)
+			}
+
+			opt, ok := next.(NDPRecursiveDNSServer)
+			if !ok {
+				t.Fatalf("next (type = %T) cannot be casted to an NDPRecursiveDNSServer", next)
+			}
+			if got := opt.Lifetime(); got != test.lifetime {
+				t.Errorf("got Lifetime = %s, want = %s", got, test.lifetime)
+			}
+			addrs, err := opt.Addresses()
+			if err != nil {
+				t.Errorf("opt.Addresses() = %s", err)
+			}
+			if diff := cmp.Diff(test.addrs, addrs); diff != "" {
+				t.Errorf("mismatched addresses (-want +got):\n%s", diff)
+			}
+
+			// Iterator should not return anything else.
+			next, done, err = it.Next()
+			if err != nil {
+				t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+			}
+			if !done {
+				t.Error("got Next = (_, false, _), want = (_, true, _)")
+			}
+			if next != nil {
+				t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+			}
+		})
+	}
+}
+
+// TestNDPDNSSearchListOption tests the getters of NDPDNSSearchList.
+func TestNDPDNSSearchListOption(t *testing.T) {
+	tests := []struct {
+		name        string
+		buf         []byte
+		lifetime    time.Duration
+		domainNames []string
+		err         error
+	}{
+		{
+			name: "Valid1Label",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime: time.Second,
+			domainNames: []string{
+				"abc",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid2Label",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 5,
+				3, 'a', 'b', 'c',
+				4, 'a', 'b', 'c', 'd',
+				0,
+				0, 0, 0, 0, 0, 0,
+			},
+			lifetime: 5 * time.Second,
+			domainNames: []string{
+				"abc.abcd",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid3Label",
+			buf: []byte{
+				0, 0,
+				1, 0, 0, 0,
+				3, 'a', 'b', 'c',
+				4, 'a', 'b', 'c', 'd',
+				1, 'e',
+				0,
+				0, 0, 0, 0,
+			},
+			lifetime: 16777216 * time.Second,
+			domainNames: []string{
+				"abc.abcd.e",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid2Domains",
+			buf: []byte{
+				0, 0,
+				1, 2, 3, 4,
+				3, 'a', 'b', 'c',
+				0,
+				2, 'd', 'e',
+				3, 'x', 'y', 'z',
+				0,
+				0, 0, 0,
+			},
+			lifetime: 16909060 * time.Second,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid3DomainsMixedCase",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 'B', 'c',
+				0,
+				2, 'd', 'E',
+				3, 'X', 'y', 'z',
+				0,
+				1, 'J',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+				"j",
+			},
+			err: nil,
+		},
+		{
+			name: "ValidDomainAfterNULL",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 'B', 'c',
+				0, 0, 0, 0,
+				2, 'd', 'E',
+				3, 'X', 'y', 'z',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abc",
+				"de.xyz",
+			},
+			err: nil,
+		},
+		{
+			name: "Valid0Domains",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				0,
+				0, 0, 0, 0, 0, 0, 0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         nil,
+		},
+		{
+			name: "NoTrailingNull",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				7, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         io.ErrUnexpectedEOF,
+		},
+		{
+			name: "IncorrectLength",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				8, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         io.ErrUnexpectedEOF,
+		},
+		{
+			name: "IncorrectLengthWithNULL",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				7, 'a', 'b', 'c', 'd', 'e', 'f',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "LabelOfLength63",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk",
+			},
+			err: nil,
+		},
+		{
+			name: "LabelOfLength64",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				64, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DomainNameOfLength255",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				62, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j',
+				0,
+			},
+			lifetime: 0,
+			domainNames: []string{
+				"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijk.abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghij",
+			},
+			err: nil,
+		},
+		{
+			name: "DomainNameOfLength256",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+			},
+			lifetime:    0,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "StartingDigitForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, '9', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "StartingHyphenForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, '-', 'b', 'c',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "EndingHyphenForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', '-',
+				0,
+				0, 0, 0,
+			},
+			lifetime:    time.Second,
+			domainNames: nil,
+			err:         ErrNDPOptMalformedBody,
+		},
+		{
+			name: "EndingDigitForLabel",
+			buf: []byte{
+				0, 0,
+				0, 0, 0, 1,
+				3, 'a', 'b', '9',
+				0,
+				0, 0, 0,
+			},
+			lifetime: time.Second,
+			domainNames: []string{
+				"ab9",
+			},
+			err: nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opt := NDPDNSSearchList(test.buf)
+
+			if got := opt.Lifetime(); got != test.lifetime {
+				t.Errorf("got Lifetime = %d, want = %d", got, test.lifetime)
+			}
+			domainNames, err := opt.DomainNames()
+			if !errors.Is(err, test.err) {
+				t.Errorf("opt.DomainNames() = %s", err)
+			}
+			if diff := cmp.Diff(domainNames, test.domainNames); diff != "" {
+				t.Errorf("mismatched domain names (-want +got):\n%s", diff)
+			}
+		})
+	}
+}
+
+// TestNDPSearchListOptionDomainNameLabelInvalidSymbols tries every byte value
+// as a label character; only letters, digits and hyphens may parse cleanly.
+func TestNDPSearchListOptionDomainNameLabelInvalidSymbols(t *testing.T) {
+	// As per RFC 1035 section 2.3.1, the label must only include ASCII
+	// letters, digits and hyphens (a-z, A-Z, 0-9, -).
+	validLabelChar := regexp.MustCompile(`[a-zA-Z0-9-]`)
+	for r := rune(0); r <= 255; r++ {
+		t.Run(fmt.Sprintf("RuneVal=%d", r), func(t *testing.T) {
+			buf := []byte{
+				0, 0,
+				0, 0, 0, 0,
+				3, 'a', 0 /* will be replaced */, 'c',
+				0,
+				0, 0, 0,
+			}
+			buf[8] = uint8(r)
+			opt := NDPDNSSearchList(buf)
+			var expectedErr error
+			if !validLabelChar.Match([]byte{byte(r)}) {
+				expectedErr = ErrNDPOptMalformedBody
+			}
+			if domainNames, err := opt.DomainNames(); !errors.Is(err, expectedErr) {
+				t.Errorf("got opt.DomainNames() = (%s, %v), want = (_, %v)", domainNames, err, expectedErr)
+			}
+		})
+	}
+}
+
+// TestNDPDNSSearchListOptionSerialize round-trips a DNS Search List option.
+func TestNDPDNSSearchListOptionSerialize(t *testing.T) {
+	b := []byte{
+		9, 8,
+		1, 0, 0, 0,
+		3, 'a', 'b', 'c',
+		4, 'a', 'b', 'c', 'd',
+		1, 'e',
+		0,
+	}
+	targetBuf := []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	expected := []byte{
+		31, 3, 0, 0,
+		1, 0, 0, 0,
+		3, 'a', 'b', 'c',
+		4, 'a', 'b', 'c', 'd',
+		1, 'e',
+		0,
+		0, 0, 0, 0,
+	}
+	opts := NDPOptions(targetBuf)
+	serializer := NDPOptionsSerializer{
+		NDPDNSSearchList(b),
+	}
+	if got, want := opts.Serialize(serializer), len(expected); got != want {
+		t.Errorf("got Serialize = %d, want = %d", got, want)
+	}
+	if !bytes.Equal(targetBuf, expected) {
+		t.Fatalf("got targetBuf = %x, want = %x", targetBuf, expected)
+	}
+	// Read the option back out of the buffer that was just serialized into.
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got := next.Type(); got != NDPDNSSearchListOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPDNSSearchListOptionType)
+	}
+	opt, ok := next.(NDPDNSSearchList)
+	if !ok {
+		t.Fatalf("next (type = %T) cannot be casted to an NDPDNSSearchList", next)
+	}
+	if got := opt.Type(); got != 31 {
+		t.Errorf("got Type = %d, want = 31", got)
+	}
+	if got := opt.Length(); got != 22 {
+		t.Errorf("got Length = %d, want = 22", got)
+	}
+	if got, want := opt.Lifetime(), 16777216*time.Second; got != want {
+		t.Errorf("got Lifetime = %s, want = %s", got, want)
+	}
+	domainNames, err := opt.DomainNames()
+	if err != nil {
+		t.Errorf("opt.DomainNames() = %s", err)
+	}
+	if diff := cmp.Diff([]string{"abc.abcd.e"}, domainNames); diff != "" {
+		t.Errorf("domain names mismatch (-want +got):\n%s", diff)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
+
+// TestNDPOptionsIterCheck tests that Iter(true) returns an error if the
+// NDPOptions buffer is malformed, while Iter(false) skips that validation.
+func TestNDPOptionsIterCheck(t *testing.T) {
+	tests := []struct {
+		name        string
+		buf         []byte
+		expectedErr error
+	}{
+		{
+			name:        "ZeroLengthField",
+			buf:         []byte{0, 0, 0, 0, 0, 0, 0, 0},
+			expectedErr: ErrNDPOptMalformedHeader,
+		},
+		{
+			name:        "ValidSourceLinkLayerAddressOption",
+			buf:         []byte{1, 1, 1, 2, 3, 4, 5, 6},
+			expectedErr: nil,
+		},
+		{
+			name:        "TooSmallSourceLinkLayerAddressOption",
+			buf:         []byte{1, 1, 1, 2, 3, 4, 5},
+			expectedErr: io.ErrUnexpectedEOF,
+		},
+		{
+			name:        "ValidTargetLinkLayerAddressOption",
+			buf:         []byte{2, 1, 1, 2, 3, 4, 5, 6},
+			expectedErr: nil,
+		},
+		{
+			name:        "TooSmallTargetLinkLayerAddressOption",
+			buf:         []byte{2, 1, 1, 2, 3, 4, 5},
+			expectedErr: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "ValidPrefixInformation",
+			buf: []byte{
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23, 24,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "TooSmallPrefixInformation",
+			buf: []byte{
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23,
+			},
+			expectedErr: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "InvalidPrefixInformationLength",
+			buf: []byte{
+				3, 3, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+			},
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "ValidSourceAndTargetLinkLayerAddressWithPrefixInformation",
+			buf: []byte{
+				// Source Link-Layer Address.
+				1, 1, 1, 2, 3, 4, 5, 6,
+
+				// Target Link-Layer Address.
+				2, 1, 7, 8, 9, 10, 11, 12,
+
+				// Prefix information.
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23, 24,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "ValidSourceAndTargetLinkLayerAddressWithPrefixInformationWithUnrecognized",
+			buf: []byte{
+				// Source Link-Layer Address.
+				1, 1, 1, 2, 3, 4, 5, 6,
+
+				// Target Link-Layer Address.
+				2, 1, 7, 8, 9, 10, 11, 12,
+
+				// 255 is an unrecognized type. If 255 ends up
+				// being the type for some recognized type,
+				// update 255 to some other unrecognized value.
+				255, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8,
+
+				// Prefix information.
+				3, 4, 43, 64,
+				1, 2, 3, 4,
+				5, 6, 7, 8,
+				0, 0, 0, 0,
+				9, 10, 11, 12,
+				13, 14, 15, 16,
+				17, 18, 19, 20,
+				21, 22, 23, 24,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "InvalidRecursiveDNSServerCutsOffAddress",
+			buf: []byte{
+				25, 4, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+				0, 1, 2, 3, 4, 5, 6, 7,
+			},
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "InvalidRecursiveDNSServerInvalidLengthField",
+			buf: []byte{
+				25, 2, 0, 0,
+				0, 0, 0, 0,
+				0, 1, 2, 3, 4, 5, 6, 7, 8,
+			},
+			expectedErr: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "RecursiveDNSServerTooSmall",
+			buf: []byte{
+				25, 1, 0, 0,
+				0, 0, 0,
+			},
+			expectedErr: io.ErrUnexpectedEOF,
+		},
+		{
+			name: "RecursiveDNSServerMulticast",
+			buf: []byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+			},
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "RecursiveDNSServerUnspecified",
+			buf: []byte{
+				25, 3, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DNSSearchListLargeCompliantRFC1035",
+			buf: []byte{
+				31, 33, 0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				62, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j',
+				0,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "DNSSearchListNonCompliantRFC1035",
+			buf: []byte{
+				31, 33, 0, 0,
+				0, 0, 0, 0,
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				63, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
+				'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
+				'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
+				'i', 'j', 'k',
+				0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			expectedErr: ErrNDPOptMalformedBody,
+		},
+		{
+			name: "DNSSearchListValidSmall",
+			buf: []byte{
+				31, 2, 0, 0,
+				0, 0, 0, 0,
+				6, 'a', 'b', 'c', 'd', 'e', 'f',
+				0,
+			},
+			expectedErr: nil,
+		},
+		{
+			name: "DNSSearchListTooSmall",
+			buf: []byte{
+				31, 1, 0, 0,
+				0, 0, 0,
+			},
+			expectedErr: io.ErrUnexpectedEOF,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			opts := NDPOptions(test.buf)
+
+			if _, err := opts.Iter(true); !errors.Is(err, test.expectedErr) {
+				t.Fatalf("got Iter(true) = (_, %v), want = (_, %v)", err, test.expectedErr)
+			}
+
+			// test.buf may be malformed but we chose not to check
+			// the options, so Iter must not return an error.
+			if _, err := opts.Iter(false); err != nil {
+				t.Fatalf("got Iter(false) = (_, %s), want = (_, nil)", err)
+			}
+		})
+	}
+}
+
+// TestNDPOptionsIter tests that we can iterate over a valid NDPOptions. Note,
+// this test does not actually check any of the option's getters, it simply
+// checks the option Type and Body. We have other tests that test the option
+// field getters given an option body and don't need to duplicate those tests
+// here.
+func TestNDPOptionsIter(t *testing.T) {
+	buf := []byte{
+		// Source Link-Layer Address.
+		1, 1, 1, 2, 3, 4, 5, 6,
+
+		// Target Link-Layer Address.
+		2, 1, 7, 8, 9, 10, 11, 12,
+
+		// 255 is an unrecognized type. If 255 ends up being the type
+		// for some recognized type, update 255 to some other
+		// unrecognized value. Note, this option should be skipped when
+		// iterating.
+		255, 2, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8,
+
+		// Prefix information.
+		3, 4, 43, 64,
+		1, 2, 3, 4,
+		5, 6, 7, 8,
+		0, 0, 0, 0,
+		9, 10, 11, 12,
+		13, 14, 15, 16,
+		17, 18, 19, 20,
+		21, 22, 23, 24,
+	}
+
+	opts := NDPOptions(buf)
+	it, err := opts.Iter(true)
+	if err != nil {
+		t.Fatalf("got Iter = (_, %s), want = (_, nil)", err)
+	}
+
+	// Test the first (Source Link-Layer) option.
+	next, done, err := it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got, want := []byte(next.(NDPSourceLinkLayerAddressOption)), buf[2:][:6]; !bytes.Equal(got, want) {
+		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+	}
+	if got := next.Type(); got != NDPSourceLinkLayerAddressOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPSourceLinkLayerAddressOptionType)
+	}
+
+	// Test the next (Target Link-Layer) option.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got, want := []byte(next.(NDPTargetLinkLayerAddressOption)), buf[10:][:6]; !bytes.Equal(got, want) {
+		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+	}
+	if got := next.Type(); got != NDPTargetLinkLayerAddressOptionType {
+		t.Errorf("got Type = %d, want = %d", got, NDPTargetLinkLayerAddressOptionType)
+	}
+
+	// Test the next (Prefix Information) option.
+	// Note, the unrecognized option should be skipped.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Fatalf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if done {
+		t.Fatal("got Next = (_, true, _), want = (_, false, _)")
+	}
+	if got, want := next.(NDPPrefixInformation), buf[34:][:30]; !bytes.Equal(got, want) {
+		t.Errorf("got Next = (%x, _, _), want = (%x, _, _)", got, want)
+	}
+	if got := next.Type(); got != NDPPrefixInformationType {
+		t.Errorf("got Type = %d, want = %d", got, NDPPrefixInformationType)
+	}
+
+	// Iterator should not return anything else.
+	next, done, err = it.Next()
+	if err != nil {
+		t.Errorf("got Next = (_, _, %s), want = (_, _, nil)", err)
+	}
+	if !done {
+		t.Error("got Next = (_, false, _), want = (_, true, _)")
+	}
+	if next != nil {
+		t.Errorf("got Next = (%x, _, _), want = (nil, _, _)", next)
+	}
+}
diff --git a/pkg/tcpip/header/ndpoptionidentifier_string.go b/pkg/tcpip/header/ndpoptionidentifier_string.go
new file mode 100644
index 000000000..6fe9a336b
--- /dev/null
+++ b/pkg/tcpip/header/ndpoptionidentifier_string.go
@@ -0,0 +1,50 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type NDPOptionIdentifier ."; DO NOT EDIT.
+
+package header
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again. NOTE(review): NDPDNSSearchListOptionType (31, see the ndp tests) is not listed; this output looks stale — confirm and regenerate.
+	var x [1]struct{}
+	_ = x[NDPSourceLinkLayerAddressOptionType-1]
+	_ = x[NDPTargetLinkLayerAddressOptionType-2]
+	_ = x[NDPPrefixInformationType-3]
+	_ = x[NDPRecursiveDNSServerOptionType-25]
+}
+
+const (
+	_NDPOptionIdentifier_name_0 = "NDPSourceLinkLayerAddressOptionTypeNDPTargetLinkLayerAddressOptionTypeNDPPrefixInformationType"
+	_NDPOptionIdentifier_name_1 = "NDPRecursiveDNSServerOptionType"
+)
+
+var (
+	_NDPOptionIdentifier_index_0 = [...]uint8{0, 35, 70, 94}
+)
+
+func (i NDPOptionIdentifier) String() string {
+	switch {
+	case 1 <= i && i <= 3:
+		i -= 1
+		return _NDPOptionIdentifier_name_0[_NDPOptionIdentifier_index_0[i]:_NDPOptionIdentifier_index_0[i+1]]
+	case i == 25:
+		return _NDPOptionIdentifier_name_1
+	default:
+		return "NDPOptionIdentifier(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
diff --git a/pkg/tcpip/header/tcp.go b/pkg/tcpip/header/tcp.go
new file mode 100644
index 000000000..4c6f808e5
--- /dev/null
+++ b/pkg/tcpip/header/tcp.go
@@ -0,0 +1,621 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+
+	"github.com/google/btree"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+// These constants are the byte offsets of the respective TCP header fields.
+const (
+	TCPSrcPortOffset   = 0
+	TCPDstPortOffset   = 2
+	TCPSeqNumOffset    = 4
+	TCPAckNumOffset    = 8
+	TCPDataOffset      = 12
+	TCPFlagsOffset     = 13
+	TCPWinSizeOffset   = 14
+	TCPChecksumOffset  = 16
+	TCPUrgentPtrOffset = 18
+)
+
+const (
+	// MaxWndScale is the maximum allowed window scaling, as described in
+	// RFC 1323, section 2.3, page 11.
+	MaxWndScale = 14
+
+	// TCPMaxSACKBlocks is the maximum number of SACK blocks that can
+	// be encoded in a TCP option field.
+	TCPMaxSACKBlocks = 4
+)
+
+// Flags that may be set in a TCP segment, as bit masks of the flags byte.
+const (
+	TCPFlagFin = 1 << iota
+	TCPFlagSyn
+	TCPFlagRst
+	TCPFlagPsh
+	TCPFlagAck
+	TCPFlagUrg
+)
+
+// Option kinds that may be present in a TCP segment, in ascending order.
+const (
+	TCPOptionEOL           = 0
+	TCPOptionNOP           = 1
+	TCPOptionMSS           = 2
+	TCPOptionWS            = 3
+	TCPOptionSACKPermitted = 4
+	TCPOptionSACK          = 5
+	TCPOptionTS            = 8
+)
+
+// Total lengths in bytes (kind and length octets included) of fixed-size options.
+const (
+	TCPOptionMSSLength           = 4
+	TCPOptionTSLength            = 10
+	TCPOptionWSLength            = 3
+	TCPOptionSackPermittedLength = 2
+)
+
+// TCPFields contains the fields of a TCP packet in decoded form. It describes
+// the header that TCP.Encode writes into a packet.
+type TCPFields struct {
+	// SrcPort is the "source port" field of a TCP packet.
+	SrcPort uint16
+
+	// DstPort is the "destination port" field of a TCP packet.
+	DstPort uint16
+
+	// SeqNum is the "sequence number" field of a TCP packet.
+	SeqNum uint32
+
+	// AckNum is the "acknowledgement number" field of a TCP packet.
+	AckNum uint32
+
+	// DataOffset is the "data offset" field of a TCP packet. It is the length of
+	// the TCP header in bytes.
+	DataOffset uint8
+
+	// Flags is the "flags" field of a TCP packet.
+	Flags uint8
+
+	// WindowSize is the "window size" field of a TCP packet.
+	WindowSize uint16
+
+	// Checksum is the "checksum" field of a TCP packet.
+	Checksum uint16
+
+	// UrgentPointer is the "urgent pointer" field of a TCP packet.
+	UrgentPointer uint16
+}
+
+// TCPSynOptions is used to return the parsed TCP Options in a syn
+// segment (see ParseSynOptions).
+type TCPSynOptions struct {
+	// MSS is the peer's maximum segment size, or TCPDefaultMSS if none was parsed.
+	MSS uint16
+
+	// WS is the window scale provided by the peer in the SYN (at most MaxWndScale).
+	//
+	// Set to -1 if no window scale option was provided.
+	WS int
+
+	// TS is true if the timestamp option was provided in the syn/syn-ack.
+	TS bool
+
+	// TSVal is the value of the TSVal field in the timestamp option.
+	TSVal uint32
+
+	// TSEcr is the TSEcr field of the timestamp option; only set for a SYN-ACK.
+	TSEcr uint32
+
+	// SACKPermitted is true if the SACK-permitted option was in the SYN/SYN-ACK.
+	SACKPermitted bool
+}
+
+// SACKBlock represents a single contiguous SACK block, [Start, End).
+//
+// +stateify savable
+type SACKBlock struct {
+	// Start indicates the lowest sequence number in the block.
+	Start seqnum.Value
+
+	// End indicates the sequence number immediately following the last
+	// sequence number of this block.
+	End seqnum.Value
+}
+
+// Less reports whether r.Start < b.Start. It panics if b is not a SACKBlock.
+func (r SACKBlock) Less(b btree.Item) bool {
+	return r.Start.LessThan(b.(SACKBlock).Start)
+}
+
+// Contains reports whether b lies entirely within r (boundaries may coincide).
+func (r SACKBlock) Contains(b SACKBlock) bool {
+	return r.Start.LessThanEq(b.Start) && b.End.LessThanEq(r.End)
+}
+
+// TCPOptions holds the parsed TCP segment options for a non syn/syn-ack
+// segment (see ParseTCPOptions).
+//
+// +stateify savable
+type TCPOptions struct {
+	// TS is true if a TimeStamp option was parsed from the segment.
+	TS bool
+
+	// TSVal is the value in the TSVal field of the segment.
+	TSVal uint32
+
+	// TSEcr is the value in the TSEcr field of the segment.
+	TSEcr uint32
+
+	// SACKBlocks are the SACK blocks specified in the segment.
+	SACKBlocks []SACKBlock
+}
+
+// TCP represents a TCP header stored in a byte slice.
+type TCP []byte
+
+const (
+	// TCPMinimumSize is the minimum size of a valid TCP packet (no options).
+	TCPMinimumSize = 20
+
+	// TCPOptionsMaximumSize is the maximum size of TCP options.
+	TCPOptionsMaximumSize = 40
+
+	// TCPHeaderMaximumSize is the maximum header size of a TCP packet.
+	TCPHeaderMaximumSize = TCPMinimumSize + TCPOptionsMaximumSize
+
+	// TCPProtocolNumber is TCP's transport protocol number.
+	TCPProtocolNumber tcpip.TransportProtocolNumber = 6
+
+	// TCPMinimumMSS is the minimum acceptable value for MSS. This is the
+	// same as the value TCP_MIN_MSS defined in net/tcp.h.
+	TCPMinimumMSS = IPv4MaximumHeaderSize + TCPHeaderMaximumSize + MinIPFragmentPayloadSize - IPv4MinimumSize - TCPMinimumSize
+
+	// TCPMaximumMSS is the maximum acceptable value for MSS.
+	TCPMaximumMSS = 0xffff
+
+	// TCPDefaultMSS is the MSS value that should be used if an MSS option
+	// is not received from the peer. It's also the value returned by
+	// TCP_MAXSEG option for a socket in an unconnected state.
+	//
+	// Per RFC 1122, page 85: "If an MSS option is not received at
+	// connection setup, TCP MUST assume a default send MSS of 536."
+	TCPDefaultMSS = 536
+)
+
+// SourcePort decodes the big-endian "source port" field of the TCP header.
+func (b TCP) SourcePort() uint16 {
+	return binary.BigEndian.Uint16(b[TCPSrcPortOffset:])
+}
+
+// DestinationPort decodes the big-endian "destination port" header field.
+func (b TCP) DestinationPort() uint16 {
+	return binary.BigEndian.Uint16(b[TCPDstPortOffset:])
+}
+
+// SequenceNumber decodes the big-endian "sequence number" header field.
+func (b TCP) SequenceNumber() uint32 {
+	return binary.BigEndian.Uint32(b[TCPSeqNumOffset:])
+}
+
+// AckNumber decodes the big-endian "ack number" field of the TCP header.
+func (b TCP) AckNumber() uint32 {
+	return binary.BigEndian.Uint32(b[TCPAckNumOffset:])
+}
+
+// DataOffset returns the length of the TCP header in bytes: the upper four
+// bits of the "data offset" byte (a count of 4-byte words) multiplied by 4.
+func (b TCP) DataOffset() uint8 {
+	return (b[TCPDataOffset] >> 4) * 4
+}
+
+// Payload returns the bytes of b that follow the header, b[b.DataOffset():].
+func (b TCP) Payload() []byte {
+	return b[b.DataOffset():]
+}
+
+// Flags returns the flags byte of the TCP header (see the TCPFlag* masks).
+func (b TCP) Flags() uint8 {
+	return b[TCPFlagsOffset]
+}
+
+// WindowSize decodes the big-endian "window size" field of the TCP header.
+func (b TCP) WindowSize() uint16 {
+	return binary.BigEndian.Uint16(b[TCPWinSizeOffset:])
+}
+
+// Checksum decodes the big-endian "checksum" field of the TCP header.
+func (b TCP) Checksum() uint16 {
+	return binary.BigEndian.Uint16(b[TCPChecksumOffset:])
+}
+
+// UrgentPointer decodes the big-endian "urgent pointer" header field.
+func (b TCP) UrgentPointer() uint16 {
+	return binary.BigEndian.Uint16(b[TCPUrgentPtrOffset:])
+}
+
+// SetSourcePort writes port to the "source port" field in big-endian order.
+func (b TCP) SetSourcePort(port uint16) {
+	binary.BigEndian.PutUint16(b[TCPSrcPortOffset:], port)
+}
+
+// SetDestinationPort writes port to the "destination port" field (big-endian).
+func (b TCP) SetDestinationPort(port uint16) {
+	binary.BigEndian.PutUint16(b[TCPDstPortOffset:], port)
+}
+
+// SetChecksum writes checksum to the checksum field in big-endian order.
+func (b TCP) SetChecksum(checksum uint16) {
+	binary.BigEndian.PutUint16(b[TCPChecksumOffset:], checksum)
+}
+
+// SetDataOffset stores headerLen, the TCP header length in bytes, as a count of
+// 4-byte words in the upper four bits; the low bits of that byte are zeroed.
+func (b TCP) SetDataOffset(headerLen uint8) {
+	b[TCPDataOffset] = (headerLen / 4) << 4
+}
+
+// SetSequenceNumber writes seqNum to the sequence number field (big-endian).
+func (b TCP) SetSequenceNumber(seqNum uint32) {
+	binary.BigEndian.PutUint32(b[TCPSeqNumOffset:], seqNum)
+}
+
+// SetAckNumber writes ackNum to the ack number field in big-endian order.
+func (b TCP) SetAckNumber(ackNum uint32) {
+	binary.BigEndian.PutUint32(b[TCPAckNumOffset:], ackNum)
+}
+
+// SetFlags overwrites the flags byte of the TCP header with flags.
+func (b TCP) SetFlags(flags uint8) {
+	b[TCPFlagsOffset] = flags
+}
+
+// SetWindowSize writes rcvwnd to the window size field in big-endian order.
+func (b TCP) SetWindowSize(rcvwnd uint16) {
+	binary.BigEndian.PutUint16(b[TCPWinSizeOffset:], rcvwnd)
+}
+
+// SetUrgentPoiner sets the urgent pointer field of the tcp header.
+func (b TCP) SetUrgentPoiner(urgentPointer uint16) {
+	binary.BigEndian.PutUint16(b[TCPUrgentPtrOffset:], urgentPointer)
+}
+
+// CalculateChecksum calculates the checksum of the tcp segment by folding the
+// header bytes (up to DataOffset) into partialChecksum, which should already
+// cover the network-layer pseudo-header and the segment data.
+func (b TCP) CalculateChecksum(partialChecksum uint16) uint16 {
+	// Calculate the rest of the checksum.
+	return Checksum(b[:b.DataOffset()], partialChecksum)
+}
+
+// Options returns the unparsed option bytes, b[TCPMinimumSize:b.DataOffset()].
+func (b TCP) Options() []byte {
+	return b[TCPMinimumSize:b.DataOffset()]
+}
+
+// ParsedOptions parses the options in the TCP segment and returns them as a
+// TCPOptions structure. NOTE: Nothing is cached here; the options are reparsed
+// on every invocation, so calling this repeatedly is expensive.
+func (b TCP) ParsedOptions() TCPOptions {
+	return ParseTCPOptions(b.Options())
+}
+
+func (b TCP) encodeSubset(seq, ack uint32, flags uint8, rcvwnd uint16) {
+	b.SetSequenceNumber(seq)
+	b.SetAckNumber(ack)
+	b.SetFlags(flags)
+	b.SetWindowSize(rcvwnd)
+}
+
+// Encode writes every field of t into the tcp header via the field setters.
+func (b TCP) Encode(t *TCPFields) {
+	b.encodeSubset(t.SeqNum, t.AckNum, t.Flags, t.WindowSize)
+	b.SetSourcePort(t.SrcPort)
+	b.SetDestinationPort(t.DstPort)
+	b.SetDataOffset(t.DataOffset)
+	b.SetChecksum(t.Checksum)
+	b.SetUrgentPoiner(t.UrgentPointer)
+}
+
+// EncodePartial updates the seq/ack numbers, flags and window size of the tcp
+// header and stores the resulting checksum; useful for runs of similar segments.
+func (b TCP) EncodePartial(partialChecksum, length uint16, seqnum, acknum uint32, flags byte, rcvwnd uint16) {
+	// Add the total length and "flags" field contributions to the checksum.
+	// We don't use the flags field directly from the header because it's a
+	// one-byte field with an odd offset, so it would be accounted for
+	// incorrectly by the Checksum routine.
+	tmp := make([]byte, 4)
+	binary.BigEndian.PutUint16(tmp, length)
+	binary.BigEndian.PutUint16(tmp[2:], uint16(flags))
+	checksum := Checksum(tmp, partialChecksum)
+
+	// Encode the passed-in fields.
+	b.encodeSubset(seqnum, acknum, flags, rcvwnd)
+
+	// Add the contributions of the passed-in fields to the checksum.
+	checksum = Checksum(b[TCPSeqNumOffset:TCPSeqNumOffset+8], checksum)
+	checksum = Checksum(b[TCPWinSizeOffset:TCPWinSizeOffset+2], checksum)
+
+	// Encode the checksum.
+	b.SetChecksum(^checksum)
+}
+
+// ParseSynOptions parses the options carried by a SYN or SYN-ACK segment (isAck
+// selects the latter). opts must hold only the option bytes of the TCP header.
+func ParseSynOptions(opts []byte, isAck bool) TCPSynOptions {
+	end := len(opts)
+
+	parsed := TCPSynOptions{
+		// Per RFC 1122, page 85: "If an MSS option is not received at
+		// connection setup, TCP MUST assume a default send MSS of 536."
+		MSS: TCPDefaultMSS,
+		// WS stays -1 unless a window scale option is parsed: the
+		// absence of the option indicates that we cannot use window
+		// scaling on the receive end either, which a scale of zero
+		// could not express.
+		WS: -1,
+	}
+
+	for off := 0; off < end; {
+		switch opts[off] {
+		case TCPOptionEOL:
+			off = end
+		case TCPOptionNOP:
+			off++
+		case TCPOptionMSS:
+			if off+TCPOptionMSSLength > end || opts[off+1] != TCPOptionMSSLength {
+				return parsed
+			}
+			mss := binary.BigEndian.Uint16(opts[off+2:])
+			if mss == 0 {
+				return parsed
+			}
+			parsed.MSS = mss
+			off += TCPOptionMSSLength
+
+		case TCPOptionWS:
+			if off+TCPOptionWSLength > end || opts[off+1] != TCPOptionWSLength {
+				return parsed
+			}
+			ws := int(opts[off+2])
+			if ws > MaxWndScale {
+				ws = MaxWndScale
+			}
+			parsed.WS = ws
+			off += TCPOptionWSLength
+
+		case TCPOptionTS:
+			if off+TCPOptionTSLength > end || opts[off+1] != TCPOptionTSLength {
+				return parsed
+			}
+			parsed.TSVal = binary.BigEndian.Uint32(opts[off+2:])
+			if isAck {
+				// The echo reply is only recorded for a SYN-ACK; for a
+				// plain SYN, TSEcr is left at its zero value.
+				parsed.TSEcr = binary.BigEndian.Uint32(opts[off+6:])
+			}
+			parsed.TS = true
+			off += TCPOptionTSLength
+		case TCPOptionSACKPermitted:
+			if off+TCPOptionSackPermittedLength > end || opts[off+1] != TCPOptionSackPermittedLength {
+				return parsed
+			}
+			parsed.SACKPermitted = true
+			off += TCPOptionSackPermittedLength
+
+		default:
+			// We don't recognize this option, just skip over it.
+			if off+2 > end {
+				return parsed
+			}
+			optLen := int(opts[off+1])
+			// If the length is incorrect or if optLen+off overflows the
+			// total options length then stop and return what we have.
+			if optLen < 2 || off+optLen > end {
+				return parsed
+			}
+			off += optLen
+		}
+	}
+
+	return parsed
+}
+
+// ParseTCPOptions parses the timestamp and SACK options found in b into a
+// TCPOptions structure. Parsing stops at the first malformed option.
+func ParseTCPOptions(b []byte) TCPOptions {
+	opts := TCPOptions{}
+	limit := len(b)
+	for i := 0; i < limit; {
+		switch b[i] {
+		case TCPOptionEOL:
+			i = limit
+		case TCPOptionNOP:
+			i++
+		case TCPOptionTS:
+			if i+10 > limit || (b[i+1] != 10) {
+				return opts
+			}
+			opts.TS = true
+			opts.TSVal = binary.BigEndian.Uint32(b[i+2:])
+			opts.TSEcr = binary.BigEndian.Uint32(b[i+6:])
+			i += 10
+		case TCPOptionSACK:
+			if i+2 > limit {
+				// No room for the length byte; stop parsing.
+				return opts
+			}
+			sackOptionLen := int(b[i+1])
+			if i+sackOptionLen > limit || (sackOptionLen-2)%8 != 0 {
+				// Length overruns b or is not 2 plus a multiple of 8; stop parsing.
+				return opts
+			}
+			numBlocks := (sackOptionLen - 2) / 8
+			opts.SACKBlocks = []SACKBlock{}
+			for j := 0; j < numBlocks; j++ {
+				start := binary.BigEndian.Uint32(b[i+2+j*8:])
+				end := binary.BigEndian.Uint32(b[i+2+j*8+4:])
+				opts.SACKBlocks = append(opts.SACKBlocks, SACKBlock{
+					Start: seqnum.Value(start),
+					End:   seqnum.Value(end),
+				})
+			}
+			i += sackOptionLen
+		default:
+			// Any other option kind is not interpreted, just skipped over.
+			if i+2 > limit {
+				return opts
+			}
+			l := int(b[i+1])
+			// If the length is invalid (< 2) or i+l runs past the end of
+			// the options then stop and return what was parsed so far.
+			if l < 2 || i+l > limit {
+				return opts
+			}
+			i += l
+		}
+	}
+	return opts
+}
+
+// EncodeMSSOption encodes the MSS TCP option with the provided MSS value in
+// the supplied buffer. Only the low 16 bits of mss are encoded. If the buffer
+// is not large enough it returns 0 without encoding anything; otherwise it
+// returns the number of bytes written to the provided buffer.
+func EncodeMSSOption(mss uint32, b []byte) int {
+	if len(b) < TCPOptionMSSLength {
+		return 0
+	}
+	b[0], b[1], b[2], b[3] = TCPOptionMSS, TCPOptionMSSLength, byte(mss>>8), byte(mss)
+	return TCPOptionMSSLength
+}
+
+// EncodeWSOption encodes the WS TCP option with the WS value in the
+// provided buffer; ws is truncated to a single byte. If the buffer is not
+// large enough it returns 0 without encoding anything; otherwise it returns
+// the number of bytes written to the provided buffer.
+func EncodeWSOption(ws int, b []byte) int {
+	if len(b) < TCPOptionWSLength {
+		return 0
+	}
+	b[0], b[1], b[2] = TCPOptionWS, TCPOptionWSLength, uint8(ws)
+	return int(b[1])
+}
+
+// EncodeTSOption encodes the provided tsVal and tsEcr values, in big-endian
+// order, as a TCP timestamp option into the provided buffer. If the buffer is
+// smaller than TCPOptionTSLength it returns 0 without encoding anything;
+// otherwise it returns the number of bytes written to the provided buffer.
+func EncodeTSOption(tsVal, tsEcr uint32, b []byte) int {
+	if len(b) < TCPOptionTSLength {
+		return 0
+	}
+	b[0], b[1] = TCPOptionTS, TCPOptionTSLength
+	binary.BigEndian.PutUint32(b[2:], tsVal)
+	binary.BigEndian.PutUint32(b[6:], tsEcr)
+	return int(b[1])
+}
+
+// EncodeSACKPermittedOption encodes a SACKPermitted option (kind and length
+// bytes only) into the provided buffer. If the buffer is smaller than
+// TCPOptionSackPermittedLength it returns 0 without encoding anything;
+// otherwise it returns the number of bytes written to the provided buffer.
+func EncodeSACKPermittedOption(b []byte) int {
+	if len(b) < TCPOptionSackPermittedLength {
+		return 0
+	}
+
+	b[0], b[1] = TCPOptionSACKPermitted, TCPOptionSackPermittedLength
+	return int(b[1])
+}
+
+// EncodeSACKBlocks encodes the provided SACK blocks as a single TCP SACK option
+// in the provided slice. It encodes the leading blocks only, limited both by
+// TCPMaxSACKBlocks and by how many 8-byte blocks fit in b after the 2-byte
+// option header. It returns the number of bytes written to the provided buffer.
+func EncodeSACKBlocks(sackBlocks []SACKBlock, b []byte) int {
+	if len(sackBlocks) == 0 {
+		return 0
+	}
+	l := len(sackBlocks)
+	if l > TCPMaxSACKBlocks {
+		l = TCPMaxSACKBlocks
+	}
+	if ll := (len(b) - 2) / 8; ll < l {
+		l = ll
+	}
+	if l == 0 {
+		// There is not enough space in the provided buffer to add
+		// even one SACK block, so nothing is written.
+		return 0
+	}
+	b[0] = TCPOptionSACK
+	b[1] = byte(l*8 + 2)
+	for i := 0; i < l; i++ {
+		binary.BigEndian.PutUint32(b[i*8+2:], uint32(sackBlocks[i].Start))
+		binary.BigEndian.PutUint32(b[i*8+6:], uint32(sackBlocks[i].End))
+	}
+	return int(b[1])
+}
+
+// EncodeNOP writes one NOP option at b[0] and returns 1, or returns 0 if b is empty.
+func EncodeNOP(b []byte) int {
+	if len(b) == 0 {
+		return 0
+	}
+	b[0] = TCPOptionNOP
+	return 1
+}
+
+// AddTCPOptionPadding adds the required number of TCPOptionNOP to align the
+// option buffer to a multiple of 4 bytes. It adds padding bytes starting at
+// the offset specified and returns the number of padding bytes added (0 to 3).
+// The passed in options slice must have space for the padding bytes.
+func AddTCPOptionPadding(options []byte, offset int) int {
+	paddingToAdd := -offset & 3
+	// Now write the padding bytes (if any) needed to bring offset up to the
+	// next multiple of 4.
+	for i := offset; i < offset+paddingToAdd; i++ {
+		options[i] = TCPOptionNOP
+	}
+	return paddingToAdd
+}
+
+// Acceptable checks if a segment that starts at segSeq and has length segLen is
+// "acceptable" for arriving in a receive window that starts at rcvNxt and ends
+// before rcvAcc, according to the tables on pages 26 and 69 of RFC 793.
+func Acceptable(segSeq seqnum.Value, segLen seqnum.Size, rcvNxt, rcvAcc seqnum.Value) bool {
+	if rcvNxt == rcvAcc {
+		return segLen == 0 && segSeq == rcvNxt
+	}
+	if segLen == 0 {
+		// The end of the range is extended by 1 (rcvAcc+1) because that is
+		// Linux's behavior despite the RFC.
+		return segSeq.InRange(rcvNxt, rcvAcc.Add(1))
+	}
+	// Page 70 of RFC 793 allows packets that can be made "acceptable" by trimming
+	// the payload, so we'll accept any payload that overlaps the receive window.
+	// segSeq < rcvAcc is more correct according to the RFC; however, Linux does
+	// it differently and uses segSeq <= rcvAcc, and we want to keep the same
+	// behavior as Linux.
+	return rcvNxt.LessThan(segSeq.Add(segLen)) && segSeq.LessThanEq(rcvAcc)
+}
diff --git a/pkg/tcpip/header/tcp_test.go b/pkg/tcpip/header/tcp_test.go
new file mode 100644
index 000000000..72563837b
--- /dev/null
+++ b/pkg/tcpip/header/tcp_test.go
@@ -0,0 +1,148 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header_test
+
+import (
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+func TestEncodeSACKBlocks(t *testing.T) {
+	testCases := []struct {
+		sackBlocks []header.SACKBlock
+		want       []header.SACKBlock
+		bufSize    int
+	}{
+		{
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}, {52, 60}, {62, 70}},
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}},
+			40, // (40-2)/8 = 4 blocks fit.
+		},
+		{
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}, {52, 60}, {62, 70}},
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}},
+			30, // (30-2)/8 = 3 blocks fit.
+		},
+		{
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}, {52, 60}, {62, 70}},
+			[]header.SACKBlock{{10, 20}, {22, 30}},
+			20, // (20-2)/8 = 2 blocks fit.
+		},
+		{
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}, {52, 60}, {62, 70}},
+			[]header.SACKBlock{{10, 20}},
+			10, // Exactly one block fits.
+		},
+		{
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}, {52, 60}, {62, 70}},
+			nil,
+			8, // Too small for even one block, so nothing is encoded.
+		},
+		{
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}, {52, 60}, {62, 70}},
+			[]header.SACKBlock{{10, 20}, {22, 30}, {32, 40}, {42, 50}},
+			60, // Space for more, but only 4 are expected (presumably TCPMaxSACKBlocks).
+		},
+	}
+	for _, tc := range testCases {
+		b := make([]byte, tc.bufSize)
+		t.Logf("testing: %v", tc)
+		header.EncodeSACKBlocks(tc.sackBlocks, b)
+		opts := header.ParseTCPOptions(b) // Round-trip through the parser to see what was encoded.
+		if got, want := opts.SACKBlocks, tc.want; !reflect.DeepEqual(got, want) {
+			t.Errorf("header.EncodeSACKBlocks(%v, %v), encoded blocks got: %v, want: %v", tc.sackBlocks, b, got, want)
+		}
+	}
+}
+
+func TestTCPParseOptions(t *testing.T) {
+	type tsOption struct {
+		tsVal uint32
+		tsEcr uint32
+	}
+
+	generateOptions := func(tsOpt *tsOption, sackBlocks []header.SACKBlock) []byte {
+		l := 0
+		if tsOpt != nil {
+			l += 10
+		}
+		if len(sackBlocks) != 0 {
+			l += len(sackBlocks)*8 + 2
+		}
+		b := make([]byte, l)
+		offset := 0
+		if tsOpt != nil {
+			offset = header.EncodeTSOption(tsOpt.tsVal, tsOpt.tsEcr, b)
+		}
+		header.EncodeSACKBlocks(sackBlocks, b[offset:])
+		return b
+	}
+
+	testCases := []struct {
+		b    []byte
+		want header.TCPOptions
+	}{
+		// Trivial cases: empty input, NOPs only, and EOL ending the list early.
+		{nil, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionNOP}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionNOP, header.TCPOptionNOP}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionEOL}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionNOP, header.TCPOptionEOL, header.TCPOptionTS, 10, 1, 1}, header.TCPOptions{false, 0, 0, nil}},
+
+		// Test timestamp parsing, with and without a leading NOP.
+		{[]byte{header.TCPOptionNOP, header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{true, 1, 1, nil}},
+		{[]byte{header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{true, 1, 1, nil}},
+
+		// Test malformed timestamp option (length byte is not 10).
+		{[]byte{header.TCPOptionTS, 8, 1, 1}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionNOP, header.TCPOptionTS, 8, 1, 1}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionNOP, header.TCPOptionTS, 8, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{false, 0, 0, nil}},
+
+		// Test SACKBlock parsing with one and two blocks.
+		{[]byte{header.TCPOptionSACK, 10, 0, 0, 0, 1, 0, 0, 0, 10}, header.TCPOptions{false, 0, 0, []header.SACKBlock{{1, 10}}}},
+		{[]byte{header.TCPOptionSACK, 18, 0, 0, 0, 1, 0, 0, 0, 10, 0, 0, 0, 11, 0, 0, 0, 12}, header.TCPOptions{false, 0, 0, []header.SACKBlock{{1, 10}, {11, 12}}}},
+
+		// Test malformed SACK option (bad length byte or truncated data).
+		{[]byte{header.TCPOptionSACK, 0}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionSACK, 8, 0, 0, 0, 1, 0, 0, 0, 10}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionSACK, 11, 0, 0, 0, 1, 0, 0, 0, 10, 0, 0, 0, 11, 0, 0, 0, 12}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionSACK, 17, 0, 0, 0, 1, 0, 0, 0, 10, 0, 0, 0, 11, 0, 0, 0, 12}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionSACK}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionSACK, 10}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionSACK, 10, 0, 0, 0, 1, 0, 0, 0}, header.TCPOptions{false, 0, 0, nil}},
+
+		// Test Timestamp + SACK block parsing; the last case passes 5 blocks and expects 4 back.
+		{generateOptions(&tsOption{1, 1}, []header.SACKBlock{{1, 10}, {11, 12}}), header.TCPOptions{true, 1, 1, []header.SACKBlock{{1, 10}, {11, 12}}}},
+		{generateOptions(&tsOption{1, 2}, []header.SACKBlock{{1, 10}, {11, 12}}), header.TCPOptions{true, 1, 2, []header.SACKBlock{{1, 10}, {11, 12}}}},
+		{generateOptions(&tsOption{1, 3}, []header.SACKBlock{{1, 10}, {11, 12}, {13, 14}, {14, 15}, {15, 16}}), header.TCPOptions{true, 1, 3, []header.SACKBlock{{1, 10}, {11, 12}, {13, 14}, {14, 15}}}},
+
+		// Test timestamp combined with malformed or overlapping SACK options, in both orders.
+		{[]byte{header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1, header.TCPOptionSACK}, header.TCPOptions{true, 1, 1, nil}},
+		{[]byte{header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1, header.TCPOptionSACK, 10}, header.TCPOptions{true, 1, 1, nil}},
+		{[]byte{header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1, header.TCPOptionSACK, 10, 0, 0, 0}, header.TCPOptions{true, 1, 1, nil}},
+		{[]byte{header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1, header.TCPOptionSACK, 11, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{true, 1, 1, nil}},
+		{[]byte{header.TCPOptionSACK, header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{false, 0, 0, nil}},
+		{[]byte{header.TCPOptionSACK, 10, header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{false, 0, 0, []header.SACKBlock{{134873088, 65536}}}},
+		{[]byte{header.TCPOptionSACK, 10, 0, 0, 0, header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{false, 0, 0, []header.SACKBlock{{8, 167772160}}}},
+		{[]byte{header.TCPOptionSACK, 11, 0, 0, 0, 1, 0, 0, 0, 1, header.TCPOptionTS, 10, 0, 0, 0, 1, 0, 0, 0, 1}, header.TCPOptions{false, 0, 0, nil}},
+	}
+	for _, tc := range testCases {
+		if got, want := header.ParseTCPOptions(tc.b), tc.want; !reflect.DeepEqual(got, want) {
+			t.Errorf("ParseTCPOptions(%v) = %v, want: %v", tc.b, got, tc.want)
+		}
+	}
+}
diff --git a/pkg/tcpip/header/udp.go b/pkg/tcpip/header/udp.go
new file mode 100644
index 000000000..9339d637f
--- /dev/null
+++ b/pkg/tcpip/header/udp.go
@@ -0,0 +1,120 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package header
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const ( // Byte offsets of each field within the UDP header.
+	udpSrcPort  = 0
+	udpDstPort  = 2
+	udpLength   = 4
+	udpChecksum = 6
+)
+
+const (
+	// UDPMaximumPacketSize is the size of the largest possible UDP packet (the largest uint16 value).
+	UDPMaximumPacketSize = 0xffff
+)
+
+// UDPFields contains the fields of a UDP header. It describes the values
+// that UDP.Encode writes into a header.
+type UDPFields struct {
+	// SrcPort is the "source port" field of a UDP packet.
+	SrcPort uint16
+
+	// DstPort is the "destination port" field of a UDP packet.
+	DstPort uint16
+
+	// Length is the "length" field of a UDP packet.
+	Length uint16
+
+	// Checksum is the "checksum" field of a UDP packet.
+	Checksum uint16
+}
+
+// UDP represents a UDP header stored in a byte slice; Payload returns whatever follows the header.
+type UDP []byte
+
+const (
+	// UDPMinimumSize is the minimum size of a valid UDP packet, i.e. the size of the header alone.
+	UDPMinimumSize = 8
+
+	// UDPProtocolNumber is UDP's transport protocol number.
+	UDPProtocolNumber tcpip.TransportProtocolNumber = 17
+)
+
+// SourcePort returns the "source port" field of the UDP header, read as a big-endian uint16.
+func (b UDP) SourcePort() uint16 {
+	return binary.BigEndian.Uint16(b[udpSrcPort:])
+}
+
+// DestinationPort returns the "destination port" field of the UDP header, read as a big-endian uint16.
+func (b UDP) DestinationPort() uint16 {
+	return binary.BigEndian.Uint16(b[udpDstPort:])
+}
+
+// Length returns the "length" field of the UDP header as stored, not the length of the slice b.
+func (b UDP) Length() uint16 {
+	return binary.BigEndian.Uint16(b[udpLength:])
+}
+
+// Payload returns everything in b after the first UDPMinimumSize bytes; the result shares memory with b.
+func (b UDP) Payload() []byte {
+	return b[UDPMinimumSize:]
+}
+
+// Checksum returns the "checksum" field of the UDP header as stored; it does not verify it.
+func (b UDP) Checksum() uint16 {
+	return binary.BigEndian.Uint16(b[udpChecksum:])
+}
+
+// SetSourcePort sets the "source port" field of the UDP header, writing it in big-endian order.
+func (b UDP) SetSourcePort(port uint16) {
+	binary.BigEndian.PutUint16(b[udpSrcPort:], port)
+}
+
+// SetDestinationPort sets the "destination port" field of the UDP header, writing it in big-endian order.
+func (b UDP) SetDestinationPort(port uint16) {
+	binary.BigEndian.PutUint16(b[udpDstPort:], port)
+}
+
+// SetChecksum stores checksum in the "checksum" field of the UDP header; it does not compute it.
+func (b UDP) SetChecksum(checksum uint16) {
+	binary.BigEndian.PutUint16(b[udpChecksum:], checksum)
+}
+
+// SetLength sets the "length" field of the UDP header; it does not resize the slice b.
+func (b UDP) SetLength(length uint16) {
+	binary.BigEndian.PutUint16(b[udpLength:], length)
+}
+
+// CalculateChecksum calculates the checksum of the UDP packet. partialChecksum
+// is the combined checksum of the network-layer pseudo-header and the payload.
+func (b UDP) CalculateChecksum(partialChecksum uint16) uint16 {
+	// Add the UDP header bytes on top of the partial checksum.
+	return Checksum(b[:UDPMinimumSize], partialChecksum)
+}
+
+// Encode writes all four fields of u into the UDP header in big-endian order.
+func (b UDP) Encode(u *UDPFields) {
+	binary.BigEndian.PutUint16(b[udpSrcPort:], u.SrcPort)
+	binary.BigEndian.PutUint16(b[udpDstPort:], u.DstPort)
+	binary.BigEndian.PutUint16(b[udpLength:], u.Length)
+	binary.BigEndian.PutUint16(b[udpChecksum:], u.Checksum)
+}
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
new file mode 100644
index 000000000..b8b93e78e
--- /dev/null
+++ b/pkg/tcpip/link/channel/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "channel",
+    srcs = ["channel.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
new file mode 100644
index 000000000..20b183da0
--- /dev/null
+++ b/pkg/tcpip/link/channel/channel.go
@@ -0,0 +1,298 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package channel provides the implementation of channel-based data-link layer
+// endpoints. Such endpoints allow injection of inbound packets and store
+// outbound packets in a channel.
+package channel
+
+import (
+	"context"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// PacketInfo holds the packet, protocol, GSO and route recorded for one queued outbound packet.
+type PacketInfo struct {
+	Pkt   *stack.PacketBuffer
+	Proto tcpip.NetworkProtocolNumber
+	GSO   *stack.GSO
+	Route stack.Route
+}
+
+// Notification is the interface for receiving notifications from the packet
+// queue.
+type Notification interface {
+	// WriteNotify is called after a packet has been successfully written to the queue.
+	WriteNotify()
+}
+
+// NotificationHandle is an opaque handle to a registered notification target.
+// It is returned by AddNotify and is passed to RemoveNotify to unregister it.
+//
+// +stateify savable
+type NotificationHandle struct {
+	n Notification
+}
+
+type queue struct {
+	// c is the outbound packet channel; its capacity is chosen in New.
+	c chan PacketInfo
+	// mu protects notify.
+	mu     sync.RWMutex
+	notify []*NotificationHandle
+}
+
+func (q *queue) Close() {
+	close(q.c) // A later Write will panic: sending on a closed channel panics.
+}
+
+func (q *queue) Read() (PacketInfo, bool) {
+	select {
+	case p, ok := <-q.c: // ok is false once the channel is closed and fully drained.
+		return p, ok
+	default: // Nothing is queued right now; return without blocking.
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	select {
+	case pkt, ok := <-q.c: // ok is false once the channel is closed and fully drained.
+		return pkt, ok
+	case <-ctx.Done(): // ctx was cancelled or expired before a packet arrived.
+		return PacketInfo{}, false
+	}
+}
+
+func (q *queue) Write(p PacketInfo) bool {
+	wrote := false
+	select {
+	case q.c <- p:
+		wrote = true
+	default: // The channel is full: drop the packet instead of blocking.
+	}
+	q.mu.RLock() // A shared lock suffices: q.notify is only read here.
+	notify := q.notify
+	q.mu.RUnlock()
+
+	if wrote {
+		// Send notifications outside of the lock, using the snapshot taken above.
+		for _, h := range notify {
+			h.n.WriteNotify()
+		}
+	}
+	return wrote
+}
+
+func (q *queue) Num() int {
+	return len(q.c) // Number of packets currently buffered in the channel.
+}
+
+func (q *queue) AddNotify(notify Notification) *NotificationHandle {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	h := &NotificationHandle{n: notify} // A fresh handle per call, even for the same target.
+	q.notify = append(q.notify, h)
+	return h
+}
+
+func (q *queue) RemoveNotify(handle *NotificationHandle) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	// Build a new slice rather than editing in place, since Write iterates over the old one outside the lock.
+	notify := make([]*NotificationHandle, 0, len(q.notify))
+	for _, h := range q.notify {
+		if h != handle {
+			notify = append(notify, h)
+		}
+	}
+	q.notify = notify
+}
+
+// Endpoint is a link-layer endpoint that stores outbound packets in a channel
+// and allows injection of inbound packets.
+type Endpoint struct {
+	dispatcher         stack.NetworkDispatcher
+	mtu                uint32
+	linkAddr           tcpip.LinkAddress
+	LinkEPCapabilities stack.LinkEndpointCapabilities
+
+	// q is the outbound packet queue.
+	q *queue
+}
+
+// New creates a new channel endpoint whose outbound queue can buffer up to size packets.
+func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
+	return &Endpoint{
+		q: &queue{
+			c: make(chan PacketInfo, size),
+		},
+		mtu:      mtu,
+		linkAddr: linkAddr,
+	}
+}
+
+// Close closes e's outbound queue. Further attempts to write outbound packets
+// will panic. Packets that are already queued can still be read.
+func (e *Endpoint) Close() {
+	e.q.Close()
+}
+
+// Read does a non-blocking read of one packet from the outbound packet queue.
+func (e *Endpoint) Read() (PacketInfo, bool) {
+	return e.q.Read()
+}
+
+// ReadContext does a blocking read of one packet from the outbound packet
+// queue. It can be cancelled through ctx, in which case it returns false.
+func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
+	return e.q.ReadContext(ctx)
+}
+
+// Drain removes all outbound packets from the channel and returns how many were removed.
+func (e *Endpoint) Drain() int {
+	c := 0
+	for {
+		if _, ok := e.Read(); !ok {
+			return c
+		}
+		c++
+	}
+}
+
+// NumQueued returns the number of outbound packets currently queued.
+func (e *Endpoint) NumQueued() int {
+	return e.q.Num()
+}
+
+// InjectInbound injects an inbound packet with an empty remote link address.
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.InjectLinkAddr(protocol, "", pkt)
+}
+
+// InjectLinkAddr hands pkt to the dispatcher set by Attach as an inbound packet from the link address remote.
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
+}
+
+// Attach saves the stack network-layer dispatcher, replacing any previous
+// one, for use later when packets are injected.
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached. It reports whether a non-nil dispatcher is set.
+func (e *Endpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the mtu value that was
+// passed to New.
+func (e *Endpoint) MTU() uint32 {
+	return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. It returns the exported LinkEPCapabilities field.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.LinkEPCapabilities
+}
+
+// GSOMaxSize returns the maximum GSO packet size, a fixed 1 << 15 (32768).
+func (*Endpoint) GSOMaxSize() uint32 {
+	return 1 << 15
+}
+
+// MaxHeaderLength returns the maximum size of the link layer header. Since
+// this endpoint has no link layer header, it always returns 0.
+func (*Endpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint, as passed to New.
+func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.linkAddr
+}
+
+// WritePacket queues one outbound packet; if the queue is full the packet is dropped and nil is still returned.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	// Clone r then release its resource so we only get the relevant fields from
+	// stack.Route without holding a reference to a NIC's endpoint.
+	route := r.Clone()
+	route.Release()
+	p := PacketInfo{
+		Pkt:   pkt,
+		Proto: protocol,
+		GSO:   gso,
+		Route: route,
+	}
+
+	e.q.Write(p)
+
+	return nil
+}
+
+// WritePackets queues outbound packets in order, stopping at the first one that does not fit, and returns the number queued.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// Clone r then release its resource so we only get the relevant fields from
+	// stack.Route without holding a reference to a NIC's endpoint.
+	route := r.Clone()
+	route.Release()
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		p := PacketInfo{
+			Pkt:   pkt,
+			Proto: protocol,
+			GSO:   gso,
+			Route: route,
+		}
+
+		if !e.q.Write(p) {
+			break
+		}
+		n++
+	}
+
+	return n, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. It queues vv with protocol 0 and no GSO or route.
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	p := PacketInfo{
+		Pkt:   &stack.PacketBuffer{Data: vv},
+		Proto: 0,
+		GSO:   nil,
+	}
+
+	e.q.Write(p)
+
+	return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It does nothing and returns immediately.
+func (*Endpoint) Wait() {}
+
+// AddNotify adds a notification target for receiving events about outgoing
+// packets. The returned handle can be passed to RemoveNotify.
+func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
+	return e.q.AddNotify(notify)
+}
+
+// RemoveNotify removes handle from the list of notification targets; an unknown handle is ignored.
+func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
+	e.q.RemoveNotify(handle)
+}
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
new file mode 100644
index 000000000..aa6db9aea
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -0,0 +1,40 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "fdbased",
+    srcs = [
+        "endpoint.go",
+        "endpoint_unsafe.go",
+        "mmap.go",
+        "mmap_stub.go",
+        "mmap_unsafe.go",
+        "packet_dispatchers.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/binary",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/rawfile",
+        "//pkg/tcpip/stack",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "fdbased_test",
+    size = "small",
+    srcs = ["endpoint_test.go"],
+    library = ":fdbased",
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/rawfile",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
new file mode 100644
index 000000000..f34082e1a
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -0,0 +1,657 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package fdbased provides the implementation of data-link layer endpoints
+// backed by boundary-preserving file descriptors (e.g., TUN devices,
+// seqpacket/datagram sockets).
+//
+// FD based endpoints can be used in the networking stack by calling New() to
+// create a new endpoint, and then passing it as an argument to
+// Stack.CreateNIC().
+//
+// FD based endpoints can use more than one file descriptor to read incoming
+// packets. If more than one FD is specified and the underlying FD is an
+// AF_PACKET then the endpoint will enable FANOUT mode on the socket so that the
+// host kernel will consistently hash the packets to the sockets. This ensures
+// that packets for the same TCP streams are not reordered.
+//
+// Similarly, if more than one FD is specified where the underlying FD is not
+// AF_PACKET then it's the caller's responsibility to ensure that all inbound
+// packets on the descriptors are consistently 5 tuple hashed to one of the
+// descriptors to prevent TCP reordering.
+//
+// Outbound packets written via WritePacket and WritePackets are spread across
+// the FDs by packet hash (pkt.Hash modulo the number of FDs), so packets with
+// the same hash always leave through the same FD. WriteRawPacket and
+// InjectOutbound always write to the first FD.
+package fdbased
+
+import (
+	"fmt"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// linkDispatcher reads packets from the link FD and dispatches them to the
+// NetworkDispatcher. dispatchLoop stops on a false result or non-nil error.
+type linkDispatcher interface {
+	dispatch() (bool, *tcpip.Error)
+}
+
+// PacketDispatchMode selects one of the supported methods of receiving and
+// dispatching packets from the underlying FD.
+type PacketDispatchMode int
+
+const (
+	// Readv is the default dispatch mode and is the least performant of the
+	// dispatch options but the one that is supported by all underlying FD
+	// types.
+	Readv PacketDispatchMode = iota
+	// RecvMMsg enables use of recvmmsg() syscall instead of readv() to
+	// read inbound packets. This reduces # of syscalls needed to process
+	// packets.
+	//
+	// NOTE: recvmmsg() is only supported for sockets, so if the underlying
+	// FD is not a socket then the code will still fall back to the readv()
+	// path.
+	RecvMMsg
+	// PacketMMap enables use of PACKET_RX_RING to receive packets from the
+	// NIC. PacketMMap requires that the underlying FD be an AF_PACKET. The
+	// primary use-case for this is runsc which uses an AF_PACKET FD to
+	// receive packets from the veth device.
+	PacketMMap
+)
+
+// String implements fmt.Stringer; unknown modes are reported numerically.
+func (p PacketDispatchMode) String() string {
+	names := [...]string{
+		Readv:      "Readv",
+		RecvMMsg:   "RecvMMsg",
+		PacketMMap: "PacketMMap",
+	}
+	if p >= 0 && int(p) < len(names) {
+		return names[p]
+	}
+	return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
+}
+
+type endpoint struct {
+	// fds is the set of file descriptors each identifying one inbound/outbound
+	// channel. The endpoint will dispatch from all inbound channels as well as
+	// hash outbound packets to specific channels based on the packet hash.
+	fds []int
+
+	// mtu (maximum transmission unit) is the maximum size of a packet.
+	mtu uint32
+
+	// hdrSize specifies the link-layer header size. If set to 0, no header
+	// is added/removed; otherwise an ethernet header is used.
+	hdrSize int
+
+	// addr is the link address of the endpoint; default ethernet source.
+	addr tcpip.LinkAddress
+
+	// caps holds the endpoint capabilities.
+	caps stack.LinkEndpointCapabilities
+
+	// closed, if non-nil, is called by dispatchLoop with the (possibly nil)
+	// error once an inbound dispatcher stops.
+	closed func(*tcpip.Error)
+
+	inboundDispatchers []linkDispatcher        // one per FD, created by New
+	dispatcher         stack.NetworkDispatcher // set by Attach
+
+	// packetDispatchMode controls the packet dispatcher used by this
+	// endpoint.
+	packetDispatchMode PacketDispatchMode
+
+	// gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
+	// disabled.
+	gsoMaxSize uint32
+
+	// wg keeps track of running goroutines.
+	wg sync.WaitGroup
+}
+
+// Options specifies the details of the fd-based endpoint to be created by New.
+type Options struct {
+	// FDs is the set of FDs used to read/write packets; it must not be empty.
+	FDs []int
+
+	// MTU is the mtu to use for this endpoint.
+	MTU uint32
+
+	// EthernetHeader if true, indicates that the endpoint should read/write
+	// ethernet frames instead of IP packets.
+	EthernetHeader bool
+
+	// ClosedFunc is a function to be called when an endpoint's peer (if
+	// any) closes its end of the communication pipe.
+	ClosedFunc func(*tcpip.Error)
+
+	// Address is the link address for this endpoint. Only used if
+	// EthernetHeader is true.
+	Address tcpip.LinkAddress
+
+	// SaveRestore if true, indicates that this NIC capability set should
+	// include CapabilitySaveRestore.
+	SaveRestore bool
+
+	// DisconnectOk if true, indicates that this NIC capability set should
+	// include CapabilityDisconnectOk.
+	DisconnectOk bool
+
+	// GSOMaxSize is the maximum GSO packet size; zero disables GSO. New
+	// only honors it for FDs that are sockets.
+	GSOMaxSize uint32
+
+	// SoftwareGSOEnabled picks software rather than hardware GSO when GSO is on.
+	SoftwareGSOEnabled bool
+
+	// PacketDispatchMode specifies the type of inbound dispatcher to be
+	// used for this endpoint.
+	PacketDispatchMode PacketDispatchMode
+
+	// TXChecksumOffload if true, indicates that this endpoint's capability
+	// set should include CapabilityTXChecksumOffload.
+	TXChecksumOffload bool
+
+	// RXChecksumOffload if true, indicates that this endpoint's capability
+	// set should include CapabilityRXChecksumOffload.
+	RXChecksumOffload bool
+}
+
+// fanoutID is the PACKET_FANOUT group ID used for AF_PACKET based endpoints,
+// which lets multiple FDs receive from the same underlying NIC. All FDs of one
+// endpoint must share the ID; setting PACKET_FANOUT with an ID already in use
+// by an FD for a different NIC fails with EINVAL, so New increments it after
+// each endpoint. NOTE(review): it is read and incremented without any
+// synchronization here, so concurrent New calls look racy; confirm callers.
+var fanoutID = 1
+
+// New creates a new fd-based endpoint.
+//
+// Makes fd non-blocking, but does not take ownership of fd, which must remain
+// open for the lifetime of the returned endpoint (until after the endpoint has
+// stopped being used and Wait returns).
+func New(opts *Options) (stack.LinkEndpoint, error) {
+	caps := stack.LinkEndpointCapabilities(0)
+	if opts.RXChecksumOffload {
+		caps |= stack.CapabilityRXChecksumOffload
+	}
+
+	if opts.TXChecksumOffload {
+		caps |= stack.CapabilityTXChecksumOffload
+	}
+
+	hdrSize := 0
+	if opts.EthernetHeader {
+		hdrSize = header.EthernetMinimumSize
+		caps |= stack.CapabilityResolutionRequired
+	}
+
+	if opts.SaveRestore {
+		caps |= stack.CapabilitySaveRestore
+	}
+
+	if opts.DisconnectOk {
+		caps |= stack.CapabilityDisconnectOk
+	}
+
+	if len(opts.FDs) == 0 {
+		return nil, fmt.Errorf("opts.FDs is empty, at least one FD must be specified")
+	}
+
+	e := &endpoint{
+		fds:                opts.FDs,
+		mtu:                opts.MTU,
+		caps:               caps,
+		closed:             opts.ClosedFunc,
+		addr:               opts.Address,
+		hdrSize:            hdrSize,
+		packetDispatchMode: opts.PacketDispatchMode,
+	}
+
+	// Create per channel dispatchers.
+	for _, fd := range e.fds {
+		if err := syscall.SetNonblock(fd, true); err != nil {
+			return nil, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err)
+		}
+
+		isSocket, err := isSocketFD(fd)
+		if err != nil {
+			return nil, err
+		}
+		// GSO is only advertised when an FD is a socket and a maximum GSO
+		// size was requested; SoftwareGSOEnabled picks which capability
+		// bit is set.
+		if isSocket && opts.GSOMaxSize != 0 {
+			if opts.SoftwareGSOEnabled {
+				e.caps |= stack.CapabilitySoftwareGSO
+			} else {
+				e.caps |= stack.CapabilityHardwareGSO
+			}
+			e.gsoMaxSize = opts.GSOMaxSize
+		}
+		inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket)
+		if err != nil {
+			return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+		}
+		e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
+	}
+
+	// Increment fanoutID to ensure that we don't re-use the same fanoutID for
+	// the next endpoint.
+	fanoutID++
+
+	return e, nil
+}
+
+func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) {
+	// The readv() dispatcher is the baseline: it works with every kind of
+	// FD (tap/tun/unix domain sockets and af_packet).
+	d, err := newReadVDispatcher(fd, e)
+	if err != nil {
+		return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
+	}
+	if !isSocket {
+		return d, nil
+	}
+
+	sa, saErr := unix.Getsockname(fd)
+	if saErr != nil {
+		return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, saErr)
+	}
+	// Enable PACKET_FANOUT mode if the underlying socket is of type
+	// AF_PACKET, i.e. its address is a link-layer one.
+	if _, ok := sa.(*unix.SockaddrLinklayer); ok {
+		const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG
+		fanoutArg := fanoutID | fanoutType<<16
+		if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
+			return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
+		}
+	}
+
+	// Sockets may replace the readv dispatcher, depending on the dispatch mode.
+	switch e.packetDispatchMode {
+	case PacketMMap:
+		d, err = newPacketMMapDispatcher(fd, e)
+		if err != nil {
+			return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
+		}
+	case RecvMMsg:
+		// recvmmsg() reads packets in a batch instead of one per read()
+		// call; it is only usable because the FD is a socket.
+		d, err = newRecvMMsgDispatcher(fd, e)
+		if err != nil {
+			return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
+		}
+	}
+	return d, nil
+}
+
+func isSocketFD(fd int) (bool, error) { // isSocketFD reports whether fd is a socket, per fstat.
+	var stat syscall.Stat_t
+	if err := syscall.Fstat(fd, &stat); err != nil {
+		return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err)
+	}
+	return stat.Mode&syscall.S_IFMT == syscall.S_IFSOCK, nil // S_IFMT isolates the file-type bits
+}
+
+// Attach records dispatcher and starts one goroutine per inbound dispatcher;
+// each goroutine runs dispatchLoop until it returns.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher
+	// Link endpoints are not savable. When transportation endpoints are
+	// saved, they stop sending outgoing packets and all incoming packets
+	// are rejected.
+	for _, d := range e.inboundDispatchers {
+		e.wg.Add(1)
+		go func(d linkDispatcher) { // S/R-SAFE: See above.
+			e.dispatchLoop(d)
+			e.wg.Done()
+		}(d)
+	}
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached; true once a dispatcher is set.
+func (e *endpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the mtu stored when the
+// endpoint was constructed (Options.MTU for endpoints built by New).
+func (e *endpoint) MTU() uint32 {
+	return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities; it returns e.caps.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.caps
+}
+
+// MaxHeaderLength returns the link-layer header size (hdrSize), zero if none.
+func (e *endpoint) MaxHeaderLength() uint16 {
+	return uint16(e.hdrSize)
+}
+
+// LinkAddress returns the link address this endpoint was configured with.
+func (e *endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.addr
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It blocks until every dispatch
+// goroutine started by Attach has returned.
+func (e *endpoint) Wait() {
+	e.wg.Wait()
+}
+
+// virtioNetHdr is declared in linux/virtio_net.h; written ahead of each packet when hardware GSO is on.
+type virtioNetHdr struct {
+	flags      uint8
+	gsoType    uint8
+	hdrLen     uint16
+	gsoSize    uint16
+	csumStart  uint16
+	csumOffset uint16
+}
+
+// These constants are declared in linux/virtio_net.h.
+const (
+	_VIRTIO_NET_HDR_F_NEEDS_CSUM = 1 // bit stored in virtioNetHdr.flags
+
+	_VIRTIO_NET_HDR_GSO_TCPV4 = 1 // values stored in virtioNetHdr.gsoType
+	_VIRTIO_NET_HDR_GSO_TCPV6 = 4
+)
+
+// WritePacket writes outbound packets to the file descriptor. If it is not
+// currently writable, the packet is dropped.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	if e.hdrSize > 0 {
+		// Add ethernet header if needed.
+		eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
+		pkt.LinkHeader = buffer.View(eth)
+		ethHdr := &header.EthernetFields{
+			DstAddr: r.RemoteLinkAddress,
+			Type:    protocol,
+		}
+
+		// Preserve the src address if it's set in the route.
+		if r.LocalLinkAddress != "" {
+			ethHdr.SrcAddr = r.LocalLinkAddress
+		} else {
+			ethHdr.SrcAddr = e.addr
+		}
+		eth.Encode(ethHdr)
+	}
+
+	fd := e.fds[pkt.Hash%uint32(len(e.fds))]
+	if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+		vnetHdr := virtioNetHdr{}
+		if gso != nil {
+			vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
+			if gso.NeedsCsum {
+				vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
+				vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen
+				vnetHdr.csumOffset = gso.CsumOffset
+			}
+			if gso.Type != stack.GSONone && uint16(pkt.Data.Size()) > gso.MSS {
+				switch gso.Type {
+				case stack.GSOTCPv4:
+					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
+				case stack.GSOTCPv6:
+					vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+				default:
+					panic(fmt.Sprintf("Unknown gso type: %v", gso.Type))
+				}
+				vnetHdr.gsoSize = gso.MSS
+			}
+		}
+
+		vnetHdrBuf := binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr)
+		return rawfile.NonBlockingWrite3(fd, vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
+	}
+
+	if pkt.Data.Size() == 0 {
+		return rawfile.NonBlockingWrite(fd, pkt.Header.View())
+	}
+	if pkt.Header.UsedLength() == 0 {
+		return rawfile.NonBlockingWrite(fd, pkt.Data.ToView())
+	}
+
+	return rawfile.NonBlockingWrite3(fd, pkt.Header.View(), pkt.Data.ToView(), nil)
+}
+
+func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tcpip.Error) {
+	// Send batch through batchFD, one iovec list per packet; returns the number sent.
+	mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch))
+	for _, pkt := range batch {
+		var ethHdrBuf []byte
+		iovLen := 0
+		if e.hdrSize > 0 {
+			// Add ethernet header if needed.
+			ethHdrBuf = make([]byte, header.EthernetMinimumSize)
+			eth := header.Ethernet(ethHdrBuf)
+			ethHdr := &header.EthernetFields{
+				DstAddr: pkt.EgressRoute.RemoteLinkAddress,
+				Type:    pkt.NetworkProtocolNumber,
+			}
+
+			// Preserve the src address if it's set in the route.
+			if pkt.EgressRoute.LocalLinkAddress != "" {
+				ethHdr.SrcAddr = pkt.EgressRoute.LocalLinkAddress
+			} else {
+				ethHdr.SrcAddr = e.addr
+			}
+			eth.Encode(ethHdr)
+			iovLen++
+		}
+
+		vnetHdr := virtioNetHdr{}
+		var vnetHdrBuf []byte
+		if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+			if pkt.GSOOptions != nil {
+				vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
+				if pkt.GSOOptions.NeedsCsum {
+					vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
+					vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
+					vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
+				}
+				if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data.Size()) > pkt.GSOOptions.MSS {
+					switch pkt.GSOOptions.Type {
+					case stack.GSOTCPv4:
+						vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
+					case stack.GSOTCPv6:
+						vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+					default:
+						panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
+					}
+					vnetHdr.gsoSize = pkt.GSOOptions.MSS
+				}
+			}
+			vnetHdrBuf = binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr)
+			iovLen++
+		}
+
+		iovecs := make([]syscall.Iovec, iovLen+1+len(pkt.Data.Views()))
+		var mmsgHdr rawfile.MMsgHdr
+		mmsgHdr.Msg.Iov = &iovecs[0]
+		iovecIdx := 0
+		if vnetHdrBuf != nil {
+			v := &iovecs[iovecIdx]
+			v.Base = &vnetHdrBuf[0]
+			v.Len = uint64(len(vnetHdrBuf))
+			iovecIdx++
+		}
+		if ethHdrBuf != nil {
+			v := &iovecs[iovecIdx]
+			v.Base = &ethHdrBuf[0]
+			v.Len = uint64(len(ethHdrBuf))
+			iovecIdx++
+		}
+		// Encode the L3 header. An empty header has no first byte to take
+		// the address of, so it simply gets no iovec.
+		if hdrView := pkt.Header.View(); len(hdrView) != 0 {
+			v := &iovecs[iovecIdx]
+			v.Base = &hdrView[0]
+			v.Len = uint64(len(hdrView))
+			iovecIdx++
+		}
+
+		// Now encode the transport payload, skipping empty views likewise.
+		for _, view := range pkt.Data.Views() {
+			if len(view) == 0 {
+				continue
+			}
+			vec := &iovecs[iovecIdx]
+			iovecIdx++
+			vec.Base = &view[0]
+			vec.Len = uint64(len(view))
+		}
+		mmsgHdr.Msg.Iovlen = uint64(iovecIdx)
+		mmsgHdrs = append(mmsgHdrs, mmsgHdr)
+	}
+
+	packets := 0
+	for len(mmsgHdrs) > 0 {
+		sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs)
+		if err != nil {
+			return packets, err
+		}
+		packets += sent
+		mmsgHdrs = mmsgHdrs[sent:]
+	}
+
+	return packets, nil
+}
+
+// WritePackets writes outbound packets to the underlying file descriptors. If
+// one is not currently writable, the packet is dropped.
+//
+// Being a batch API, each packet in pkts should have the following
+// fields populated:
+//  - pkt.EgressRoute
+//  - pkt.GSOOptions
+//  - pkt.NetworkProtocolNumber
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// Preallocate to avoid repeated reallocation as we append to batch.
+	// batchSz is 47 because when SWGSO is in use then a single 65KB TCP
+	// segment can get split into 46 segments of 1420 bytes and a single 216
+	// byte segment.
+	const batchSz = 47
+	batch := make([]*stack.PacketBuffer, 0, batchSz)
+	batchFD := -1
+	sentPackets := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		pktFD := e.fds[pkt.Hash%uint32(len(e.fds))]
+		if len(batch) == 0 {
+			batchFD = pktFD
+		}
+		if pktFD == batchFD {
+			batch = append(batch, pkt)
+			continue
+		}
+		// The FD changed: flush what was collected for the previous FD,
+		// then start a new batch with this packet.
+		n, err := e.sendBatch(batchFD, batch)
+		sentPackets += n
+		if err != nil {
+			return sentPackets, err
+		}
+		batch = append(batch[:0], pkt)
+		batchFD = pktFD
+	}
+
+	if len(batch) == 0 {
+		return sentPackets, nil
+	}
+	// Flush whatever is left over; its count and error are reported
+	// together with the packets already sent.
+	n, err := e.sendBatch(batchFD, batch)
+	return sentPackets + n, err
+}
+
+// viewsEqual reports whether vs1 and vs2 have equal length and start at the same element.
+func viewsEqual(vs1, vs2 []buffer.View) bool {
+	return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0])
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket; it always writes to the first FD.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	return rawfile.NonBlockingWrite(e.fds[0], vv.ToView())
+}
+
+// InjectOutbound implements stack.InjectableEndpoint.InjectOutbound; dest is not used.
+func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
+	return rawfile.NonBlockingWrite(e.fds[0], packet)
+}
+
+// dispatchLoop keeps calling inboundDispatcher.dispatch until it reports an
+// error or asks to stop, then notifies e.closed (if set) and returns the error.
+func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error {
+	for {
+		// Keep going only while dispatch succeeds and asks to continue.
+		if cont, err := inboundDispatcher.dispatch(); err != nil || !cont {
+			if e.closed != nil {
+				e.closed(err)
+			}
+			return err
+		}
+	}
+}
+
+// GSOMaxSize returns the maximum GSO packet size, or zero if GSO is disabled.
+func (e *endpoint) GSOMaxSize() uint32 {
+	return e.gsoMaxSize
+}
+
+// InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes
+// to the FD, but does not read from it. All reads come from injected packets.
+type InjectableEndpoint struct {
+	endpoint
+
+	dispatcher stack.NetworkDispatcher // shadows endpoint.dispatcher; set by Attach
+}
+
+// Attach saves dispatcher for InjectInbound; no dispatch goroutines are started.
+func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher
+	e.endpoint.dispatcher = dispatcher // keeps the promoted IsAttached accurate
+}
+
+// InjectInbound delivers pkt to the dispatcher saved by Attach, with empty link addresses.
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
+}
+
+// NewInjectable creates a new fd-based InjectableEndpoint that writes to fd.
+func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) *InjectableEndpoint {
+	syscall.SetNonblock(fd, true) // the returned error is discarded
+
+	return &InjectableEndpoint{endpoint: endpoint{
+		fds:  []int{fd},
+		mtu:  mtu,
+		caps: capabilities,
+	}}
+}
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
new file mode 100644
index 000000000..eaee7e5d7
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -0,0 +1,502 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fdbased
+
+import (
+	"bytes"
+	"fmt"
+	"math/rand"
+	"reflect"
+	"syscall"
+	"testing"
+	"time"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	mtu        = 1500
+	laddr      = tcpip.LinkAddress("\x11\x22\x33\x44\x55\x66") // endpoint's own link address
+	raddr      = tcpip.LinkAddress("\x77\x88\x99\xaa\xbb\xcc") // peer's link address
+	proto      = 10
+	csumOffset = 48
+	gsoMSS     = 500
+)
+
+type packetInfo struct {
+	raddr    tcpip.LinkAddress           // remote link address passed to DeliverNetworkPacket
+	proto    tcpip.NetworkProtocolNumber // protocol passed to DeliverNetworkPacket
+	contents *stack.PacketBuffer         // the delivered packet
+}
+
+type context struct {
+	t        *testing.T
+	readFDs  []int              // test-side ends of the two socket pairs
+	writeFDs []int              // ends handed to the endpoint via Options.FDs
+	ep       stack.LinkEndpoint // endpoint under test
+	ch       chan packetInfo    // packets received by DeliverNetworkPacket
+	done     chan struct{}      // receives one value per ClosedFunc call
+}
+
+func newContext(t *testing.T, opt *Options) *context {
+	firstFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	if err != nil {
+		t.Fatalf("Socketpair failed: %v", err)
+	}
+	secondFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+	if err != nil {
+		t.Fatalf("Socketpair failed: %v", err)
+	}
+
+	done := make(chan struct{}, 2) // room for one ClosedFunc call per FD
+	opt.ClosedFunc = func(*tcpip.Error) {
+		done <- struct{}{}
+	}
+
+	opt.FDs = []int{firstFDPair[1], secondFDPair[1]} // overwrites any FDs set by the caller
+	ep, err := New(opt)
+	if err != nil {
+		t.Fatalf("Failed to create FD endpoint: %v", err)
+	}
+
+	c := &context{
+		t:        t,
+		readFDs:  []int{firstFDPair[0], secondFDPair[0]},
+		writeFDs: opt.FDs,
+		ep:       ep,
+		ch:       make(chan packetInfo, 100),
+		done:     done,
+	}
+
+	ep.Attach(c) // c acts as the network dispatcher
+
+	return c
+}
+
+func (c *context) cleanup() {
+	for _, fd := range c.readFDs {
+		syscall.Close(fd)
+	}
+	<-c.done // wait for one ClosedFunc call ...
+	<-c.done // ... per FD before closing the endpoint's ends
+	for _, fd := range c.writeFDs {
+		syscall.Close(fd)
+	}
+}
+
+func (c *context) DeliverNetworkPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	c.ch <- packetInfo{remote, protocol, pkt} // local is not recorded
+}
+
+func TestNoEthernetProperties(t *testing.T) {
+	c := newContext(t, &Options{MTU: mtu}) // EthernetHeader is left false
+	defer c.cleanup()
+
+	if want, v := uint16(0), c.ep.MaxHeaderLength(); want != v {
+		t.Fatalf("MaxHeaderLength() = %v, want %v", v, want)
+	}
+
+	if want, v := uint32(mtu), c.ep.MTU(); want != v {
+		t.Fatalf("MTU() = %v, want %v", v, want)
+	}
+}
+
+func TestEthernetProperties(t *testing.T) {
+	c := newContext(t, &Options{EthernetHeader: true, MTU: mtu})
+	defer c.cleanup()
+
+	// With ethernet framing the header length is the ethernet header size.
+	if want, v := uint16(header.EthernetMinimumSize), c.ep.MaxHeaderLength(); want != v {
+		t.Fatalf("MaxHeaderLength() = %v, want %v", v, want)
+	}
+	if want, v := uint32(mtu), c.ep.MTU(); want != v {
+		t.Fatalf("MTU() = %v, want %v", v, want)
+	}
+}
+
+func TestAddress(t *testing.T) {
+	addrs := []tcpip.LinkAddress{"", "abc", "def"} // includes the empty address
+	for _, a := range addrs {
+		t.Run(fmt.Sprintf("Address: %q", a), func(t *testing.T) {
+			c := newContext(t, &Options{Address: a, MTU: mtu})
+			defer c.cleanup()
+
+			if want, v := a, c.ep.LinkAddress(); want != v {
+				t.Fatalf("LinkAddress() = %v, want %v", v, want)
+			}
+		})
+	}
+}
+
+func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash uint32) {
+	c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize})
+	defer c.cleanup()
+
+	r := &stack.Route{
+		RemoteLinkAddress: raddr,
+	}
+
+	// Build a 100-byte random header, leaving room for the link header.
+	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()) + 100)
+	b := hdr.Prepend(100)
+	for i := range b {
+		b[i] = uint8(rand.Intn(256))
+	}
+
+	// Build a random payload of plen bytes and write the packet.
+	payload := make(buffer.View, plen)
+	for i := range payload {
+		payload[i] = uint8(rand.Intn(256))
+	}
+	want := append(hdr.View(), payload...)
+	var gso *stack.GSO
+	if gsoMaxSize != 0 {
+		gso = &stack.GSO{
+			Type:       stack.GSOTCPv6,
+			NeedsCsum:  true,
+			CsumOffset: csumOffset,
+			MSS:        gsoMSS,
+			MaxSize:    gsoMaxSize,
+			L3HdrLen:   header.IPv4MaximumHeaderSize,
+		}
+	}
+	if err := c.ep.WritePacket(r, gso, proto, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   payload.ToVectorisedView(),
+		Hash:   hash,
+	}); err != nil {
+		t.Fatalf("WritePacket failed: %v", err)
+	}
+
+	// Read from the FD selected by hash, then compare with what we wrote.
+	b = make([]byte, mtu)
+	fd := c.readFDs[hash%uint32(len(c.readFDs))]
+	n, err := syscall.Read(fd, b)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+	b = b[:n]
+	if gsoMaxSize != 0 {
+		vnetHdr := *(*virtioNetHdr)(unsafe.Pointer(&b[0]))
+		if vnetHdr.flags&_VIRTIO_NET_HDR_F_NEEDS_CSUM == 0 {
+			t.Fatalf("virtioNetHdr.flags %v  doesn't contain %v", vnetHdr.flags, _VIRTIO_NET_HDR_F_NEEDS_CSUM)
+		}
+		csumStart := header.EthernetMinimumSize + gso.L3HdrLen
+		if vnetHdr.csumStart != csumStart {
+			t.Fatalf("vnetHdr.csumStart = %v, want %v", vnetHdr.csumStart, csumStart)
+		}
+		if vnetHdr.csumOffset != csumOffset {
+			t.Fatalf("vnetHdr.csumOffset = %v, want %v", vnetHdr.csumOffset, csumOffset)
+		}
+		gsoType := uint8(0)
+		if int(gso.MSS) < plen {
+			gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+		}
+		if vnetHdr.gsoType != gsoType {
+			t.Fatalf("vnetHdr.gsoType = %v, want %v", vnetHdr.gsoType, gsoType)
+		}
+		b = b[virtioNetHdrSize:]
+	}
+	if eth {
+		h := header.Ethernet(b)
+		b = b[header.EthernetMinimumSize:]
+
+		if a := h.SourceAddress(); a != laddr {
+			t.Fatalf("SourceAddress() = %v, want %v", a, laddr)
+		}
+
+		if a := h.DestinationAddress(); a != raddr {
+			t.Fatalf("DestinationAddress() = %v, want %v", a, raddr)
+		}
+
+		if et := h.Type(); et != proto {
+			t.Fatalf("Type() = %v, want %v", et, proto)
+		}
+	}
+	if len(b) != len(want) {
+		t.Fatalf("Read returned %v bytes, want %v", len(b), len(want))
+	}
+	if !bytes.Equal(b, want) {
+		t.Fatalf("Read returned %x, want %x", b, want)
+	}
+}
+
+func TestWritePacket(t *testing.T) {
+	lengths := []int{0, 100, 1000}
+	eths := []bool{true, false}
+	gsos := []uint32{0, 32768} // zero leaves GSO disabled
+
+	for _, eth := range eths {
+		for _, plen := range lengths {
+			for _, gso := range gsos {
+				t.Run(
+					fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v", eth, plen, gso),
+					func(t *testing.T) {
+						testWritePacket(t, plen, eth, gso, 0)
+					},
+				)
+			}
+		}
+	}
+}
+
+func TestHashedWritePacket(t *testing.T) {
+	lengths := []int{0, 100, 1000}
+	eths := []bool{true, false}
+	gsos := []uint32{0, 32768}
+	hashes := []uint32{0, 1} // one hash per FD of the two-FD test context
+	for _, eth := range eths {
+		for _, plen := range lengths {
+			for _, gso := range gsos {
+				for _, hash := range hashes {
+					t.Run(
+						fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v,Hash=%d", eth, plen, gso, hash),
+						func(t *testing.T) {
+							testWritePacket(t, plen, eth, gso, hash)
+						},
+					)
+				}
+			}
+		}
+	}
+}
+
+func TestPreserveSrcAddress(t *testing.T) {
+	baddr := tcpip.LinkAddress("\xcc\xbb\xaa\x77\x88\x99")
+
+	c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: true})
+	defer c.cleanup()
+
+	// Set LocalLinkAddress in route to the value of the bridged address.
+	r := &stack.Route{
+		RemoteLinkAddress: raddr,
+		LocalLinkAddress:  baddr,
+	}
+
+	// WritePacket panics given a prependable with anything less than
+	// the minimum size of the ethernet header.
+	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
+	if err := c.ep.WritePacket(r, nil /* gso */, proto, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   buffer.VectorisedView{},
+	}); err != nil {
+		t.Fatalf("WritePacket failed: %v", err)
+	}
+
+	// Read from the first FD (Hash is zero), then check the source address.
+	b := make([]byte, mtu)
+	n, err := syscall.Read(c.readFDs[0], b)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+	b = b[:n]
+	h := header.Ethernet(b)
+
+	if a := h.SourceAddress(); a != baddr {
+		t.Fatalf("SourceAddress() = %v, want %v", a, baddr)
+	}
+}
+
+func TestDeliverPacket(t *testing.T) {
+	lengths := []int{100, 1000}
+	eths := []bool{true, false}
+
+	for _, eth := range eths {
+		for _, plen := range lengths {
+			t.Run(fmt.Sprintf("Eth=%v,PayloadLen=%v", eth, plen), func(t *testing.T) {
+				c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth})
+				defer c.cleanup()
+
+				// Build a packet of plen random bytes.
+				b := make([]byte, plen)
+				all := b
+				for i := range b {
+					b[i] = uint8(rand.Intn(256))
+				}
+
+				var hdr header.Ethernet
+				if !eth {
+					// So that it looks like an IPv4 packet.
+					b[0] = 0x40
+				} else {
+					hdr = make(header.Ethernet, header.EthernetMinimumSize)
+					hdr.Encode(&header.EthernetFields{
+						SrcAddr: raddr,
+						DstAddr: laddr,
+						Type:    proto,
+					})
+					all = append(hdr, b...)
+				}
+
+				// Write the packet via the test-side end of the first FD pair.
+				if _, err := syscall.Write(c.readFDs[0], all); err != nil {
+					t.Fatalf("Write failed: %v", err)
+				}
+
+				// Receive packet through the endpoint.
+				select {
+				case pi := <-c.ch:
+					want := packetInfo{
+						raddr: raddr,
+						proto: proto,
+						contents: &stack.PacketBuffer{
+							Data:       buffer.View(b).ToVectorisedView(),
+							LinkHeader: buffer.View(hdr),
+						},
+					}
+					if !eth {
+						want.proto = header.IPv4ProtocolNumber
+						want.raddr = ""
+					}
+					// want.contents.Data will be a single
+					// view, so make pi do the same for the
+					// DeepEqual check.
+					pi.contents.Data = pi.contents.Data.ToView().ToVectorisedView()
+					if !reflect.DeepEqual(want, pi) {
+						t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want)
+					}
+				case <-time.After(10 * time.Second):
+					t.Fatalf("Timed out waiting for packet")
+				}
+			})
+		}
+	}
+}
+
+func TestBufConfigMaxLength(t *testing.T) {
+	got := 0 // sum of all BufConfig entries
+	for _, i := range BufConfig {
+		got += i
+	}
+	want := header.MaxIPPacketSize // maximum TCP packet size
+	if got < want {
+		t.Errorf("total buffer size is invalid: got %d, want >= %d", got, want)
+	}
+}
+
+func TestBufConfigFirst(t *testing.T) {
+	// The stack assumes that the TCP/IP header is entirely contained in the first view.
+	// Therefore, the first view needs to be large enough to contain the maximum TCP/IP
+	// header, which is 120 bytes (60 bytes for IP + 60 bytes for TCP).
+	want := 120
+	got := BufConfig[0]
+	if got < want {
+		t.Errorf("first view has an invalid size: got %d, want >= %d", got, want)
+	}
+}
+
+var capLengthTestCases = []struct {
+	comment     string // test case name used in failure messages
+	config      []int  // buffer sizes handed to allocateViews and capViews
+	n           int    // byte count handed to capViews
+	wantUsed    int    // expected capViews result
+	wantLengths []int  // expected length of each view after capViews
+}{
+	{
+		comment:     "Single slice",
+		config:      []int{2},
+		n:           1,
+		wantUsed:    1,
+		wantLengths: []int{1},
+	},
+	{
+		comment:     "Multiple slices",
+		config:      []int{1, 2},
+		n:           2,
+		wantUsed:    2,
+		wantLengths: []int{1, 1},
+	},
+	{
+		comment:     "Entire buffer",
+		config:      []int{1, 2},
+		n:           3,
+		wantUsed:    2,
+		wantLengths: []int{1, 2},
+	},
+	{
+		comment:     "Entire buffer but not on the last slice",
+		config:      []int{1, 2, 3},
+		n:           3,
+		wantUsed:    2,
+		wantLengths: []int{1, 2, 3},
+	},
+}
+
+func TestReadVDispatcherCapLength(t *testing.T) {
+	for _, tc := range capLengthTestCases {
+		// The dispatcher never touches its fd in this test, so -1 will do.
+		disp := readVDispatcher{fd: -1, e: &endpoint{}}
+		disp.views = make([]buffer.View, len(tc.config))
+		disp.iovecs = make([]syscall.Iovec, len(tc.config))
+		disp.allocateViews(tc.config)
+
+		gotUsed := disp.capViews(tc.n, tc.config)
+		if gotUsed != tc.wantUsed {
+			t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", tc.comment, tc.n, tc.config, gotUsed, tc.wantUsed)
+		}
+		gotLengths := make([]int, len(disp.views))
+		for idx := range disp.views {
+			gotLengths[idx] = len(disp.views[idx])
+		}
+		if !reflect.DeepEqual(gotLengths, tc.wantLengths) {
+			t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", tc.comment, tc.n, tc.config, gotLengths, tc.wantLengths)
+		}
+	}
+}
+
+func TestRecvMMsgDispatcherCapLength(t *testing.T) {
+	for _, c := range capLengthTestCases {
+		d := recvMMsgDispatcher{
+			fd:      -1, // fd does not matter for this test.
+			e:       &endpoint{},
+			views:   make([][]buffer.View, 1),
+			iovecs:  make([][]syscall.Iovec, 1),
+			msgHdrs: make([]rawfile.MMsgHdr, 1),
+		}
+
+		for i := range d.views {
+			d.views[i] = make([]buffer.View, len(c.config))
+		}
+		for i := range d.iovecs {
+			d.iovecs[i] = make([]syscall.Iovec, len(c.config))
+		}
+		// Index into msgHdrs: ranging by value would only update a copy.
+		for k := range d.msgHdrs {
+			d.msgHdrs[k].Msg.Iov = &d.iovecs[k][0]
+			d.msgHdrs[k].Msg.Iovlen = uint64(len(c.config))
+		}
+
+		d.allocateViews(c.config)
+
+		used := d.capViews(0, c.n, c.config)
+		if used != c.wantUsed {
+			t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed)
+		}
+		lengths := make([]int, len(d.views[0]))
+		for i, v := range d.views[0] {
+			lengths[i] = len(v)
+		}
+		if !reflect.DeepEqual(lengths, c.wantLengths) {
+			t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths)
+		}
+	}
+}
diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
new file mode 100644
index 000000000..df14eaad1
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
@@ -0,0 +1,23 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fdbased
+
+import (
+	"unsafe"
+)
+
+const virtioNetHdrSize = int(unsafe.Sizeof(virtioNetHdr{}))
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
new file mode 100644
index 000000000..2dfd29aa9
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -0,0 +1,199 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64 linux,arm64
+
+package fdbased
+
+import (
+	"encoding/binary"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	tPacketAlignment = uintptr(16)
+	tpStatusKernel   = 0
+	tpStatusUser     = 1
+	tpStatusCopy     = 2
+	tpStatusLosing   = 4
+)
+
+// We overallocate the frame size to accommodate space for the
+// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
+//
+// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR ~= 2 MiB
+//
+// NOTE:
+//   Frames need to be aligned at 16 byte boundaries.
+//   BlockSize needs to be page aligned.
+//
+//   For details see PACKET_MMAP setting constraints in
+//   https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+const (
+	tpFrameSize = 65536 + 128
+	tpBlockSize = tpFrameSize * 32
+	tpBlockNR   = 1
+	tpFrameNR   = (tpBlockSize * tpBlockNR) / tpFrameSize
+)
+
+// tPacketAlign rounds v up to the next multiple of tPacketAlignment, as the
+// TPACKET_ALIGN macro in <linux/if_packet.h> does.
+func tPacketAlign(v uintptr) uintptr {
+	return (v + (tPacketAlignment - 1)) &^ (tPacketAlignment - 1)
+}
+
+// tPacketReq is the tpacket_req structure as described in
+// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+type tPacketReq struct {
+	tpBlockSize uint32
+	tpBlockNR   uint32
+	tpFrameSize uint32
+	tpFrameNR   uint32
+}
+
+// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h>
+type tPacketHdr []byte
+
+const (
+	tpStatusOffset  = 0
+	tpLenOffset     = 8
+	tpSnapLenOffset = 12
+	tpMacOffset     = 16
+	tpNetOffset     = 18
+	tpSecOffset     = 20
+	tpUSecOffset    = 24
+)
+
+func (t tPacketHdr) tpLen() uint32 {
+	return binary.LittleEndian.Uint32(t[tpLenOffset:])
+}
+
+func (t tPacketHdr) tpSnapLen() uint32 {
+	return binary.LittleEndian.Uint32(t[tpSnapLenOffset:])
+}
+
+func (t tPacketHdr) tpMac() uint16 {
+	return binary.LittleEndian.Uint16(t[tpMacOffset:])
+}
+
+func (t tPacketHdr) tpNet() uint16 {
+	return binary.LittleEndian.Uint16(t[tpNetOffset:])
+}
+
+func (t tPacketHdr) tpSec() uint32 {
+	return binary.LittleEndian.Uint32(t[tpSecOffset:])
+}
+
+func (t tPacketHdr) tpUSec() uint32 {
+	return binary.LittleEndian.Uint32(t[tpUSecOffset:])
+}
+
+func (t tPacketHdr) Payload() []byte {
+	return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()]
+}
+
+// packetMMapDispatcher uses a PACKET_RX_RING to read/dispatch inbound packets.
+// See mmap_unsafe.go for how the ring is requested and mmapped.
+type packetMMapDispatcher struct {
+	// fd is the file descriptor used to send and receive packets.
+	fd int
+
+	// e is the endpoint this dispatcher is attached to.
+	e *endpoint
+
+	// ringBuffer is the mmapped PACKET_RX_RING buffer; frames are laid out
+	// in it back to back, tpFrameSize bytes apart.
+	ringBuffer []byte
+
+	// ringOffset is the index (in frames, not bytes) of the ring frame that
+	// readMMappedPacket examines next; it wraps at tpFrameNR.
+	ringOffset int
+}
+
+func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) {
+	hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:])
+	for hdr.tpStatus()&tpStatusUser == 0 {
+		event := rawfile.PollEvent{
+			FD:     int32(d.fd),
+			Events: unix.POLLIN | unix.POLLERR,
+		}
+		if _, errno := rawfile.BlockingPoll(&event, 1, nil); errno != 0 {
+			if errno == syscall.EINTR {
+				continue
+			}
+			return nil, rawfile.TranslateErrno(errno)
+		}
+		if hdr.tpStatus()&tpStatusCopy != 0 {
+			// This frame is truncated so skip it after flipping the
+			// buffer to the kernel.
+			hdr.setTPStatus(tpStatusKernel)
+			d.ringOffset = (d.ringOffset + 1) % tpFrameNR
+			hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:])
+			continue
+		}
+	}
+
+	// Copy out the packet from the mmapped frame to a locally owned buffer.
+	pkt := make([]byte, hdr.tpSnapLen())
+	copy(pkt, hdr.Payload())
+	// Release packet to kernel.
+	hdr.setTPStatus(tpStatusKernel)
+	d.ringOffset = (d.ringOffset + 1) % tpFrameNR
+	return pkt, nil
+}
+
+// dispatch reads packets from an mmaped ring buffer and dispatches them to the
+// network stack.
+func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
+	pkt, err := d.readMMappedPacket()
+	if err != nil {
+		return false, err
+	}
+	var (
+		p             tcpip.NetworkProtocolNumber
+		remote, local tcpip.LinkAddress
+		eth           header.Ethernet
+	)
+	if d.e.hdrSize > 0 {
+		eth = header.Ethernet(pkt)
+		p = eth.Type()
+		remote = eth.SourceAddress()
+		local = eth.DestinationAddress()
+	} else {
+		// We don't get any indication of what the packet is, so try to guess
+		// if it's an IPv4 or IPv6 packet.
+		switch header.IPVersion(pkt) {
+		case header.IPv4Version:
+			p = header.IPv4ProtocolNumber
+		case header.IPv6Version:
+			p = header.IPv6ProtocolNumber
+		default:
+			return true, nil
+		}
+	}
+
+	pkt = pkt[d.e.hdrSize:]
+	d.e.dispatcher.DeliverNetworkPacket(remote, local, p, &stack.PacketBuffer{
+		Data:       buffer.View(pkt).ToVectorisedView(),
+		LinkHeader: buffer.View(eth),
+	})
+	return true, nil
+}
diff --git a/pkg/tcpip/link/fdbased/mmap_stub.go b/pkg/tcpip/link/fdbased/mmap_stub.go
new file mode 100644
index 000000000..67be52d67
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap_stub.go
@@ -0,0 +1,23 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !linux !amd64,!arm64
+
+package fdbased
+
+// Stubbed out version for non-linux/non-amd64/non-arm64 platforms.
+
+func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+	return nil, nil
+}
diff --git a/pkg/tcpip/link/fdbased/mmap_unsafe.go b/pkg/tcpip/link/fdbased/mmap_unsafe.go
new file mode 100644
index 000000000..3894185ae
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap_unsafe.go
@@ -0,0 +1,84 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64 linux,arm64
+
+package fdbased
+
+import (
+	"fmt"
+	"sync/atomic"
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+)
+
+// tPacketHdrlen mirrors TPACKET_HDRLEN from <linux/if_packet.h>. NOTE(review): tPacketHdr is a []byte, so Sizeof measures a slice header, not struct tpacket_hdr - confirm.
+var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{}))
+
+// tpStatus returns the frame status field.
+// The kernel updates the status concurrently, so we must use atomic
+// operations to prevent races.
+func (t tPacketHdr) tpStatus() uint32 {
+	hdr := unsafe.Pointer(&t[0])
+	statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset))
+	return atomic.LoadUint32((*uint32)(statusPtr))
+}
+
+// setTPStatus sets the frame status to the provided status.
+// The kernel updates the status concurrently, so we must use atomic
+// operations to prevent races.
+func (t tPacketHdr) setTPStatus(status uint32) {
+	hdr := unsafe.Pointer(&t[0])
+	statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset))
+	atomic.StoreUint32((*uint32)(statusPtr), status)
+}
+
+func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+	d := &packetMMapDispatcher{
+		fd: fd,
+		e:  e,
+	}
+	pageSize := unix.Getpagesize()
+	if tpBlockSize%pageSize != 0 {
+		return nil, fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize)
+	}
+	tReq := tPacketReq{
+		tpBlockSize: uint32(tpBlockSize),
+		tpBlockNR:   uint32(tpBlockNR),
+		tpFrameSize: uint32(tpFrameSize),
+		tpFrameNR:   uint32(tpFrameNR),
+	}
+	// Setup PACKET_RX_RING.
+	if err := setsockopt(d.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil {
+		return nil, fmt.Errorf("failed to enable PACKET_RX_RING: %v", err)
+	}
+	// Let's mmap the blocks.
+	sz := tpBlockSize * tpBlockNR
+	buf, err := syscall.Mmap(d.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
+	if err != nil {
+		return nil, fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err)
+	}
+	d.ringBuffer = buf
+	return d, nil
+}
+
+func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error {
+	if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 {
+		return error(errno)
+	}
+
+	return nil
+}
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
new file mode 100644
index 000000000..f04738cfb
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -0,0 +1,317 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fdbased
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// BufConfig defines the shape of the vectorised view used to read packets from the NIC.
+var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
+
+// readVDispatcher uses readv() system call to read inbound packets and
+// dispatches them.
+type readVDispatcher struct {
+	// fd is the file descriptor used to send and receive packets.
+	fd int
+
+	// e is the endpoint this dispatcher is attached to.
+	e *endpoint
+
+	// views are the actual buffers that hold the packet contents.
+	views []buffer.View
+
+	// iovecs are initialized with base pointers/len of the corresponding
+	// entries in the views defined above, except when GSO is enabled then
+	// the first iovec points to a buffer for the vnet header which is
+	// stripped before the views are passed up the stack for further
+	// processing.
+	iovecs []syscall.Iovec
+}
+
+func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+	d := &readVDispatcher{fd: fd, e: e}
+	d.views = make([]buffer.View, len(BufConfig))
+	iovLen := len(BufConfig)
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+		iovLen++
+	}
+	d.iovecs = make([]syscall.Iovec, iovLen)
+	return d, nil
+}
+
+// allocateViews allocates a view, and points its iovec at it, for each leading nil entry of d.views.
+func (d *readVDispatcher) allocateViews(bufConfig []int) {
+	var vnetHdr [virtioNetHdrSize]byte
+	vnetHdrOff := 0
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+		// The kernel adds virtioNetHdr before each packet, but we don't
+		// use it, so it gets a buffer and an iovec but no view.
+		d.iovecs[0] = syscall.Iovec{
+			Base: &vnetHdr[0],
+			Len:  uint64(virtioNetHdrSize),
+		}
+		vnetHdrOff++
+	}
+	for i := 0; i < len(bufConfig); i++ {
+		if d.views[i] != nil {
+			break
+		}
+		b := buffer.NewView(bufConfig[i])
+		d.views[i] = b
+		d.iovecs[i+vnetHdrOff] = syscall.Iovec{
+			Base: &b[0],
+			Len:  uint64(len(b)),
+		}
+	}
+}
+
+// capViews calls CapLength on the view in which byte n falls and returns the number of views needed to hold n bytes.
+func (d *readVDispatcher) capViews(n int, buffers []int) int {
+	for i, left := 0, n; i < len(buffers); i++ {
+		if left <= buffers[i] {
+			d.views[i].CapLength(left)
+			return i + 1
+		}
+		left -= buffers[i]
+	}
+	return len(buffers)
+}
+
+// dispatch reads one packet from the file descriptor and dispatches it.
+func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
+	d.allocateViews(BufConfig)
+
+	n, err := rawfile.BlockingReadv(d.fd, d.iovecs)
+	if err != nil {
+		return false, err
+	}
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+		// Skip virtioNetHdr which is added before each packet, it
+		// isn't used and it isn't in a view.
+		n -= virtioNetHdrSize
+	}
+	if n <= d.e.hdrSize {
+		return false, nil
+	}
+
+	var (
+		p             tcpip.NetworkProtocolNumber
+		remote, local tcpip.LinkAddress
+		eth           header.Ethernet
+	)
+	if d.e.hdrSize > 0 {
+		eth = header.Ethernet(d.views[0][:header.EthernetMinimumSize])
+		p = eth.Type()
+		remote = eth.SourceAddress()
+		local = eth.DestinationAddress()
+	} else {
+		// We don't get any indication of what the packet is, so try to guess
+		// if it's an IPv4 or IPv6 packet.
+		switch header.IPVersion(d.views[0]) {
+		case header.IPv4Version:
+			p = header.IPv4ProtocolNumber
+		case header.IPv6Version:
+			p = header.IPv6ProtocolNumber
+		default:
+			return true, nil
+		}
+	}
+
+	used := d.capViews(n, BufConfig)
+	pkt := &stack.PacketBuffer{
+		Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
+		LinkHeader: buffer.View(eth),
+	}
+	pkt.Data.TrimFront(d.e.hdrSize)
+
+	d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
+
+	// Prepare d.views for another packet: nil the used views so that the
+	// next allocateViews call replaces them.
+	for i := 0; i < used; i++ {
+		d.views[i] = nil
+	}
+	return true, nil
+}
+
+// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and
+// dispatches them.
+type recvMMsgDispatcher struct {
+	// fd is the file descriptor used to send and receive packets.
+	fd int
+
+	// e is the endpoint this dispatcher is attached to.
+	e *endpoint
+
+	// views is an array of arrays of buffers that contain packet contents.
+	views [][]buffer.View
+
+	// iovecs is an array of arrays of iovec records where each iovec base
+	// pointer and length are initialized to the corresponding view above,
+	// except when GSO is enabled: then the first iovec in each array of
+	// iovecs points to a buffer for the vnet header, which is stripped
+	// before the views are passed up the stack for further processing.
+	iovecs [][]syscall.Iovec
+
+	// msgHdrs is an array of MMsgHdr objects where each MMsgHdr is used to
+	// reference an array of iovecs in the iovecs field defined above. This
+	// array is passed as the parameter to the recvmmsg call to retrieve
+	// potentially more than 1 packet per syscall.
+	msgHdrs []rawfile.MMsgHdr
+}
+
+const (
+	// MaxMsgsPerRecv is the maximum number of packets we want to retrieve
+	// in a single RecvMMsg call.
+	MaxMsgsPerRecv = 8
+)
+
+func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+	d := &recvMMsgDispatcher{
+		fd: fd,
+		e:  e,
+	}
+	d.views = make([][]buffer.View, MaxMsgsPerRecv)
+	for i := range d.views {
+		d.views[i] = make([]buffer.View, len(BufConfig))
+	}
+	d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv)
+	iovLen := len(BufConfig)
+	if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+		// virtioNetHdr is prepended before each packet.
+		iovLen++
+	}
+	for i := range d.iovecs {
+		d.iovecs[i] = make([]syscall.Iovec, iovLen)
+	}
+	d.msgHdrs = make([]rawfile.MMsgHdr, MaxMsgsPerRecv)
+	for i := range d.msgHdrs {
+		d.msgHdrs[i].Msg.Iov = &d.iovecs[i][0]
+		d.msgHdrs[i].Msg.Iovlen = uint64(iovLen)
+	}
+	return d, nil
+}
+
+// capViews calls CapLength on the view of message k in which byte n falls and returns the number of views needed to hold n bytes.
+func (d *recvMMsgDispatcher) capViews(k, n int, buffers []int) int {
+	for i, left := 0, n; i < len(buffers); i++ {
+		if left <= buffers[i] {
+			d.views[k][i].CapLength(left)
+			return i + 1
+		}
+		left -= buffers[i]
+	}
+	return len(buffers)
+}
+
+// allocateViews allocates, for every message slot k, a view (and matching iovec) for each leading nil entry of d.views[k].
+func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) {
+	for k := 0; k < len(d.views); k++ {
+		var vnetHdr [virtioNetHdrSize]byte
+		vnetHdrOff := 0
+		if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+			// The kernel adds virtioNetHdr before each packet, but we don't
+			// use it, so it gets a buffer and an iovec but no view.
+			d.iovecs[k][0] = syscall.Iovec{
+				Base: &vnetHdr[0],
+				Len:  uint64(virtioNetHdrSize),
+			}
+			vnetHdrOff++
+		}
+		for i := 0; i < len(bufConfig); i++ {
+			if d.views[k][i] != nil {
+				break
+			}
+			b := buffer.NewView(bufConfig[i])
+			d.views[k][i] = b
+			d.iovecs[k][i+vnetHdrOff] = syscall.Iovec{
+				Base: &b[0],
+				Len:  uint64(len(b)),
+			}
+		}
+	}
+}
+
+// dispatch reads a batch of packets from the file descriptor via
+// rawfile.BlockingRecvMMsg and delivers each one to the network dispatcher.
+func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
+	d.allocateViews(BufConfig)
+
+	nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs)
+	if err != nil {
+		return false, err
+	}
+	// Process each of the received packets.
+	for k := 0; k < nMsgs; k++ {
+		n := int(d.msgHdrs[k].Len)
+		if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+			n -= virtioNetHdrSize
+		}
+		if n <= d.e.hdrSize {
+			return false, nil
+		}
+
+		var (
+			p             tcpip.NetworkProtocolNumber
+			remote, local tcpip.LinkAddress
+			eth           header.Ethernet
+		)
+		if d.e.hdrSize > 0 {
+			eth = header.Ethernet(d.views[k][0][:header.EthernetMinimumSize])
+			p = eth.Type()
+			remote = eth.SourceAddress()
+			local = eth.DestinationAddress()
+		} else {
+			// We don't get any indication of what the packet is, so try to guess
+			// if it's an IPv4 or IPv6 packet.
+			switch header.IPVersion(d.views[k][0]) {
+			case header.IPv4Version:
+				p = header.IPv4ProtocolNumber
+			case header.IPv6Version:
+				p = header.IPv6ProtocolNumber
+			default:
+				// Unknown version: skip this message only, not the rest of the batch.
+				continue
+			}
+		}
+		used := d.capViews(k, n, BufConfig)
+		pkt := &stack.PacketBuffer{
+			Data:       buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[k][:used]...)),
+			LinkHeader: buffer.View(eth),
+		}
+		pkt.Data.TrimFront(d.e.hdrSize)
+		d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
+
+		// Prepare d.views for another packet: release used views.
+		for i := 0; i < used; i++ {
+			d.views[k][i] = nil
+		}
+	}
+
+	for k := 0; k < nMsgs; k++ {
+		d.msgHdrs[k].Len = 0
+	}
+
+	return true, nil
+}
diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD
new file mode 100644
index 000000000..6bf3805b7
--- /dev/null
+++ b/pkg/tcpip/link/loopback/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "loopback",
+    srcs = ["loopback.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
new file mode 100644
index 000000000..568c6874f
--- /dev/null
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -0,0 +1,115 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package loopback provides the implementation of loopback data-link layer
+// endpoints. Such endpoints just turn outbound packets into inbound ones.
+//
+// Loopback endpoints can be used in the networking stack by calling New() to
+// create a new endpoint, and then passing it as an argument to
+// Stack.CreateNIC().
+package loopback
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type endpoint struct {
+	dispatcher stack.NetworkDispatcher
+}
+
+// New creates a new loopback endpoint. This link-layer endpoint just turns
+// outbound packets into inbound packets.
+func New() stack.LinkEndpoint {
+	return &endpoint{}
+}
+
+// Attach implements stack.LinkEndpoint.Attach. It just saves the stack network-
+// layer dispatcher for later use when packets need to be dispatched.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *endpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns a constant that matches the
+// linux loopback interface.
+func (*endpoint) MTU() uint32 {
+	return 65536
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises
+// itself as supporting checksum offload, but in reality it's just omitted.
+func (*endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return stack.CapabilityRXChecksumOffload | stack.CapabilityTXChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that the
+// loopback interface doesn't have a header, it just returns 0.
+func (*endpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (*endpoint) LinkAddress() tcpip.LinkAddress {
+	return ""
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*endpoint) Wait() {}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
+// packets to the network-layer dispatcher.
+func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
+	views[0] = pkt.Header.View()
+	views = append(views, pkt.Data.Views()...)
+
+	// Because we're immediately turning around and writing the packet back
+	// to the rx path, we intentionally don't preserve the remote and local
+	// link addresses from the stack.Route we're passed.
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, &stack.PacketBuffer{
+		Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
+	})
+
+	return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	// There should be an ethernet header at the beginning of vv.
+	hdr, ok := vv.PullUp(header.EthernetMinimumSize)
+	if !ok {
+		// Reject the packet if it's shorter than an ethernet header.
+		return tcpip.ErrBadAddress
+	}
+	linkHeader := header.Ethernet(hdr)
+	vv.TrimFront(len(linkHeader))
+	e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), &stack.PacketBuffer{
+		Data:       vv,
+		LinkHeader: buffer.View(linkHeader),
+	})
+
+	return nil
+}
diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD
new file mode 100644
index 000000000..82b441b79
--- /dev/null
+++ b/pkg/tcpip/link/muxed/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "muxed",
+    srcs = ["injectable.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
+    ],
+)
+
+go_test(
+    name = "muxed_test",
+    size = "small",
+    srcs = ["injectable_test.go"],
+    library = ":muxed",
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
new file mode 100644
index 000000000..c69d6b7e9
--- /dev/null
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -0,0 +1,137 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package muxed provides a muxed link endpoint.
+package muxed
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// InjectableEndpoint is an injectable multi endpoint. The endpoint has
+// trivial routing rules that determine which InjectableEndpoint a given packet
+// will be written to. Note that HandleLocal works differently for this
+// endpoint (see WritePacket).
+type InjectableEndpoint struct {
+	routes     map[tcpip.Address]stack.InjectableLinkEndpoint
+	dispatcher stack.NetworkDispatcher
+}
+
+// MTU implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) MTU() uint32 {
+	minMTU := ^uint32(0)
+	for _, endpoint := range m.routes {
+		if endpointMTU := endpoint.MTU(); endpointMTU < minMTU {
+			minMTU = endpointMTU
+		}
+	}
+	return minMTU
+}
+
+// Capabilities implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) Capabilities() stack.LinkEndpointCapabilities {
+	minCapabilities := stack.LinkEndpointCapabilities(^uint(0))
+	for _, endpoint := range m.routes {
+		minCapabilities &= endpoint.Capabilities()
+	}
+	return minCapabilities
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) MaxHeaderLength() uint16 {
+	minHeaderLen := ^uint16(0)
+	for _, endpoint := range m.routes {
+		if headerLen := endpoint.MaxHeaderLength(); headerLen < minHeaderLen {
+			minHeaderLen = headerLen
+		}
+	}
+	return minHeaderLen
+}
+
+// LinkAddress implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) LinkAddress() tcpip.LinkAddress {
+	return ""
+}
+
+// Attach implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	for _, endpoint := range m.routes {
+		endpoint.Attach(dispatcher)
+	}
+	m.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) IsAttached() bool {
+	return m.dispatcher != nil
+}
+
+// InjectInbound implements stack.InjectableLinkEndpoint.
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	m.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
+}
+
+// WritePackets writes outbound packets to the appropriate
+// LinkInjectableEndpoint based on the RemoteAddress. HandleLocal only works if
+// r.RemoteAddress has a route registered in this endpoint.
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	endpoint, ok := m.routes[r.RemoteAddress]
+	if !ok {
+		return 0, tcpip.ErrNoRoute
+	}
+	return endpoint.WritePackets(r, gso, pkts, protocol)
+}
+
+// WritePacket writes outbound packets to the appropriate LinkInjectableEndpoint
+// based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
+// route registered in this endpoint.
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	if endpoint, ok := m.routes[r.RemoteAddress]; ok {
+		return endpoint.WritePacket(r, gso, protocol, pkt)
+	}
+	return tcpip.ErrNoRoute
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (m *InjectableEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+	// WriteRawPacket doesn't get a route or network address, so there's
+	// nowhere to write this.
+	return tcpip.ErrNoRoute
+}
+
+// InjectOutbound writes outbound packets to the appropriate
+// LinkInjectableEndpoint based on the dest address.
+func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
+	endpoint, ok := m.routes[dest]
+	if !ok {
+		return tcpip.ErrNoRoute
+	}
+	return endpoint.InjectOutbound(dest, packet)
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (m *InjectableEndpoint) Wait() {
+	for _, ep := range m.routes {
+		ep.Wait()
+	}
+}
+
+// NewInjectableEndpoint creates a new multi-endpoint injectable endpoint.
+func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) *InjectableEndpoint {
+	return &InjectableEndpoint{
+		routes: routes,
+	}
+}
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
new file mode 100644
index 000000000..0744f66d6
--- /dev/null
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -0,0 +1,98 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package muxed
+
+import (
+	"bytes"
+	"net"
+	"os"
+	"syscall"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+func TestInjectableEndpointRawDispatch(t *testing.T) {
+	endpoint, sock, dstIP := makeTestInjectableEndpoint(t) // sock is the peer of the endpoint's fd
+	if err := endpoint.InjectOutbound(dstIP, []byte{0xFA}); err != nil { // fail here, not at the read
+		t.Fatalf("endpoint.InjectOutbound(%v, _) = %v, want = nil", dstIP, err)
+	}
+	buf := make([]byte, ipv4.MaxTotalSize)
+	bytesRead, err := sock.Read(buf)
+	if err != nil {
+		t.Fatalf("Unable to read from socketpair: %v", err)
+	}
+	if got, want := buf[:bytesRead], []byte{0xFA}; !bytes.Equal(got, want) {
+		t.Fatalf("Read %v from the socketpair, wanted %v", got, want)
+	}
+}
+
+func TestInjectableEndpointDispatch(t *testing.T) {
+	endpoint, sock, dstIP := makeTestInjectableEndpoint(t)
+
+	hdr := buffer.NewPrependable(1)
+	hdr.Prepend(1)[0] = 0xFA
+	packetRoute := stack.Route{RemoteAddress: dstIP}
+	// The header byte is expected on the wire ahead of the data byte.
+	pkt := &stack.PacketBuffer{Header: hdr, Data: buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView()}
+	if err := endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, pkt); err != nil {
+		t.Fatalf("endpoint.WritePacket(...) = %v, want = nil", err)
+	}
+
+	buf := make([]byte, 6500)
+	bytesRead, err := sock.Read(buf)
+	if err != nil {
+		t.Fatalf("Unable to read from socketpair: %v", err)
+	}
+	if got, want := buf[:bytesRead], []byte{0xFA, 0xFB}; !bytes.Equal(got, want) {
+		t.Fatalf("Read %v from the socketpair, wanted %v", got, want)
+	}
+}
+
+func TestInjectableEndpointDispatchHdrOnly(t *testing.T) {
+	endpoint, sock, dstIP := makeTestInjectableEndpoint(t)
+	hdr := buffer.NewPrependable(1)
+	hdr.Prepend(1)[0] = 0xFA
+	packetRoute := stack.Route{RemoteAddress: dstIP}
+	pkt := &stack.PacketBuffer{Header: hdr, Data: buffer.NewView(0).ToVectorisedView()} // empty payload
+	if err := endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, pkt); err != nil {
+		t.Fatalf("endpoint.WritePacket(...) = %v, want = nil", err)
+	}
+	buf := make([]byte, 6500)
+	bytesRead, err := sock.Read(buf)
+	if err != nil {
+		t.Fatalf("Unable to read from socketpair: %v", err)
+	}
+	if got, want := buf[:bytesRead], []byte{0xFA}; !bytes.Equal(got, want) {
+		t.Fatalf("Read %v from the socketpair, wanted %v", got, want)
+	}
+}
+
+// makeTestInjectableEndpoint routes 1.2.3.4 to one end of a socketpair and returns the endpoint, the other end and the address.
+func makeTestInjectableEndpoint(t *testing.T) (*InjectableEndpoint, *os.File, tcpip.Address) {
+	fds, err := syscall.Socketpair(syscall.AF_UNIX,
+		syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC|syscall.SOCK_NONBLOCK, 0)
+	if err != nil {
+		t.Fatal("Failed to create socket pair:", err)
+	}
+	dstIP := tcpip.Address(net.ParseIP("1.2.3.4").To4())
+	lower := fdbased.NewInjectable(fds[1], 6500, stack.CapabilityNone)
+	ep := NewInjectableEndpoint(map[tcpip.Address]stack.InjectableLinkEndpoint{dstIP: lower})
+	return ep, os.NewFile(uintptr(fds[0]), "test route end"), dstIP
+}
diff --git a/pkg/tcpip/link/nested/BUILD b/pkg/tcpip/link/nested/BUILD
new file mode 100644
index 000000000..bdd5276ad
--- /dev/null
+++ b/pkg/tcpip/link/nested/BUILD
@@ -0,0 +1,31 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "nested",
+    srcs = [
+        "nested.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
+    ],
+)
+
+go_test(
+    name = "nested_test",
+    size = "small",
+    srcs = [
+        "nested_test.go",
+    ],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/nested",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/nested/nested.go b/pkg/tcpip/link/nested/nested.go
new file mode 100644
index 000000000..2998f9c4f
--- /dev/null
+++ b/pkg/tcpip/link/nested/nested.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package nested provides helpers to implement the pattern of nested
+// stack.LinkEndpoints.
+package nested
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// Endpoint is a wrapper around stack.LinkEndpoint and stack.NetworkDispatcher
+// that can be used to implement nesting safely by providing lifecycle
+// concurrency guards.
+//
+// See the tests in this package for example usage.
+type Endpoint struct {
+	child    stack.LinkEndpoint      // wrapped endpoint; set by Init
+	embedder stack.NetworkDispatcher // given to child on Attach; set by Init
+
+	// mu protects dispatcher.
+	mu         sync.RWMutex
+	dispatcher stack.NetworkDispatcher // set by Attach; nil while detached
+}
+
+var _ stack.GSOEndpoint = (*Endpoint)(nil)
+var _ stack.LinkEndpoint = (*Endpoint)(nil)
+var _ stack.NetworkDispatcher = (*Endpoint)(nil)
+
+// Init initializes a nested.Endpoint that uses embedder as the dispatcher for
+// child on Attach. It only records both values; child is not attached here.
+//
+// See the tests in this package for example usage.
+func (e *Endpoint) Init(child stack.LinkEndpoint, embedder stack.NetworkDispatcher) {
+	e.child = child
+	e.embedder = embedder
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.
+func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.mu.RLock()
+	d := e.dispatcher // snapshot, so the packet is delivered without holding mu
+	e.mu.RUnlock()
+	if d != nil { // nil while detached: the packet is dropped
+		d.DeliverNetworkPacket(remote, local, protocol, pkt)
+	}
+}
+
+// Attach implements stack.LinkEndpoint.
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.mu.Lock()
+	e.dispatcher = dispatcher
+	e.mu.Unlock()
+	// If we're attaching to a valid dispatcher, pass embedder as the dispatcher
+	// to our child, otherwise detach the child by giving it a nil dispatcher.
+	var pass stack.NetworkDispatcher
+	if dispatcher != nil {
+		pass = e.embedder
+	}
+	e.child.Attach(pass)
+}
+
+// IsAttached implements stack.LinkEndpoint. It reports whether the last call
+// to Attach supplied a non-nil dispatcher.
+func (e *Endpoint) IsAttached() bool {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint by returning the child endpoint's MTU.
+func (e *Endpoint) MTU() uint32 {
+	return e.child.MTU()
+}
+
+// Capabilities implements stack.LinkEndpoint by returning the child's capabilities.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.child.Capabilities()
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint by returning the child's value.
+func (e *Endpoint) MaxHeaderLength() uint16 {
+	return e.child.MaxHeaderLength()
+}
+
+// LinkAddress implements stack.LinkEndpoint by returning the child's link address.
+func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.child.LinkAddress()
+}
+
+// WritePacket implements stack.LinkEndpoint by forwarding the call to the child.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	return e.child.WritePacket(r, gso, protocol, pkt)
+}
+
+// WritePackets implements stack.LinkEndpoint by forwarding the call to the child.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	return e.child.WritePackets(r, gso, pkts, protocol)
+}
+
+// WriteRawPacket implements stack.LinkEndpoint by forwarding vv to the child.
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	return e.child.WriteRawPacket(vv)
+}
+
+// Wait implements stack.LinkEndpoint by waiting on the child endpoint.
+func (e *Endpoint) Wait() {
+	e.child.Wait()
+}
+
+// GSOMaxSize implements stack.GSOEndpoint; it is 0 unless child implements it too.
+func (e *Endpoint) GSOMaxSize() uint32 {
+	if gso, isGSO := e.child.(stack.GSOEndpoint); isGSO {
+		return gso.GSOMaxSize()
+	}
+	return 0
+}
diff --git a/pkg/tcpip/link/nested/nested_test.go b/pkg/tcpip/link/nested/nested_test.go
new file mode 100644
index 000000000..c1a219f02
--- /dev/null
+++ b/pkg/tcpip/link/nested/nested_test.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nested_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/nested"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type parentEndpoint struct {
+	nested.Endpoint // supplies the LinkEndpoint and NetworkDispatcher methods
+}
+
+var _ stack.LinkEndpoint = (*parentEndpoint)(nil)
+var _ stack.NetworkDispatcher = (*parentEndpoint)(nil)
+
+type childEndpoint struct {
+	stack.LinkEndpoint
+	dispatcher stack.NetworkDispatcher // last dispatcher given to Attach; nil when detached
+}
+
+var _ stack.LinkEndpoint = (*childEndpoint)(nil)
+
+func (c *childEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	c.dispatcher = dispatcher // only recorded, so that IsAttached can report it
+}
+
+func (c *childEndpoint) IsAttached() bool {
+	return c.dispatcher != nil // attached means Attach last received a non-nil dispatcher
+}
+
+type counterDispatcher struct {
+	count int // number of DeliverNetworkPacket calls received
+}
+
+var _ stack.NetworkDispatcher = (*counterDispatcher)(nil)
+
+func (d *counterDispatcher) DeliverNetworkPacket(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) {
+	d.count++ // the packet itself is ignored; only the call is counted
+}
+
+// TestNestedLinkEndpoint checks attach state and packet delivery across attach and detach.
+func TestNestedLinkEndpoint(t *testing.T) {
+	const emptyAddress = tcpip.LinkAddress("")
+
+	var (
+		childEP  childEndpoint
+		nestedEP parentEndpoint
+		disp     counterDispatcher
+	)
+	nestedEP.Endpoint.Init(&childEP, &nestedEP) // nestedEP acts as its own embedder
+
+	if childEP.IsAttached() {
+		t.Error("On init, childEP.IsAttached() = true, want = false")
+	}
+	if nestedEP.IsAttached() {
+		t.Error("On init, nestedEP.IsAttached() = true, want = false")
+	}
+
+	nestedEP.Attach(&disp)
+	if disp.count != 0 {
+		t.Fatalf("After attach, got disp.count = %d, want = 0", disp.count)
+	}
+	if !childEP.IsAttached() {
+		t.Error("After attach, childEP.IsAttached() = false, want = true")
+	}
+	if !nestedEP.IsAttached() {
+		t.Error("After attach, nestedEP.IsAttached() = false, want = true")
+	}
+
+	nestedEP.DeliverNetworkPacket(emptyAddress, emptyAddress, header.IPv4ProtocolNumber, &stack.PacketBuffer{})
+	if disp.count != 1 {
+		t.Errorf("After first packet with dispatcher attached, got disp.count = %d, want = 1", disp.count)
+	}
+
+	nestedEP.Attach(nil) // a nil dispatcher detaches both the parent and the child
+	if childEP.IsAttached() {
+		t.Error("After detach, childEP.IsAttached() = true, want = false")
+	}
+	if nestedEP.IsAttached() {
+		t.Error("After detach, nestedEP.IsAttached() = true, want = false")
+	}
+
+	disp.count = 0
+	nestedEP.DeliverNetworkPacket(emptyAddress, emptyAddress, header.IPv4ProtocolNumber, &stack.PacketBuffer{})
+	if disp.count != 0 {
+		t.Errorf("After second packet with dispatcher detached, got disp.count = %d, want = 0", disp.count)
+	}
+}
diff --git a/pkg/tcpip/link/qdisc/fifo/BUILD b/pkg/tcpip/link/qdisc/fifo/BUILD
new file mode 100644
index 000000000..054c213bc
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/BUILD
@@ -0,0 +1,19 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "fifo",
+    srcs = [
+        "endpoint.go",
+        "packet_buffer_queue.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go
new file mode 100644
index 000000000..b5dfb7850
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go
@@ -0,0 +1,209 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fifo provides the implementation of data-link layer endpoints that
+// wrap another endpoint and queues all outbound packets and asynchronously
+// dispatches them to the lower endpoint.
+package fifo
+
+import (
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// endpoint represents a LinkEndpoint which implements a FIFO queue for all
+// outgoing packets. endpoint can have 1 or more underlying queueDispatchers.
+// All outgoing packets are consistently hashed to a single underlying queue
+// using the PacketBuffer.Hash if set, otherwise all packets are queued to the
+// first queue to avoid reordering in case of missing hash.
+type endpoint struct {
+	dispatcher  stack.NetworkDispatcher // set by Attach; receives inbound packets
+	lower       stack.LinkEndpoint      // wrapped endpoint that batches are written to
+	wg          sync.WaitGroup          // tracks the dispatchLoop goroutines started by New
+	dispatchers []*queueDispatcher      // one per queue; selected by pkt.Hash modulo len
+}
+
+// queueDispatcher is responsible for dispatching all outbound packets in its
+// queue. It will also smartly batch packets when possible and write them
+// through the lower LinkEndpoint.
+type queueDispatcher struct {
+	lower          stack.LinkEndpoint // endpoint that batches are written to
+	q              *packetBufferQueue // bounded queue of pending packets
+	newPacketWaker sleep.Waker        // asserted after a packet has been enqueued
+	closeWaker     sleep.Waker        // asserted by endpoint.Wait to end dispatchLoop
+}
+
+// New creates a new fifo link endpoint with n queues, each with a maximum
+// capacity of queueLen packets, and starts one dispatch goroutine per queue.
+func New(lower stack.LinkEndpoint, n int, queueLen int) stack.LinkEndpoint {
+	e := &endpoint{
+		lower: lower,
+	}
+	// NOTE(review): with n <= 0 no dispatcher exists and WritePacket's modulo by len(e.dispatchers) would panic.
+	for i := 0; i < n; i++ {
+		qd := &queueDispatcher{
+			q:     &packetBufferQueue{limit: queueLen},
+			lower: lower,
+		}
+		e.dispatchers = append(e.dispatchers, qd)
+		e.wg.Add(1)
+		go func() {
+			defer e.wg.Done()
+			qd.dispatchLoop()
+		}()
+	}
+	return e
+}
+
+// dispatchLoop writes queued packets to q.lower in batches until closeWaker fires.
+func (q *queueDispatcher) dispatchLoop() {
+	const newPacketWakerID = 1
+	const closeWakerID = 2
+	s := sleep.Sleeper{}
+	s.AddWaker(&q.newPacketWaker, newPacketWakerID)
+	s.AddWaker(&q.closeWaker, closeWakerID)
+	defer s.Done()
+	const batchSize = 32
+	var batch stack.PacketBufferList
+	for {
+		id, ok := s.Fetch(true)
+		if ok && id == closeWakerID {
+			return
+		}
+		for pkt := q.q.dequeue(); pkt != nil; pkt = q.q.dequeue() {
+			batch.PushBack(pkt)
+			if batch.Len() < batchSize && !q.q.empty() {
+				continue
+			}
+			// Protocol is zero because each packet carries its own NetworkProtocol.
+			q.lower.WritePackets(nil /* route */, nil /* gso */, batch, 0 /* protocol */)
+			// Always re-read the front: Remove may unlink pkt, so pkt.Next() is unsafe.
+			for pkt := batch.Front(); pkt != nil; pkt = batch.Front() {
+				pkt.EgressRoute.Release()
+				batch.Remove(pkt)
+			}
+			batch.Reset()
+		}
+	}
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket.
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt) // NOTE(review): no nil check on e.dispatcher
+}
+
+// Attach implements stack.LinkEndpoint.Attach.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher
+	e.lower.Attach(e) // NOTE(review): lower is attached to e even when dispatcher is nil
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached; true once a dispatcher is set.
+func (e *endpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU by returning the lower endpoint's MTU.
+func (e *endpoint) MTU() uint32 {
+	return e.lower.MTU()
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities via the lower endpoint.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.lower.Capabilities()
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength via the lower endpoint.
+func (e *endpoint) MaxHeaderLength() uint16 {
+	return e.lower.MaxHeaderLength()
+}
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress via the lower endpoint.
+func (e *endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.lower.LinkAddress()
+}
+
+// GSOMaxSize returns the lower endpoint's maximum GSO packet size, or 0 if it has none.
+func (e *endpoint) GSOMaxSize() uint32 {
+	if gso, ok := e.lower.(stack.GSOEndpoint); ok {
+		return gso.GSOMaxSize()
+	}
+	return 0 // lower does not implement stack.GSOEndpoint
+}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	newRoute := r.Clone() // callers leave EgressRoute, GSOOptions and NetworkProtocolNumber unset
+	pkt.EgressRoute = &newRoute
+	pkt.GSOOptions = gso
+	pkt.NetworkProtocolNumber = protocol
+	d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
+	if !d.q.enqueue(pkt) {
+		newRoute.Release() // not queued, so dispatchLoop will never release the clone
+		pkt.EgressRoute = nil
+		return tcpip.ErrNoBufferSpace
+	}
+	d.newPacketWaker.Assert()
+	return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+//
+// Being a batch API, each packet in pkts should already have pkt.EgressRoute,
+// pkt.GSOOptions and pkt.NetworkProtocolNumber populated.
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	enqueued := 0
+	for pkt := pkts.Front(); pkt != nil; {
+		d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
+		nxt := pkt.Next()
+		// Since qdisc can hold onto a packet for long we should Clone
+		// the route here to ensure it doesn't get released while the
+		// packet is still in our queue.
+		callerRoute := pkt.EgressRoute
+		newRoute := callerRoute.Clone()
+		pkt.EgressRoute = &newRoute
+		if !d.q.enqueue(pkt) {
+			newRoute.Release() // not queued: drop the clone and hand back the caller's route
+			pkt.EgressRoute = callerRoute
+			if enqueued > 0 {
+				d.newPacketWaker.Assert()
+			}
+			return enqueued, tcpip.ErrNoBufferSpace
+		}
+		pkt = nxt
+		enqueued++
+		d.newPacketWaker.Assert()
+	}
+	return enqueued, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket; vv bypasses the queues.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	return e.lower.WriteRawPacket(vv)
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (e *endpoint) Wait() {
+	e.lower.Wait()
+
+	// The linkEP is gone. Teardown the outbound dispatcher goroutines.
+	for i := range e.dispatchers {
+		e.dispatchers[i].closeWaker.Assert()
+	}
+
+	e.wg.Wait() // returns once every dispatchLoop goroutine started by New has exited
+}
diff --git a/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go b/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go
new file mode 100644
index 000000000..eb5abb906
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fifo
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// packetBufferQueue is a bounded, thread-safe queue of PacketBuffers.
+// The methods below access list, limit and used only while holding mu.
+type packetBufferQueue struct {
+	mu    sync.Mutex
+	list  stack.PacketBufferList // queued packets, oldest at the front
+	limit int                    // maximum number of queued packets
+	used  int                    // current number of queued packets
+}
+
+// emptyLocked determines if the queue is empty, i.e. used is zero.
+// Preconditions: q.mu must be held.
+func (q *packetBufferQueue) emptyLocked() bool {
+	return q.used == 0
+}
+
+// empty reports whether the queue currently holds no packets. It acquires
+// q.mu for the duration of the check, so the answer can already be stale by
+// the time the caller looks at it.
+func (q *packetBufferQueue) empty() bool {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	return q.emptyLocked()
+}
+
+// setLimit replaces the capacity limit under q.mu. Packets already queued are
+// kept even if the queue now holds more than the new limit allows.
+func (q *packetBufferQueue) setLimit(limit int) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	q.limit = limit
+}
+
+// enqueue adds the given packet to the back of the queue.
+//
+// It returns true when the PacketBuffer was appended, in which case ownership
+// of the reference is transferred to the queue. It returns false, leaving the
+// queue untouched, when used has already reached limit; ownership then stays
+// with the caller. The capacity check and the append both happen while q.mu
+// is held.
+func (q *packetBufferQueue) enqueue(s *stack.PacketBuffer) bool {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if q.used >= q.limit {
+		return false
+	}
+	q.list.PushBack(s)
+	q.used++
+	return true
+}
+
+// dequeue unlinks the PacketBuffer at the front of the queue and returns it,
+// transferring ownership to the caller. It returns nil when the queue is empty.
+func (q *packetBufferQueue) dequeue() *stack.PacketBuffer {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	head := q.list.Front()
+	if head == nil {
+		return nil
+	}
+	q.list.Remove(head)
+	q.used--
+	return head
+}
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
new file mode 100644
index 000000000..14b527bc2
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "rawfile",
+    srcs = [
+        "blockingpoll_amd64.s",
+        "blockingpoll_arm64.s",
+        "blockingpoll_noyield_unsafe.go",
+        "blockingpoll_yield_unsafe.go",
+        "errors.go",
+        "rawfile_unsafe.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
new file mode 100644
index 000000000..298bad55d
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// BlockingPoll makes the ppoll() syscall while calling the version of
+// entersyscall that relinquishes the P so that other Gs can run. This is meant
+// to be called in cases when the syscall is expected to block.
+//
+// func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno)
+TEXT ·BlockingPoll(SB),NOSPLIT,$0-40
+	CALL	·callEntersyscallblock(SB)
+	MOVQ	fds+0(FP), DI
+	MOVQ	nfds+8(FP), SI
+	MOVQ	timeout+16(FP), DX
+	MOVQ	$0x0, R10  // sigmask parameter which isn't used here
+	MOVQ	$0x10f, AX // SYS_PPOLL
+	SYSCALL
+	CMPQ	AX, $0xfffffffffffff001 // unsigned compare of the result with -4095
+	JLS	ok // lower or same: the result is returned as n
+	MOVQ	$-1, n+24(FP) // error path: n = -1
+	NEGQ	AX // negate the result to obtain the errno
+	MOVQ	AX, err+32(FP)
+	CALL	·callExitsyscall(SB)
+	RET
+ok:
+	MOVQ	AX, n+24(FP)
+	MOVQ	$0, err+32(FP) // success path: err = 0
+	CALL	·callExitsyscall(SB)
+	RET
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_arm64.s b/pkg/tcpip/link/rawfile/blockingpoll_arm64.s
new file mode 100644
index 000000000..b62888b93
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_arm64.s
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// BlockingPoll makes the ppoll() syscall while calling the version of
+// entersyscall that relinquishes the P so that other Gs can run. This is meant
+// to be called in cases when the syscall is expected to block.
+//
+// func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno)
+TEXT ·BlockingPoll(SB),NOSPLIT,$0-40
+	BL	·callEntersyscallblock(SB)
+	MOVD	fds+0(FP), R0
+	MOVD	nfds+8(FP), R1
+	MOVD	timeout+16(FP), R2
+	MOVD	$0x0, R3  // sigmask parameter which isn't used here
+	MOVD	$0x49, R8 // SYS_PPOLL
+	SVC
+	CMP	$0xfffffffffffff001, R0 // unsigned compare of the result with -4095
+	BLS	ok // lower or same: the result is returned as n
+	MOVD	$-1, R1 // error path: n = -1
+	MOVD	R1, n+24(FP)
+	NEG	R0, R0 // negate the result to obtain the errno
+	MOVD	R0, err+32(FP)
+	BL	·callExitsyscall(SB)
+	RET
+ok:
+	MOVD	R0, n+24(FP)
+	MOVD	$0, err+32(FP) // success path: err = 0
+	BL	·callExitsyscall(SB)
+	RET
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go
new file mode 100644
index 000000000..621ab8d29
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,!amd64,!arm64
+
+package rawfile
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// BlockingPoll is just a stub function that forwards to the ppoll() system call
+// on non-amd64 and non-arm64 platforms. It returns the raw result and errno.
+func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno) {
+	n, _, e := syscall.Syscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(fds)),
+		uintptr(nfds), uintptr(unsafe.Pointer(timeout)), 0, 0, 0) // remaining arguments are passed as zero
+
+	return int(n), e
+}
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
new file mode 100644
index 000000000..99313ee25
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64 linux,arm64
+// +build go1.12
+// +build !go1.16
+
+// Check go:linkname function signatures when updating Go version.
+
+package rawfile
+
+import (
+	"syscall"
+	_ "unsafe" // for go:linkname
+)
+
+// BlockingPoll on amd64/arm64 makes the ppoll() syscall while calling the
+// version of entersyscall that relinquishes the P so that other Gs can
+// run. This is meant to be called in cases when the syscall is expected to
+// block. On non amd64/arm64 platforms it just forwards to the ppoll() system
+// call. This declaration has no Go body; it is implemented in assembly.
+//
+//go:noescape
+func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno)
+
+// Use go:linkname to call into the runtime. As of Go 1.12 this has to
+// be done from Go code so that we make an ABIInternal call to an
+// ABIInternal function; see https://golang.org/issue/27539.
+
+// We need to call both entersyscallblock and exitsyscall this way so
+// that the runtime's check on the stack pointer lines up.
+
+// Note that calling an unexported function in the runtime package is
+// unsafe and this hack is likely to break in future Go releases.
+
+//go:linkname entersyscallblock runtime.entersyscallblock
+func entersyscallblock() // bodyless: linked to runtime.entersyscallblock by the directive above
+
+//go:linkname exitsyscall runtime.exitsyscall
+func exitsyscall() // bodyless: linked to runtime.exitsyscall by the directive above
+
+// These forwarding functions must be nosplit because 1) we must
+// disallow preemption between entersyscallblock and exitsyscall, and
+// 2) we have an untyped assembly frame on the stack which can not be
+// grown or moved.
+
+//go:nosplit
+func callEntersyscallblock() {
+	entersyscallblock() // the linkname'd runtime.entersyscallblock
+}
+
+//go:nosplit
+func callExitsyscall() {
+	exitsyscall() // the linkname'd runtime.exitsyscall
+}
diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go
new file mode 100644
index 000000000..a0a873c84
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/errors.go
@@ -0,0 +1,70 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package rawfile
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const maxErrno = 134 // length of translations; errnos at or above it cannot be indexed
+
+var translations [maxErrno]*tcpip.Error // indexed by host errno; nil means no translation registered
+
+// TranslateErrno translates an errno from the syscall package into a
+// *tcpip.Error. Errnos below maxErrno without a registered translation map to
+// tcpip.ErrInvalidEndpointState (EINVAL); errnos outside the translations
+// table make the lookup panic.
+func TranslateErrno(e syscall.Errno) *tcpip.Error {
+	err := translations[e]
+	if err == nil {
+		err = tcpip.ErrInvalidEndpointState
+	}
+	return err
+}
+
+func addTranslation(host syscall.Errno, trans *tcpip.Error) {
+	if translations[host] != nil { // each host errno may be registered only once
+		panic(fmt.Sprintf("duplicate translation for host errno %q (%d)", host.Error(), host))
+	}
+	translations[host] = trans
+}
+
+func init() { // fills translations; addTranslation panics if an errno is registered twice
+	addTranslation(syscall.EEXIST, tcpip.ErrDuplicateAddress)
+	addTranslation(syscall.ENETUNREACH, tcpip.ErrNoRoute)
+	addTranslation(syscall.EINVAL, tcpip.ErrInvalidEndpointState) // also TranslateErrno's fallback value
+	addTranslation(syscall.EALREADY, tcpip.ErrAlreadyConnecting)
+	addTranslation(syscall.EISCONN, tcpip.ErrAlreadyConnected)
+	addTranslation(syscall.EADDRINUSE, tcpip.ErrPortInUse)
+	addTranslation(syscall.EADDRNOTAVAIL, tcpip.ErrBadLocalAddress)
+	addTranslation(syscall.EPIPE, tcpip.ErrClosedForSend)
+	addTranslation(syscall.EWOULDBLOCK, tcpip.ErrWouldBlock)
+	addTranslation(syscall.ECONNREFUSED, tcpip.ErrConnectionRefused)
+	addTranslation(syscall.ETIMEDOUT, tcpip.ErrTimeout)
+	addTranslation(syscall.EINPROGRESS, tcpip.ErrConnectStarted)
+	addTranslation(syscall.EDESTADDRREQ, tcpip.ErrDestinationRequired)
+	addTranslation(syscall.ENOTSUP, tcpip.ErrNotSupported)
+	addTranslation(syscall.ENOTTY, tcpip.ErrQueueSizeNotSupported)
+	addTranslation(syscall.ENOTCONN, tcpip.ErrNotConnected)
+	addTranslation(syscall.ECONNRESET, tcpip.ErrConnectionReset)
+	addTranslation(syscall.ECONNABORTED, tcpip.ErrConnectionAborted)
+	addTranslation(syscall.EMSGSIZE, tcpip.ErrMessageTooLong)
+	addTranslation(syscall.ENOBUFS, tcpip.ErrNoBufferSpace)
+}
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
new file mode 100644
index 000000000..69de6eb3e
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -0,0 +1,192 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package rawfile contains utilities for using the netstack with raw host
+// files on Linux hosts.
+package rawfile
+
+import (
+	"syscall"
+	"unsafe"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// GetMTU determines the MTU of a network interface device.
+//
+// It opens a temporary AF_UNIX datagram socket, issues a SIOCGIFMTU ioctl on
+// it for the interface called name, and closes the socket again before
+// returning. Only as much of name as fits in the 16-byte request field is
+// passed on. Socket creation failures and ioctl errnos are returned as-is.
+func GetMTU(name string) (uint32, error) {
+	sock, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0)
+	if err != nil {
+		return 0, err
+	}
+	defer syscall.Close(sock)
+
+	// Request buffer handed to the ioctl: the interface name, the MTU
+	// slot, and trailing padding.
+	var req struct {
+		name [16]byte
+		mtu  int32
+		_    [20]byte
+	}
+	copy(req.name[:], name)
+
+	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(sock), syscall.SIOCGIFMTU, uintptr(unsafe.Pointer(&req))); errno != 0 {
+		return 0, errno
+	}
+	return uint32(req.mtu), nil
+}
+
+// NonBlockingWrite writes the given buffer to a file descriptor with a single
+// write syscall. A non-zero errno is translated and returned; otherwise nil is
+// returned. The byte count reported by the kernel is not inspected here.
+func NonBlockingWrite(fd int, buf []byte) *tcpip.Error {
+	// An empty buffer has no first element to take the address of, so it
+	// is passed to the kernel as a nil pointer with zero length.
+	var data unsafe.Pointer
+	if len(buf) != 0 {
+		data = unsafe.Pointer(&buf[0])
+	}
+
+	if _, _, errno := syscall.RawSyscall(syscall.SYS_WRITE, uintptr(fd), uintptr(data), uintptr(len(buf))); errno != 0 {
+		return TranslateErrno(errno)
+	}
+	return nil
+}
+
+// NonBlockingWrite3 writes up to three byte slices to a file descriptor in a
+// single syscall. It fails if partial data is written.
+func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error {
+	// If there is no second and third buffer, issue a regular write.
+	if len(b2) == 0 && len(b3) == 0 {
+		return NonBlockingWrite(fd, b1)
+	}
+
+	// Build the iovec that represents them and issue a writev syscall.
+	iovec := [3]syscall.Iovec{
+		{
+			Base: &b1[0],
+			Len:  uint64(len(b1)),
+		},
+		{
+			Base: &b2[0],
+			Len:  uint64(len(b2)),
+		},
+	}
+	iovecLen := uintptr(2)
+
+	if len(b3) > 0 {
+		iovecLen++
+		iovec[2].Base = &b3[0]
+		iovec[2].Len = uint64(len(b3))
+	}
+
+	_, _, e := syscall.RawSyscall(syscall.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&iovec[0])), iovecLen)
+	if e != 0 {
+		return TranslateErrno(e)
+	}
+
+	return nil
+}
+
+// NonBlockingSendMMsg sends multiple messages on a socket.
+func NonBlockingSendMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
+	n, _, e := syscall.RawSyscall6(unix.SYS_SENDMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0)
+	if e != 0 {
+		return 0, TranslateErrno(e)
+	}
+
+	return int(n), nil
+}
+
+// PollEvent represents the pollfd structure passed to a poll() system call.
+// The field order and widths are part of that layout and must not change.
+type PollEvent struct {
+	// FD is the file descriptor to poll.
+	FD int32
+	// Events is the mask of requested events (callers in this file pass 1,
+	// i.e. POLLIN).
+	Events int16
+	// Revents is the slot for the returned events of the pollfd layout.
+	Revents int16
+}
+
+// BlockingRead reads from a file descriptor that is set up as non-blocking. If
+// no data is available, it will block in a poll() syscall until the file
+// descriptor becomes readable.
+func BlockingRead(fd int, b []byte) (int, *tcpip.Error) {
+	for {
+		n, _, e := syscall.RawSyscall(syscall.SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)))
+		if e == 0 {
+			return int(n), nil
+		}
+
+		event := PollEvent{
+			FD:     int32(fd),
+			Events: 1, // POLLIN
+		}
+
+		_, e = BlockingPoll(&event, 1, nil)
+		if e != 0 && e != syscall.EINTR {
+			return 0, TranslateErrno(e)
+		}
+	}
+}
+
+// BlockingReadv reads from a file descriptor that is set up as non-blocking and
+// stores the data in a list of iovecs buffers. If no data is available, it will
+// block in a poll() syscall until the file descriptor becomes readable.
+func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) {
+	for {
+		n, _, e := syscall.RawSyscall(syscall.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs)))
+		if e == 0 {
+			return int(n), nil
+		}
+
+		event := PollEvent{
+			FD:     int32(fd),
+			Events: 1, // POLLIN
+		}
+
+		_, e = BlockingPoll(&event, 1, nil)
+		if e != 0 && e != syscall.EINTR {
+			return 0, TranslateErrno(e)
+		}
+	}
+}
+
+// MMsgHdr represents the mmsghdr structure required by recvmmsg() on linux.
+// Slices of it are handed directly to the sendmmsg/recvmmsg syscalls in this
+// file, so the field layout must not change.
+type MMsgHdr struct {
+	// Msg is the message header describing one message.
+	Msg syscall.Msghdr
+	// Len is the per-message length field of the structure.
+	Len uint32
+	// Trailing padding; presumably keeps the element size a multiple of 8
+	// bytes so consecutive slice entries stay aligned -- confirm.
+	_ [4]byte
+}
+
+// BlockingRecvMMsg reads from a file descriptor that is set up as non-blocking
+// and stores the received messages in a slice of MMsgHdr structures. If no data
+// is available, it will block in a poll() syscall until the file descriptor
+// becomes readable.
+func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
+	for {
+		n, _, e := syscall.RawSyscall6(syscall.SYS_RECVMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0)
+		if e == 0 {
+			return int(n), nil
+		}
+
+		event := PollEvent{
+			FD:     int32(fd),
+			Events: 1, // POLLIN
+		}
+
+		if _, e := BlockingPoll(&event, 1, nil); e != 0 && e != syscall.EINTR {
+			return 0, TranslateErrno(e)
+		}
+	}
+}
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
new file mode 100644
index 000000000..13243ebbb
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -0,0 +1,41 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+# Library target for the sharedmem package, built from the four listed Go
+# sources and visible to every package.
+go_library(
+    name = "sharedmem",
+    srcs = [
+        "rx.go",
+        "sharedmem.go",
+        "sharedmem_unsafe.go",
+        "tx.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/rawfile",
+        "//pkg/tcpip/link/sharedmem/queue",
+        "//pkg/tcpip/stack",
+    ],
+)
+
+# Tests compiled together with the :sharedmem library sources.
+go_test(
+    name = "sharedmem_test",
+    srcs = [
+        "sharedmem_test.go",
+    ],
+    library = ":sharedmem",
+    deps = [
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/sharedmem/pipe",
+        "//pkg/tcpip/link/sharedmem/queue",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
new file mode 100644
index 000000000..87020ec08
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+# Library target for the pipe package (shared memory ring buffer); it lists no
+# deps and is visible to every package.
+go_library(
+    name = "pipe",
+    srcs = [
+        "pipe.go",
+        "pipe_unsafe.go",
+        "rx.go",
+        "tx.go",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+# Tests compiled together with the :pipe library sources.
+go_test(
+    name = "pipe_test",
+    srcs = [
+        "pipe_test.go",
+    ],
+    library = ":pipe",
+    deps = ["//pkg/sync"],
+)
+)
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe.go b/pkg/tcpip/link/sharedmem/pipe/pipe.go
new file mode 100644
index 000000000..74c9f0311
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe.go
@@ -0,0 +1,78 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipe implements a shared memory ring buffer on which a single reader
+// and a single writer can operate (read/write) concurrently. The ring buffer
+// allows for data of different sizes to be written, and preserves the boundary
+// of the written data.
+//
+// Example usage is as follows:
+//
+//	wb := t.Push(20)
+//	// Write data to wb.
+//	t.Flush()
+//
+//	rb := r.Pull()
+//	// Do something with data in rb.
+//	r.Flush()
+package pipe
+
+import (
+	"math"
+)
+
+const (
+	// jump is the amount added to an index to advance it by one
+	// revolution: it is the value of the lowest bit above the 32-bit
+	// offset field.
+	jump uint64 = math.MaxUint32 + 1
+	// offsetMask selects the low 32 bits of an index (the byte offset).
+	offsetMask uint64 = math.MaxUint32
+	// revolutionMask selects the high 32 bits of an index (the revolution).
+	revolutionMask uint64 = ^offsetMask
+
+	// sizeOfSlotHeader is the size in bytes of the header that precedes
+	// every slot's payload.
+	sizeOfSlotHeader = 8 // sizeof(uint64)
+	// slotFree is the header bit (bit 63) that marks a slot as free.
+	slotFree uint64 = 1 << 63
+	// slotSizeMask selects the payload size stored in the low 32 bits of a
+	// slot header.
+	slotSizeMask uint64 = math.MaxUint32
+)
+
+// payloadToSlotSize calculates the total size of a slot based on its payload
+// size. The  total size is the header size, plus the payload size, plus padding
+// if necessary to make the total size a multiple of sizeOfSlotHeader.
+func payloadToSlotSize(payloadSize uint64) uint64 {
+	s := sizeOfSlotHeader + payloadSize
+	return (s + sizeOfSlotHeader - 1) &^ (sizeOfSlotHeader - 1)
+}
+
+// slotToPayloadSize calculates the payload size of a slot based on the total
+// size of the slot. This is only meant to be used when creating slots that
+// don't carry information (e.g., free slots or wrap slots).
+//
+// Despite its name, the offset parameter is the total slot size in bytes; the
+// result is that size minus the slot header.
+func slotToPayloadSize(offset uint64) uint64 {
+	return offset - sizeOfSlotHeader
+}
+
+// pipe is a basic data structure used by both (transmit & receive) ends of a
+// pipe. Indices into this pipe are split into two fields: offset, which counts
+// the number of bytes from the beginning of the buffer, and revolution, which
+// counts the number of times the index has wrapped around.
+type pipe struct {
+	// buffer is the byte slice backing the ring. init trims it so that its
+	// length is a multiple of sizeOfSlotHeader.
+	buffer []byte
+}
+
+// init adopts b as the pipe's backing buffer, dropping any trailing bytes
+// beyond the largest multiple of sizeOfSlotHeader that fits in len(b).
+func (p *pipe) init(b []byte) {
+	usable := len(b) &^ (sizeOfSlotHeader - 1)
+	p.buffer = b[:usable]
+}
+
+// data returns the payload area of the slot located at idx: size bytes that
+// begin right after the slot header. Only the offset part of idx is used, so
+// idx may carry revolution information.
+func (p *pipe) data(idx uint64, size uint64) []byte {
+	start := (idx & offsetMask) + sizeOfSlotHeader
+	payload := p.buffer[start:]
+	return payload[:size]
+}
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
new file mode 100644
index 000000000..dc239a0d0
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
@@ -0,0 +1,518 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+	"math/rand"
+	"reflect"
+	"runtime"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+func TestSimpleReadWrite(t *testing.T) {
+	// A single flushed write must come out of the rx side unchanged. Both
+	// generators share a seed, so they produce the same byte sequence.
+	writeRand := rand.New(rand.NewSource(99))
+	readRand := rand.New(rand.NewSource(99))
+
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	out := tx.Push(10)
+	if out == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	for i := 0; i < len(out); i++ {
+		out[i] = byte(writeRand.Intn(256))
+	}
+	tx.Flush()
+
+	var rx Rx
+	rx.Init(buf)
+	in := rx.Pull()
+	if got := len(in); got != 10 {
+		t.Fatalf("Bad buffer size returned: got %v, want %v", got, 10)
+	}
+
+	for i, got := range in {
+		if want := byte(readRand.Intn(256)); want != got {
+			t.Fatalf("Bad read buffer at index %v: got %v, want %v", i, got, want)
+		}
+	}
+	rx.Flush()
+}
+
+func TestEmptyRead(t *testing.T) {
+	// A freshly initialized pipe has nothing to pull.
+	buf := make([]byte, 100)
+
+	var tx Tx
+	tx.Init(buf)
+
+	var rx Rx
+	rx.Init(buf)
+
+	got := rx.Pull()
+	if got != nil {
+		t.Fatalf("Pull succeeded on empty pipe")
+	}
+}
+
+func TestTooLargeWrite(t *testing.T) {
+	// Payloads above the pipe's maximum payload size must be rejected,
+	// while a payload of exactly that size (buffer length minus two slot
+	// headers) must be accepted.
+	buf := make([]byte, 96)
+	var tx Tx
+	tx.Init(buf)
+
+	slot := tx.Push(96)
+	if slot != nil {
+		t.Fatalf("Write of 96 bytes succeeded on 96-byte pipe")
+	}
+
+	slot = tx.Push(88)
+	if slot != nil {
+		t.Fatalf("Write of 88 bytes succeeded on 96-byte pipe")
+	}
+
+	slot = tx.Push(80)
+	if slot == nil {
+		t.Fatalf("Write of 80 bytes failed on 96-byte pipe")
+	}
+}
+
+func TestFullWrite(t *testing.T) {
+	// Once the largest possible payload has been pushed, even a one-byte
+	// push must be refused.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	first := tx.Push(80)
+	if first == nil {
+		t.Fatalf("Write of 80 bytes failed on 96-byte pipe")
+	}
+
+	second := tx.Push(1)
+	if second != nil {
+		t.Fatalf("Write succeeded on full pipe")
+	}
+}
+
+func TestFullAndFlushedWrite(t *testing.T) {
+	// Same as the full-pipe case, except that the filling write has been
+	// flushed before the extra push is attempted.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	first := tx.Push(80)
+	if first == nil {
+		t.Fatalf("Write of 80 bytes failed on 96-byte pipe")
+	}
+
+	tx.Flush()
+
+	second := tx.Push(1)
+	if second != nil {
+		t.Fatalf("Write succeeded on full pipe")
+	}
+}
+
+func TestTxFlushTwice(t *testing.T) {
+	// Flushing the tx side twice in a row must leave it unchanged the
+	// second time.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	slot := tx.Push(50)
+	if slot == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	tx.Flush()
+
+	// Snapshot the tx state, flush again, and compare against the
+	// snapshot.
+	before := tx
+	tx.Flush()
+
+	if unchanged := reflect.DeepEqual(before, tx); !unchanged {
+		t.Fatalf("Flush mutated tx pipe: got %v, want %v", tx, before)
+	}
+}
+
+func TestRxFlushTwice(t *testing.T) {
+	// Flushing the rx side twice in a row must leave it unchanged the
+	// second time.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	pushed := tx.Push(50)
+	if pushed == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	tx.Flush()
+
+	var rx Rx
+	rx.Init(buf)
+	pulled := rx.Pull()
+	if pulled == nil {
+		t.Fatalf("Pull failed on non-empty pipe")
+	}
+	rx.Flush()
+
+	// Snapshot the rx state, flush again, and compare against the
+	// snapshot.
+	before := rx
+	rx.Flush()
+
+	if unchanged := reflect.DeepEqual(before, rx); !unchanged {
+		t.Fatalf("Flush mutated rx pipe: got %v, want %v", rx, before)
+	}
+}
+
+func TestWrapInMiddleOfTransaction(t *testing.T) {
+	// Pushes that force the buffer to wrap must stay invisible to the
+	// reader until the transaction is flushed.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	first := tx.Push(50)
+	if first == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	tx.Flush()
+
+	var rx Rx
+	rx.Init(buf)
+	drained := rx.Pull()
+	if drained == nil {
+		t.Fatalf("Pull failed on non-empty pipe")
+	}
+	rx.Flush()
+
+	// The ring is empty again, with the write position at offset 64
+	// (50 + sizeOfSlotHeader + padding-for-8-byte-alignment).
+	small := tx.Push(10)
+	if small == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+
+	large := tx.Push(50)
+	if large == nil {
+		t.Fatalf("Push failed on non-full pipe")
+	}
+
+	// Nothing has been flushed yet, so there is nothing to pull.
+	early := rx.Pull()
+	if early != nil {
+		t.Fatalf("Pull succeeded on non-flushed pipe")
+	}
+
+	tx.Flush()
+
+	// Both buffers must be readable after the flush.
+	for i := 0; i < 2; i++ {
+		if got := rx.Pull(); got == nil {
+			t.Fatalf("Pull failed on non-empty pipe")
+		}
+	}
+}
+
+func TestWriteAbort(t *testing.T) {
+	// Data that was pushed and then aborted must never become readable,
+	// neither before nor after the abort.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	pushed := tx.Push(10)
+	if pushed == nil {
+		t.Fatalf("Write failed on empty pipe")
+	}
+
+	var rx Rx
+	rx.Init(buf)
+	beforeAbort := rx.Pull()
+	if beforeAbort != nil {
+		t.Fatalf("Pull succeeded on empty pipe")
+	}
+
+	tx.Abort()
+	afterAbort := rx.Pull()
+	if afterAbort != nil {
+		t.Fatalf("Pull succeeded on empty pipe")
+	}
+}
+
+func TestWrappedWriteAbort(t *testing.T) {
+	// An abort must discard pushed data even when the pushes wrapped
+	// around the end of the buffer, and the same pushes must work again
+	// afterwards.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	first := tx.Push(50)
+	if first == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	tx.Flush()
+
+	var rx Rx
+	rx.Init(buf)
+	drained := rx.Pull()
+	if drained == nil {
+		t.Fatalf("Pull failed on non-empty pipe")
+	}
+	rx.Flush()
+
+	// The ring is empty again, with the write position at offset 64
+	// (50 + sizeOfSlotHeader + padding-for-8-byte-alignment).
+	small := tx.Push(10)
+	if small == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+
+	large := tx.Push(50)
+	if large == nil {
+		t.Fatalf("Push failed on non-full pipe")
+	}
+
+	// Nothing has been flushed yet, so there is nothing to pull.
+	early := rx.Pull()
+	if early != nil {
+		t.Fatalf("Pull succeeded on non-flushed pipe")
+	}
+
+	tx.Abort()
+
+	// The aborted pushes must not be readable.
+	aborted := rx.Pull()
+	if aborted != nil {
+		t.Fatalf("Pull succeeded on non-flushed pipe")
+	}
+
+	// Repeat the same two pushes, this time followed by a flush.
+	small = tx.Push(10)
+	if small == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+
+	large = tx.Push(50)
+	if large == nil {
+		t.Fatalf("Push failed on non-full pipe")
+	}
+
+	tx.Flush()
+
+	// Both buffers must be readable after the flush.
+	for i := 0; i < 2; i++ {
+		if got := rx.Pull(); got == nil {
+			t.Fatalf("Pull failed on non-empty pipe")
+		}
+	}
+}
+
+func TestEmptyReadOnNonFlushedWrite(t *testing.T) {
+	// Pushed data is hidden from the reader until it is flushed, and
+	// visible afterwards.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	pushed := tx.Push(10)
+	if pushed == nil {
+		t.Fatalf("Write failed on empty pipe")
+	}
+
+	var rx Rx
+	rx.Init(buf)
+	beforeFlush := rx.Pull()
+	if beforeFlush != nil {
+		t.Fatalf("Pull succeeded on empty pipe")
+	}
+
+	tx.Flush()
+	afterFlush := rx.Pull()
+	if afterFlush == nil {
+		t.Fatalf("Pull on failed on non-empty pipe")
+	}
+}
+
+func TestPullAfterPullingEntirePipe(t *testing.T) {
+	// When the pipe is full and every slot has been pulled but not yet
+	// flushed by the reader, a further Pull must fail.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	first := tx.Push(50)
+	if first == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	tx.Flush()
+
+	var rx Rx
+	rx.Init(buf)
+	drained := rx.Pull()
+	if drained == nil {
+		t.Fatalf("Pull failed on non-empty pipe")
+	}
+	rx.Flush()
+
+	// The ring is empty again, with the write position at offset 64
+	// (50 + sizeOfSlotHeader + padding-for-8-byte-alignment). The next
+	// three pushes fill the pipe.
+	a := tx.Push(10)
+	if a == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+
+	b := tx.Push(20)
+	if b == nil {
+		t.Fatalf("Push failed on non-full pipe")
+	}
+
+	c := tx.Push(24)
+	if c == nil {
+		t.Fatalf("Push failed on non-full pipe")
+	}
+
+	tx.Flush()
+
+	// All three buffers must be readable now.
+	for i := 0; i < 3; i++ {
+		if got := rx.Pull(); got == nil {
+			t.Fatalf("Pull failed on non-empty pipe")
+		}
+	}
+
+	// A fourth pull has nothing left to return.
+	extra := rx.Pull()
+	if extra != nil {
+		t.Fatalf("Pull succeeded on empty pipe")
+	}
+}
+
+func TestNoRoomToWrapOnPush(t *testing.T) {
+	// Push must fail when it cannot allocate the room needed for a wrap
+	// message.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	first := tx.Push(50)
+	if first == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	tx.Flush()
+
+	var rx Rx
+	rx.Init(buf)
+	drained := rx.Pull()
+	if drained == nil {
+		t.Fatalf("Pull failed on non-empty pipe")
+	}
+	rx.Flush()
+
+	// The ring is empty again, with the write position at offset 64
+	// (50 + sizeOfSlotHeader + padding-for-8-byte-alignment). A 20-byte
+	// payload needs a 32-byte slot (20 + sizeOfSlotHeader, padded), which
+	// would end exactly at offset 96, the end of the buffer, so the push
+	// wraps around and lands at the front instead.
+	wrapped := tx.Push(20)
+	if wrapped == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+
+	tx.Flush()
+
+	// The write position is now at offset 32. A 70-byte payload does not
+	// fit before the end of the buffer, so it needs a wrap slot, which
+	// cannot be created at this point.
+	tooBig := tx.Push(70)
+	if tooBig != nil {
+		t.Fatalf("Push succeeded on pipe with no room for wrap message")
+	}
+}
+
+func TestRxImplicitFlushOfWrapMessage(t *testing.T) {
+	// When the first slot pulled after a flush is a wrapping message, the
+	// reader must release it immediately so the writer can reuse it.
+	buf := make([]byte, 100)
+	var tx Tx
+	tx.Init(buf)
+
+	first := tx.Push(50)
+	if first == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+	tx.Flush()
+
+	// This push fails, but it causes a wrapping message to be written.
+	rejected := tx.Push(60)
+	if rejected != nil {
+		t.Fatalf("Push succeeded when there is no room in pipe")
+	}
+
+	var rx Rx
+	rx.Init(buf)
+
+	// Consume the first message.
+	got := rx.Pull()
+	if got == nil {
+		t.Fatalf("Pull failed on non-empty pipe")
+	}
+	rx.Flush()
+
+	// The wrapping message still occupies space, so this must fail too.
+	rejected = tx.Push(60)
+	if rejected != nil {
+		t.Fatalf("Push succeeded when there is no room in pipe")
+	}
+
+	// Pulling again consumes the wrapping message.
+	rx.Pull()
+
+	// With the wrapping message gone, the push must go through.
+	accepted := tx.Push(60)
+	if accepted == nil {
+		t.Fatalf("Push failed on empty pipe")
+	}
+}
+
+func TestConcurrentReaderWriter(t *testing.T) {
+	// Push a million buffers of random sizes and random contents. Check
+	// that buffers read match what was written.
+	//
+	// The writer and the reader each own one generator; both are seeded
+	// identically, so the reader can predict every size and byte the
+	// writer produces.
+	tr := rand.New(rand.NewSource(99))
+	rr := rand.New(rand.NewSource(99))
+
+	b := make([]byte, 100)
+	var tx Tx
+	tx.Init(b)
+
+	var rx Rx
+	rx.Init(b)
+
+	const count = 1000000
+
+	// failed is closed by the reader on the first mismatch. Failures are
+	// reported with t.Errorf because t.Fatalf must only be called from
+	// the goroutine running the test function. The writer watches the
+	// channel while it waits for room, so it does not spin forever once
+	// the reader has stopped draining the pipe.
+	failed := make(chan struct{})
+
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		runtime.Gosched()
+		for i := 0; i < count; i++ {
+			n := 1 + tr.Intn(80)
+			wb := tx.Push(uint64(n))
+			for wb == nil {
+				select {
+				case <-failed:
+					return
+				default:
+				}
+				wb = tx.Push(uint64(n))
+			}
+
+			for j := range wb {
+				wb[j] = byte(tr.Intn(256))
+			}
+
+			tx.Flush()
+		}
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		runtime.Gosched()
+		for i := 0; i < count; i++ {
+			n := 1 + rr.Intn(80)
+			rb := rx.Pull()
+			for rb == nil {
+				rb = rx.Pull()
+			}
+
+			if n != len(rb) {
+				t.Errorf("Bad %v-th buffer length: got %v, want %v", i, len(rb), n)
+				close(failed)
+				return
+			}
+
+			for j := range rb {
+				if v := byte(rr.Intn(256)); v != rb[j] {
+					t.Errorf("Bad %v-th read buffer at index %v: got %v, want %v", i, j, rb[j], v)
+					close(failed)
+					return
+				}
+			}
+
+			rx.Flush()
+		}
+	}()
+
+	wg.Wait()
+}
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go
new file mode 100644
index 000000000..62d17029e
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+	"sync/atomic"
+	"unsafe"
+)
+
+// write stores v as a uint64 in the 8 bytes of the buffer that start at the
+// offset part of idx, using a plain (non-atomic) store.
+func (p *pipe) write(idx uint64, v uint64) {
+	slot := p.buffer[idx&offsetMask:][:8]
+	*(*uint64)(unsafe.Pointer(&slot[0])) = v
+}
+
+// writeAtomic stores v as a uint64 in the 8 bytes of the buffer that start at
+// the offset part of idx, using an atomic store.
+func (p *pipe) writeAtomic(idx uint64, v uint64) {
+	slot := p.buffer[idx&offsetMask:][:8]
+	atomic.StoreUint64((*uint64)(unsafe.Pointer(&slot[0])), v)
+}
+
+// readAtomic atomically loads the uint64 held in the 8 bytes of the buffer
+// that start at the offset part of idx.
+func (p *pipe) readAtomic(idx uint64) uint64 {
+	slot := p.buffer[idx&offsetMask:][:8]
+	return atomic.LoadUint64((*uint64)(unsafe.Pointer(&slot[0])))
+}
diff --git a/pkg/tcpip/link/sharedmem/pipe/rx.go b/pkg/tcpip/link/sharedmem/pipe/rx.go
new file mode 100644
index 000000000..f22e533ac
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/rx.go
@@ -0,0 +1,93 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+// Rx is the receive side of the shared memory ring buffer.
+type Rx struct {
+	// p is the ring buffer this receiver reads from.
+	p pipe
+
+	// tail is the index of the first slot pulled since the last Flush();
+	// Flush() marks the region from tail up to head as free.
+	tail uint64
+	// head is the index of the next slot Pull() will inspect.
+	head uint64
+}
+
+// Init initializes the receive end of the pipe. In the initial state, the next
+// slot to be inspected is the very first one.
+func (r *Rx) Init(b []byte) {
+	r.p.init(b)
+	r.tail = 0xfffffffe * jump
+	r.head = r.tail
+}
+
+// Pull reads the next buffer from the pipe, returning nil if there isn't one
+// currently available.
+//
+// Wrapping slots carry no data; they are skipped transparently and the pull
+// is retried from the start of the buffer. A slot header whose size does not
+// describe a valid wrap also results in nil.
+//
+// The returned slice is available until Flush() is next called. After that, it
+// must not be touched.
+func (r *Rx) Pull() []byte {
+	if r.head == r.tail+jump {
+		// We've already pulled the whole pipe.
+		return nil
+	}
+
+	header := r.p.readAtomic(r.head)
+	if header&slotFree != 0 {
+		// The next slot is free, we can't pull it yet.
+		return nil
+	}
+
+	payloadSize := header & slotSizeMask
+	newHead := r.head + payloadToSlotSize(payloadSize)
+	// headWrap is the index one past the end of the buffer within head's
+	// current revolution.
+	headWrap := (r.head & revolutionMask) | uint64(len(r.p.buffer))
+
+	// Check if this is a wrapping slot. If that's the case, it carries no
+	// data, so we just skip it and try again from the first slot.
+	if int64(newHead-headWrap) >= 0 {
+		// Reject a slot that overshoots the end of the buffer by more
+		// than one revolution, or that does not end exactly at offset
+		// zero.
+		if int64(newHead-headWrap) > int64(jump) || newHead&offsetMask != 0 {
+			return nil
+		}
+
+		if r.tail == r.head {
+			// If this is the first pull since the last Flush()
+			// call, we flush the state so that the sender can use
+			// this space if it needs to.
+			r.p.writeAtomic(r.head, slotFree|slotToPayloadSize(newHead-r.head))
+			r.tail = newHead
+		}
+
+		r.head = newHead
+		return r.Pull()
+	}
+
+	// Grab the buffer before updating r.head.
+	b := r.p.data(r.head, payloadSize)
+	r.head = newHead
+	return b
+}
+
+// Flush tells the transmitter that all buffers pulled since the last Flush()
+// have been used, so the transmitter is free to use their slots for further
+// transmission. It does nothing when no buffer has been pulled since the last
+// Flush().
+func (r *Rx) Flush() {
+	if pulled := r.head - r.tail; pulled != 0 {
+		// Publish the whole pulled region as one free slot.
+		r.p.writeAtomic(r.tail, slotFree|slotToPayloadSize(pulled))
+		r.tail = r.head
+	}
+}
+
+// Bytes returns the byte slice on which the pipe operates. This is the
+// buffer as stored by Init, i.e. the slice passed in trimmed to a multiple of
+// the slot header size; it is not a copy.
+func (r *Rx) Bytes() []byte {
+	return r.p.buffer
+}
diff --git a/pkg/tcpip/link/sharedmem/pipe/tx.go b/pkg/tcpip/link/sharedmem/pipe/tx.go
new file mode 100644
index 000000000..9841eb231
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/tx.go
@@ -0,0 +1,161 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+// Tx is the transmit side of the shared memory ring buffer.
+type Tx struct {
+	// p is the ring buffer this transmitter writes to.
+	p pipe
+	// maxPayloadSize is the largest payload Push() accepts; Init sets it
+	// to the buffer length minus two slot headers.
+	maxPayloadSize uint64
+
+	// head is the index up to which space is available for writing;
+	// reclaim advances it over slots marked free.
+	head uint64
+	// tail is the index of the first slot pushed since the last Flush().
+	tail uint64
+	// next is the index at which the next Push() places its slot.
+	next uint64
+
+	// tailHeader is the header value for the slot at tail. It is held
+	// back here and only written to the buffer by Flush().
+	tailHeader uint64
+}
+
+// Init initializes the transmit end of the pipe. In the initial state, the next
+// slot to be written is the very first one, and the transmitter has the whole
+// ring buffer available to it.
+func (t *Tx) Init(b []byte) {
+	t.p.init(b)
+	// maxPayloadSize excludes the header of the payload, and the header
+	// of the wrapping message.
+	t.maxPayloadSize = uint64(len(t.p.buffer)) - 2*sizeOfSlotHeader
+	t.tail = 0xfffffffe * jump
+	t.next = t.tail
+	t.head = t.tail + jump
+	t.p.write(t.tail, slotFree)
+}
+
+// Capacity reports how many records of recordSize bytes can be written to the
+// pipe before it fills up: the buffer length minus one slot header, divided
+// by the slot size needed for one such record.
+func (t *Tx) Capacity(recordSize uint64) uint64 {
+	slot := payloadToSlotSize(recordSize)
+	usable := uint64(len(t.p.buffer)) - sizeOfSlotHeader
+	return usable / slot
+}
+
+// Push reserves "payloadSize" bytes for transmission in the pipe. The caller
+// populates the returned slice with the data to be transferred and eventually
+// calls Flush() to make the data visible to the reader, or Abort() to make the
+// pipe forget all Push() calls since the last Flush().
+//
+// Push returns nil when payloadSize exceeds the maximum payload size or when
+// not enough space can be reclaimed for the slot (and for a wrapping slot, if
+// one is needed).
+//
+// The returned slice is available until Flush() or Abort() is next called.
+// After that, it must not be touched.
+func (t *Tx) Push(payloadSize uint64) []byte {
+	// Fail request if we know we will never have enough room.
+	if payloadSize > t.maxPayloadSize {
+		return nil
+	}
+
+	totalLen := payloadToSlotSize(payloadSize)
+	newNext := t.next + totalLen
+	// nextWrap is the index one past the end of the buffer within next's
+	// current revolution.
+	nextWrap := (t.next & revolutionMask) | uint64(len(t.p.buffer))
+	if int64(newNext-nextWrap) >= 0 {
+		// The new buffer would overflow the pipe, so we push a wrapping
+		// slot, then try to add the actual slot to the front of the
+		// pipe.
+		newNext = (newNext & revolutionMask) + jump
+		wrappingPayloadSize := slotToPayloadSize(newNext - t.next)
+		if !t.reclaim(newNext) {
+			return nil
+		}
+
+		oldNext := t.next
+		t.next = newNext
+		if oldNext != t.tail {
+			t.p.write(oldNext, wrappingPayloadSize)
+		} else {
+			// The wrapping slot is the first slot since the last
+			// flush: its header goes through tailHeader and is
+			// flushed right away.
+			t.tailHeader = wrappingPayloadSize
+			t.Flush()
+		}
+
+		newNext += totalLen
+	}
+
+	// Check that we have enough room for the buffer.
+	if !t.reclaim(newNext) {
+		return nil
+	}
+
+	// The header of the slot at tail is held back in tailHeader until
+	// Flush(); headers of later slots are written directly.
+	if t.next != t.tail {
+		t.p.write(t.next, payloadSize)
+	} else {
+		t.tailHeader = payloadSize
+	}
+
+	// Grab the buffer before updating t.next.
+	b := t.p.data(t.next, payloadSize)
+	t.next = newNext
+
+	return b
+}
+
+// reclaim attempts to advance the head until at least newNext. If the head is
+// already at or beyond newNext, nothing happens and true is returned; otherwise
+// it tries to reclaim slots that have already been consumed by the receive end
+// of the pipe (they will be marked as free) and returns a boolean indicating
+// whether it was successful in reclaiming enough slots.
+//
+// The head keeps whatever progress was made even when false is returned.
+func (t *Tx) reclaim(newNext uint64) bool {
+	for int64(newNext-t.head) > 0 {
+		// Can't reclaim if slot is not free.
+		header := t.p.readAtomic(t.head)
+		if header&slotFree == 0 {
+			return false
+		}
+
+		// The free slot's header records its size; the head moves past
+		// the whole slot.
+		payloadSize := header & slotSizeMask
+		newHead := t.head + payloadToSlotSize(payloadSize)
+
+		// Check newHead is within bounds and valid.
+		if int64(newHead-t.tail) > int64(jump) || newHead&offsetMask >= uint64(len(t.p.buffer)) {
+			return false
+		}
+
+		t.head = newHead
+	}
+
+	return true
+}
+
+// Abort causes all Push() calls since the last Flush() to be forgotten and
+// therefore they will not be made visible to the receiver. It does so by
+// resetting the next write position back to the tail.
+func (t *Tx) Abort() {
+	t.next = t.tail
+}
+
+// Flush causes all buffers pushed since the last Flush() [or Abort(), whichever
+// is the most recent] to be made visible to the receiver.
+func (t *Tx) Flush() {
+	if t.next == t.tail {
+		// Nothing to do if there are no pushed buffers.
+		return
+	}
+
+	if t.next != t.head {
+		// The receiver will spin in t.next, so we must make sure that
+		// the slotFree bit is set.
+		t.p.write(t.next, slotFree)
+	}
+
+	// Write the held-back header of the first pushed slot last, with an
+	// atomic store, then start the next batch at t.next.
+	t.p.writeAtomic(t.tail, t.tailHeader)
+	t.tail = t.next
+}
+
+// Bytes returns the byte slice on which the pipe operates. This is the
+// buffer as stored by Init, i.e. the slice passed in trimmed to a multiple of
+// the slot header size; it is not a copy.
+func (t *Tx) Bytes() []byte {
+	return t.p.buffer
+}
diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD
new file mode 100644
index 000000000..3ba06af73
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "queue",
+    srcs = [
+        "rx.go",
+        "tx.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/tcpip/link/sharedmem/pipe",
+    ],
+)
+
+go_test(
+    name = "queue_test",
+    srcs = [
+        "queue_test.go",
+    ],
+    library = ":queue",
+    deps = [
+        "//pkg/tcpip/link/sharedmem/pipe",
+    ],
+)
diff --git a/pkg/tcpip/link/sharedmem/queue/queue_test.go b/pkg/tcpip/link/sharedmem/queue/queue_test.go
new file mode 100644
index 000000000..9a0aad5d7
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/queue_test.go
@@ -0,0 +1,517 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package queue
+
+import (
+	"encoding/binary"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+)
+
+func TestBasicTxQueue(t *testing.T) {
+	// A basic transmit must reach the pipe and its completion be reported.
+	// The test acts as the peer: rxp reads q's tx pipe, txp feeds q's rx pipe.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Tx
+	q.Init(pb1, pb2)
+
+	// Enqueue one packet made of two chained buffers.
+	b := []TxBuffer{
+		{nil, 100, 60},
+		{nil, 200, 40},
+	}
+
+	b[0].Next = &b[1]
+
+	const usedID = 1002
+	const usedTotalSize = 100
+	if !q.Enqueue(usedID, usedTotalSize, 2, &b[0]) {
+		t.Fatalf("Enqueue failed on empty queue")
+	}
+
+	// Check the contents of the pipe.
+	d := rxp.Pull()
+	if d == nil {
+		t.Fatalf("Tx pipe is empty after Enqueue")
+	}
+
+	want := []byte{
+		234, 3, 0, 0, 0, 0, 0, 0, // id (1002, little-endian)
+		100, 0, 0, 0, // total size
+		0, 0, 0, 0, // reserved
+		100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+		60, 0, 0, 0, // size 1
+		200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+		40, 0, 0, 0, // size 2
+	}
+
+	if !reflect.DeepEqual(want, d) {
+		t.Fatalf("Bad posted packet: got %v, want %v", d, want)
+	}
+
+	rxp.Flush()
+
+	// Check that there are no completions yet.
+	if _, ok := q.CompletedPacket(); ok {
+		t.Fatalf("Packet reported as completed too soon")
+	}
+
+	// Post a completion: an 8-byte entry holding the packet id.
+	d = txp.Push(8)
+	if d == nil {
+		t.Fatalf("Unable to push to rx pipe")
+	}
+	binary.LittleEndian.PutUint64(d, usedID)
+	txp.Flush()
+
+	// Check that completion is properly reported.
+	id, ok := q.CompletedPacket()
+	if !ok {
+		t.Fatalf("Completion not reported")
+	}
+
+	if id != usedID {
+		t.Fatalf("Bad completion id: got %v, want %v", id, usedID)
+	}
+}
+
+func TestBasicRxQueue(t *testing.T) {
+	// A basic receive: buffers are posted, then handed back in a completion.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Rx
+	q.Init(pb1, pb2, nil)
+
+	// Post two buffers.
+	b := []RxBuffer{
+		{100, 60, 1077, 0},
+		{200, 40, 2123, 0},
+	}
+
+	if !q.PostBuffers(b) {
+		t.Fatalf("PostBuffers failed on empty queue")
+	}
+
+	// Check the contents of the pipe: one 32-byte entry per posted buffer.
+	want := [][]byte{
+		{
+			100, 0, 0, 0, 0, 0, 0, 0, // Offset1
+			60, 0, 0, 0, // Size1
+			0, 0, 0, 0, // Remaining in group 1
+			0, 0, 0, 0, 0, 0, 0, 0, // User data 1
+			53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+		},
+		{
+			200, 0, 0, 0, 0, 0, 0, 0, // Offset2
+			40, 0, 0, 0, // Size2
+			0, 0, 0, 0, // Remaining in group 2
+			0, 0, 0, 0, 0, 0, 0, 0, // User data 2
+			75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+		},
+	}
+
+	for i := range b {
+		d := rxp.Pull()
+		if d == nil {
+			t.Fatalf("Tx pipe is empty after PostBuffers")
+		}
+
+		if !reflect.DeepEqual(want[i], d) {
+			t.Fatalf("Bad posted packet: got %v, want %v", d, want[i])
+		}
+
+		rxp.Flush()
+	}
+
+	// Check that there are no completions.
+	if _, n := q.Dequeue(nil); n != 0 {
+		t.Fatalf("Packet reported as received too soon")
+	}
+
+	// Post a completion: a packet header followed by both buffers.
+	d := txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+	if d == nil {
+		t.Fatalf("Unable to push to rx pipe")
+	}
+
+	copy(d, []byte{
+		100, 0, 0, 0, // packet size
+		0, 0, 0, 0, // reserved
+
+		100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+		60, 0, 0, 0, // size 1
+		0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+		53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+		200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+		40, 0, 0, 0, // size 2
+		0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+		75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+	})
+
+	txp.Flush()
+
+	// Check that completion is properly reported.
+	bufs, n := q.Dequeue(nil)
+	if n != 100 {
+		t.Fatalf("Bad packet size: got %v, want %v", n, 100)
+	}
+
+	if !reflect.DeepEqual(bufs, b) {
+		t.Fatalf("Bad returned buffers: got %v, want %v", bufs, b)
+	}
+}
+
+func TestBadTxCompletion(t *testing.T) {
+	// Tx completions whose size is not exactly 8 bytes must be ignored.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Tx
+	q.Init(pb1, pb2)
+
+	// Post a completion that is too short (7 bytes); it must be ignored.
+	if d := txp.Push(7); d == nil {
+		t.Fatalf("Unable to push to rx pipe")
+	}
+	txp.Flush()
+
+	if _, ok := q.CompletedPacket(); ok {
+		t.Fatalf("Bad completion not ignored")
+	}
+
+	// Post a completion that is too long (10 bytes); it must be ignored.
+	if d := txp.Push(10); d == nil {
+		t.Fatalf("Unable to push to rx pipe")
+	}
+	txp.Flush()
+
+	if _, ok := q.CompletedPacket(); ok {
+		t.Fatalf("Bad completion not ignored")
+	}
+}
+
+func TestBadRxCompletion(t *testing.T) {
+	// Malformed rx completions must be skipped by Dequeue.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Rx
+	q.Init(pb1, pb2, nil)
+
+	// Post a completion shorter than the packet header; it must be ignored.
+	if d := txp.Push(7); d == nil {
+		t.Fatalf("Unable to push to rx pipe")
+	}
+	txp.Flush()
+
+	if b, _ := q.Dequeue(nil); b != nil {
+		t.Fatalf("Bad completion not ignored")
+	}
+
+	// Post a completion whose buffer sizes (10 + 10) add up to less than
+	// the declared packet size (100).
+	d := txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+	if d == nil {
+		t.Fatalf("Unable to push to rx pipe")
+	}
+
+	copy(d, []byte{
+		100, 0, 0, 0, // packet size
+		0, 0, 0, 0, // reserved
+
+		100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+		10, 0, 0, 0, // size 1
+		0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+		53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+		200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+		10, 0, 0, 0, // size 2
+		0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+		75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+	})
+
+	txp.Flush()
+	if b, _ := q.Dequeue(nil); b != nil {
+		t.Fatalf("Bad completion not ignored")
+	}
+
+	// Post a completion whose buffer sizes overflow 32 bits: 0xffffffff +
+	// 101 wraps around to exactly the declared packet size (100).
+	d = txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+	if d == nil {
+		t.Fatalf("Unable to push to rx pipe")
+	}
+
+	copy(d, []byte{
+		100, 0, 0, 0, // packet size
+		0, 0, 0, 0, // reserved
+
+		100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+		255, 255, 255, 255, // size 1
+		0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+		53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+		200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+		101, 0, 0, 0, // size 2
+		0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+		75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+	})
+
+	txp.Flush()
+	if b, _ := q.Dequeue(nil); b != nil {
+		t.Fatalf("Bad completion not ignored")
+	}
+}
+
+func TestFillTxPipe(t *testing.T) {
+	// Enqueue must fail gracefully (return false) once the tx pipe has no
+	// room left for another packet.
+	pb1 := make([]byte, 104)
+	pb2 := make([]byte, 104)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Tx
+	q.Init(pb1, pb2)
+
+	// Transmit twice, which should fill the tx pipe.
+	b := []TxBuffer{
+		{nil, 100, 60},
+		{nil, 200, 40},
+	}
+
+	b[0].Next = &b[1]
+
+	const usedID = 1002
+	const usedTotalSize = 100
+	for i := uint64(0); i < 2; i++ {
+		if !q.Enqueue(usedID+i, usedTotalSize, 2, &b[0]) {
+			t.Fatalf("Failed to transmit buffer")
+		}
+	}
+
+	// A third packet must be refused now that the tx pipe is full.
+	if q.Enqueue(usedID+2, usedTotalSize, 2, &b[0]) {
+		t.Fatalf("Enqueue succeeded when tx pipe is full")
+	}
+}
+
+func TestFillRxPipe(t *testing.T) {
+	// PostBuffers must fail gracefully (return false) once the pipe used
+	// for posting buffers is full.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Rx
+	q.Init(pb1, pb2, nil)
+
+	// Post a buffer twice, it should fill the tx pipe.
+	b := []RxBuffer{
+		{100, 60, 1077, 0},
+	}
+
+	for i := 0; i < 2; i++ {
+		if !q.PostBuffers(b) {
+			t.Fatalf("PostBuffers failed on non-full queue")
+		}
+	}
+
+	// A third post must be refused now that the tx pipe is full.
+	if q.PostBuffers(b) {
+		t.Fatalf("PostBuffers succeeded on full queue")
+	}
+}
+
+func TestLotsOfTransmissions(t *testing.T) {
+	// Many enqueue/complete rounds must keep working on small pipes.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Tx
+	q.Init(pb1, pb2)
+
+	// Prepare packet with two buffers.
+	b := []TxBuffer{
+		{nil, 100, 60},
+		{nil, 200, 40},
+	}
+
+	b[0].Next = &b[1]
+
+	const usedID = 1002
+	const usedTotalSize = 100
+
+	// Post 100000 packets, consuming and completing each one in turn.
+	for i := 100000; i > 0; i-- {
+		if !q.Enqueue(usedID, usedTotalSize, 2, &b[0]) {
+			t.Fatalf("Enqueue failed on non-full queue")
+		}
+
+		if d := rxp.Pull(); d == nil {
+			t.Fatalf("Tx pipe is empty after Enqueue")
+		}
+		rxp.Flush()
+
+		d := txp.Push(8)
+		if d == nil {
+			t.Fatalf("Unable to write to rx pipe")
+		}
+		binary.LittleEndian.PutUint64(d, usedID)
+		txp.Flush()
+		if _, ok := q.CompletedPacket(); !ok {
+			t.Fatalf("Completion not returned")
+		}
+	}
+}
+
+func TestLotsOfReceptions(t *testing.T) {
+	// Many post/receive rounds must keep working on small pipes.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var rxp pipe.Rx
+	rxp.Init(pb1)
+
+	var txp pipe.Tx
+	txp.Init(pb2)
+
+	var q Rx
+	q.Init(pb1, pb2, nil)
+
+	// Prepare for posting two buffers.
+	b := []RxBuffer{
+		{100, 60, 1077, 0},
+		{200, 40, 2123, 0},
+	}
+
+	// Run 100000 rounds, each posting both buffers and completing them.
+	for i := 100000; i > 0; i-- {
+		if !q.PostBuffers(b) {
+			t.Fatalf("PostBuffers failed on non-full queue")
+		}
+
+		if d := rxp.Pull(); d == nil {
+			t.Fatalf("Tx pipe is empty after PostBuffers")
+		}
+		rxp.Flush()
+
+		if d := rxp.Pull(); d == nil {
+			t.Fatalf("Tx pipe is empty after PostBuffers")
+		}
+		rxp.Flush()
+
+		d := txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+		if d == nil {
+			t.Fatalf("Unable to push to rx pipe")
+		}
+
+		copy(d, []byte{
+			100, 0, 0, 0, // packet size
+			0, 0, 0, 0, // reserved
+
+			100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+			60, 0, 0, 0, // size 1
+			0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+			53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+			200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+			40, 0, 0, 0, // size 2
+			0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+			75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+		})
+
+		txp.Flush()
+
+		if _, n := q.Dequeue(nil); n == 0 {
+			t.Fatalf("Dequeue failed when there is a completion")
+		}
+	}
+}
+
+func TestRxEnableNotification(t *testing.T) {
+	// Check that enabling notifications stores eventFDEnabled in the state.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var state uint32
+	var q Rx
+	q.Init(pb1, pb2, &state)
+
+	q.EnableNotification()
+	if state != eventFDEnabled {
+		t.Fatalf("Bad value in shared state: got %v, want %v", state, eventFDEnabled)
+	}
+}
+
+func TestRxDisableNotification(t *testing.T) {
+	// Check that disabling notifications stores eventFDDisabled in the state.
+	pb1 := make([]byte, 100)
+	pb2 := make([]byte, 100)
+
+	var state uint32
+	var q Rx
+	q.Init(pb1, pb2, &state)
+
+	q.DisableNotification()
+	if state != eventFDDisabled {
+		t.Fatalf("Bad value in shared state: got %v, want %v", state, eventFDDisabled)
+	}
+}
diff --git a/pkg/tcpip/link/sharedmem/queue/rx.go b/pkg/tcpip/link/sharedmem/queue/rx.go
new file mode 100644
index 000000000..696e6c9e5
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/rx.go
@@ -0,0 +1,221 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package queue provides the implementation of transmit and receive queues
+// based on shared memory ring buffers.
+package queue
+
+import (
+	"encoding/binary"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+)
+
+const (
+	// Byte offsets of the fields within a posted buffer descriptor.
+	postedOffset           = 0
+	postedSize             = 8
+	postedRemainingInGroup = 12
+	postedUserData         = 16
+	postedID               = 24
+
+	sizeOfPostedBuffer = 32
+
+	// Byte offsets of the fields within a received packet header.
+	consumedPacketSize     = 0
+	consumedPacketReserved = 4
+
+	sizeOfConsumedPacketHeader = 8
+
+	// Byte offsets of the fields within a consumed buffer descriptor.
+	consumedOffset   = 0
+	consumedSize     = 8
+	consumedUserData = 12
+	consumedID       = 20
+
+	sizeOfConsumedBuffer = 28
+
+	// Values that the shared eventfd state word may hold.
+	eventFDUninitialized = 0
+	eventFDDisabled      = 1
+	eventFDEnabled       = 2
+)
+
+// RxBuffer describes a receive buffer that is posted to, or returned by, Rx.
+type RxBuffer struct {
+	Offset   uint64
+	Size     uint32
+	ID       uint64
+	UserData uint64
+}
+
+// Rx is a receive queue built from one tx pipe and one rx pipe: buffers are
+// "posted" on the tx pipe, and packets whose contents were written to
+// previously posted buffers are received on the rx pipe.
+//
+// This struct is thread-compatible.
+type Rx struct {
+	tx                 pipe.Tx
+	rx                 pipe.Rx
+	sharedEventFDState *uint32
+}
+
+// Init sets up the receive queue over the given tx and rx pipe buffers and
+// records the shared state pointer used to enable/disable eventfd notifications.
+func (r *Rx) Init(tx, rx []byte, sharedEventFDState *uint32) {
+	r.sharedEventFDState = sharedEventFDState
+	r.tx.Init(tx)
+	r.rx.Init(rx)
+}
+
+// EnableNotification atomically stores eventFDEnabled in the shared state so the
+// peer notifies the eventfd on new packets. The state pointer must be non-nil.
+func (r *Rx) EnableNotification() {
+	atomic.StoreUint32(r.sharedEventFDState, eventFDEnabled)
+}
+
+// DisableNotification atomically stores eventFDDisabled in the shared state so
+// the peer will not notify the eventfd. The state pointer must be non-nil.
+func (r *Rx) DisableNotification() {
+	atomic.StoreUint32(r.sharedEventFDState, eventFDDisabled)
+}
+
+// PostedBuffersLimit returns the tx pipe's capacity for entries of
+// sizeOfPostedBuffer bytes, i.e. how many buffers can be posted before it fills.
+func (r *Rx) PostedBuffersLimit() uint64 {
+	return r.tx.Capacity(sizeOfPostedBuffer)
+}
+
+// PostBuffers hands the given buffers to the peer for it to fill with received
+// data. Either all of them are pushed and flushed (true), or, if the tx pipe
+// runs out of room, the partial push is aborted and nothing is posted (false).
+func (r *Rx) PostBuffers(buffers []RxBuffer) bool {
+	for idx := range buffers {
+		slot := r.tx.Push(sizeOfPostedBuffer)
+		if slot == nil {
+			r.tx.Abort()
+			return false
+		}
+
+		src := &buffers[idx]
+		binary.LittleEndian.PutUint64(slot[postedOffset:], src.Offset)
+		binary.LittleEndian.PutUint32(slot[postedSize:], src.Size)
+		binary.LittleEndian.PutUint32(slot[postedRemainingInGroup:], 0)
+		binary.LittleEndian.PutUint64(slot[postedUserData:], src.UserData)
+		binary.LittleEndian.PutUint64(slot[postedID:], src.ID)
+	}
+
+	r.tx.Flush()
+
+	return true
+}
+
+// Dequeue pulls the next valid completion from the rx pipe and appends its
+// buffers to "bufs" (append() semantics: reallocation only when capacity runs
+// out), returning the slice and the packet's data size. Malformed entries are
+// logged and skipped; with nothing left to pull it returns bufs and 0. The
+// returned buffers carry Offset, Size and ID only; UserData is not decoded.
+func (r *Rx) Dequeue(bufs []RxBuffer) ([]RxBuffer, uint32) {
+	for {
+		outBufs := bufs
+
+		// Pull the next descriptor from the rx pipe.
+		b := r.rx.Pull()
+		if b == nil {
+			return bufs, 0
+		}
+
+		if len(b) < sizeOfConsumedPacketHeader {
+			log.Warningf("Ignoring packet header: size (%v) is less than header size (%v)", len(b), sizeOfConsumedPacketHeader)
+			r.rx.Flush()
+			continue
+		}
+
+		totalDataSize := binary.LittleEndian.Uint32(b[consumedPacketSize:])
+
+		// Derive the number of whole buffer descriptors from the entry
+		// length and copy them over to the output.
+		count := (len(b) - sizeOfConsumedPacketHeader) / sizeOfConsumedBuffer
+		offset := sizeOfConsumedPacketHeader
+		buffersSize := uint32(0)
+		for i := count; i > 0; i-- {
+			s := binary.LittleEndian.Uint32(b[offset+consumedSize:])
+			buffersSize += s
+			if buffersSize < s {
+				// The running sum wrapped around 32 bits. Force
+				// buffersSize < totalDataSize so that the check
+				// after the loop rejects this packet.
+				totalDataSize = 1
+				buffersSize = 0
+				break
+			}
+
+			outBufs = append(outBufs, RxBuffer{
+				Offset: binary.LittleEndian.Uint64(b[offset+consumedOffset:]),
+				Size:   s,
+				ID:     binary.LittleEndian.Uint64(b[offset+consumedID:]),
+			})
+
+			offset += sizeOfConsumedBuffer
+		}
+
+		r.rx.Flush()
+
+		if buffersSize < totalDataSize {
+			// The buffers cannot hold the declared packet size: ignore it.
+			log.Warningf("Ignoring packet: actual data size (%v) less than expected size (%v)", buffersSize, totalDataSize)
+			continue
+		}
+
+		return outBufs, totalDataSize
+	}
+}
+
+// Bytes returns the byte slices backing the queue's tx and rx pipes.
+func (r *Rx) Bytes() (tx, rx []byte) {
+	return r.tx.Bytes(), r.rx.Bytes()
+}
+
+// DecodeRxBufferHeader parses a posted rx buffer descriptor from the start of b.
+func DecodeRxBufferHeader(b []byte) RxBuffer {
+	var rxb RxBuffer
+	rxb.Offset = binary.LittleEndian.Uint64(b[postedOffset:])
+	rxb.Size = binary.LittleEndian.Uint32(b[postedSize:])
+	rxb.ID = binary.LittleEndian.Uint64(b[postedID:])
+	rxb.UserData = binary.LittleEndian.Uint64(b[postedUserData:])
+	return rxb
+}
+
+// RxCompletionSize returns the number of bytes needed to encode an rx
+// completion containing "count" buffers: the packet header plus count entries.
+func RxCompletionSize(count int) uint64 {
+	return sizeOfConsumedPacketHeader + uint64(count)*sizeOfConsumedBuffer
+}
+
+// EncodeRxCompletion writes the rx completion header (size, reserved) into b.
+func EncodeRxCompletion(b []byte, size, reserved uint32) {
+	binary.LittleEndian.PutUint32(b[consumedPacketSize:], size)
+	binary.LittleEndian.PutUint32(b[consumedPacketReserved:], reserved)
+}
+
+// EncodeRxCompletionBuffer writes rxb as the i-th buffer entry of the rx completion in b.
+func EncodeRxCompletionBuffer(b []byte, i int, rxb RxBuffer) {
+	entry := b[RxCompletionSize(i):]
+	binary.LittleEndian.PutUint64(entry[consumedOffset:], rxb.Offset)
+	binary.LittleEndian.PutUint32(entry[consumedSize:], rxb.Size)
+	binary.LittleEndian.PutUint64(entry[consumedUserData:], rxb.UserData)
+	binary.LittleEndian.PutUint64(entry[consumedID:], rxb.ID)
+}
diff --git a/pkg/tcpip/link/sharedmem/queue/tx.go b/pkg/tcpip/link/sharedmem/queue/tx.go
new file mode 100644
index 000000000..beffe807b
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/tx.go
@@ -0,0 +1,151 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package queue
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+)
+
+const (
+	// Byte offsets of the fields within a packet header.
+	packetID       = 0
+	packetSize     = 8
+	packetReserved = 12
+
+	sizeOfPacketHeader = 16
+
+	// Byte offsets of the fields within a buffer descriptor.
+	bufferOffset = 0
+	bufferSize   = 8
+
+	sizeOfBufferDescriptor = 12
+)
+
+// TxBuffer describes one transmit buffer; Next chains the buffers of a packet.
+type TxBuffer struct {
+	Next   *TxBuffer
+	Offset uint64
+	Size   uint32
+}
+
+// Tx is a transmit queue built from one tx pipe and one rx pipe: packets to
+// be transmitted are requested on the tx pipe, and the ids of completed
+// transmissions are received on the rx pipe.
+//
+// This struct is thread-compatible.
+type Tx struct {
+	tx pipe.Tx
+	rx pipe.Rx
+}
+
+// Init sets up the transmit queue over the given tx and rx pipe buffers.
+func (t *Tx) Init(tx, rx []byte) {
+	t.tx.Init(tx)
+	t.rx.Init(rx)
+}
+
+// Enqueue posts one packet made of the first bufferCount buffers of the linked
+// list at buffer; false means no room. Queued buffers must not be modified.
+func (t *Tx) Enqueue(id uint64, totalDataLen, bufferCount uint32, buffer *TxBuffer) bool {
+	// Reserve room for the header plus one descriptor per buffer.
+	totalLen := sizeOfPacketHeader + uint64(bufferCount)*sizeOfBufferDescriptor
+
+	pkt := t.tx.Push(totalLen)
+	if pkt == nil {
+		return false
+	}
+
+	// Fill in the packet header, then the buffer descriptors after it.
+	binary.LittleEndian.PutUint64(pkt[packetID:], id)
+	binary.LittleEndian.PutUint32(pkt[packetSize:], totalDataLen)
+	binary.LittleEndian.PutUint32(pkt[packetReserved:], 0)
+
+	desc := pkt[sizeOfPacketHeader:]
+	for left := bufferCount; left > 0; left-- {
+		binary.LittleEndian.PutUint64(desc[bufferOffset:], buffer.Offset)
+		binary.LittleEndian.PutUint32(desc[bufferSize:], buffer.Size)
+		desc = desc[sizeOfBufferDescriptor:]
+		buffer = buffer.Next
+	}
+
+	t.tx.Flush()
+
+	return true
+}
+
+// CompletedPacket returns the id of the next completed transmission pulled from
+// the rx pipe, or ok == false if there is none. Entries that are not exactly 8
+// bytes are logged and skipped. The id is expected to echo one given to Enqueue().
+func (t *Tx) CompletedPacket() (id uint64, ok bool) {
+	for {
+		b := t.rx.Pull()
+		if b == nil {
+			return 0, false
+		}
+
+		if len(b) != 8 {
+			t.rx.Flush()
+			log.Warningf("Ignoring completed packet: size (%v) is different from expected (%v)", len(b), 8)
+			continue
+		}
+
+		v := binary.LittleEndian.Uint64(b)
+
+		t.rx.Flush()
+
+		return v, true
+	}
+}
+
+// Bytes returns the byte slices backing the queue's tx and rx pipes.
+func (t *Tx) Bytes() (tx, rx []byte) {
+	return t.tx.Bytes(), t.rx.Bytes()
+}
+
+// TxPacketInfo holds the decoded header of a packet sent on a tx queue.
+type TxPacketInfo struct {
+	ID          uint64
+	Size        uint32
+	Reserved    uint32
+	BufferCount int
+}
+
+// DecodeTxPacketHeader parses the packet header at the start of b; BufferCount comes from len(b).
+func DecodeTxPacketHeader(b []byte) TxPacketInfo {
+	var info TxPacketInfo
+	info.ID = binary.LittleEndian.Uint64(b[packetID:])
+	info.Size = binary.LittleEndian.Uint32(b[packetSize:])
+	info.Reserved = binary.LittleEndian.Uint32(b[packetReserved:])
+	info.BufferCount = (len(b) - sizeOfPacketHeader) / sizeOfBufferDescriptor
+	return info
+}
+
+// DecodeTxBufferHeader parses the i-th buffer descriptor that follows the
+// packet header in b. The Next field of the result is left nil.
+func DecodeTxBufferHeader(b []byte, i int) TxBuffer {
+	start := sizeOfPacketHeader + i*sizeOfBufferDescriptor
+	desc := b[start:]
+	off := binary.LittleEndian.Uint64(desc[bufferOffset:])
+	size := binary.LittleEndian.Uint32(desc[bufferSize:])
+	return TxBuffer{Offset: off, Size: size}
+}
+
+// EncodeTxCompletion writes id, little-endian, into the first 8 bytes of b.
+func EncodeTxCompletion(b []byte, id uint64) {
+	binary.LittleEndian.PutUint64(b, id)
+}
diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go
new file mode 100644
index 000000000..eec11e4cb
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/rx.go
@@ -0,0 +1,159 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package sharedmem
+
+import (
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+)
+
+// rx holds the queue, mapped buffers and eventfd that make up an rx queue.
+type rx struct {
+	data       []byte
+	sharedData []byte
+	q          queue.Rx
+	eventFD    int
+}
+
+// init initializes all state needed by the rx queue based on the information
+// provided in c; mtu is not used. On failure, everything acquired so far is
+// released again before the error is returned.
+//
+// The caller keeps ownership of every descriptor in c; c.EventFD is duplicated.
+func (r *rx) init(mtu uint32, c *QueueConfig) error {
+	// Map in all buffers.
+	txPipe, err := getBuffer(c.TxPipeFD)
+	if err != nil {
+		return err
+	}
+
+	rxPipe, err := getBuffer(c.RxPipeFD)
+	if err != nil {
+		syscall.Munmap(txPipe)
+		return err
+	}
+
+	data, err := getBuffer(c.DataFD)
+	if err != nil {
+		syscall.Munmap(txPipe)
+		syscall.Munmap(rxPipe)
+		return err
+	}
+
+	sharedData, err := getBuffer(c.SharedDataFD)
+	if err != nil {
+		syscall.Munmap(txPipe)
+		syscall.Munmap(rxPipe)
+		syscall.Munmap(data)
+		return err
+	}
+
+	// Duplicate the eventFD so that the caller can close its copy while we
+	// keep using ours.
+	efd, err := syscall.Dup(c.EventFD)
+	if err != nil {
+		syscall.Munmap(txPipe)
+		syscall.Munmap(rxPipe)
+		syscall.Munmap(data)
+		syscall.Munmap(sharedData)
+		return err
+	}
+
+	// Set the duplicated eventfd as non-blocking.
+	if err := syscall.SetNonblock(efd, true); err != nil {
+		syscall.Munmap(txPipe)
+		syscall.Munmap(rxPipe)
+		syscall.Munmap(data)
+		syscall.Munmap(sharedData)
+		syscall.Close(efd)
+		return err
+	}
+
+	// Nothing can fail from here on: record the acquired resources.
+	r.q.Init(txPipe, rxPipe, sharedDataPointer(sharedData))
+	r.data = data
+	r.eventFD = efd
+	r.sharedData = sharedData
+
+	return nil
+}
+
+// cleanup unmaps every buffer and closes the eventfd held by r. It must only
+// be called if init() has previously succeeded.
+func (r *rx) cleanup() {
+	txPipe, rxPipe := r.q.Bytes()
+	for _, mapping := range [][]byte{txPipe, rxPipe, r.data, r.sharedData} {
+		syscall.Munmap(mapping)
+	}
+
+	// The eventfd is the duplicate made by init(), so it is ours to close.
+	syscall.Close(r.eventFD)
+}
+
+// postAndReceive posts the provided buffers (if any), and then tries to read
+// from the receive queue, blocking on the eventfd until a packet is available.
+//
+// Capacity permitting, it reuses the posted buffer slice to store the buffers
+// that were read as well.
+//
+// It returns nil and 0 if *stopRequested becomes non-zero while it is waiting.
+func (r *rx) postAndReceive(b []queue.RxBuffer, stopRequested *uint32) ([]queue.RxBuffer, uint32) {
+	// Post the buffers first. If we cannot post, sleep until we can. We
+	// never post more than will fit concurrently, so it's safe to wait
+	// until enough room is available.
+	if len(b) != 0 && !r.q.PostBuffers(b) {
+		r.q.EnableNotification()
+		for !r.q.PostBuffers(b) {
+			var tmp [8]byte
+			rawfile.BlockingRead(r.eventFD, tmp[:])
+			if atomic.LoadUint32(stopRequested) != 0 {
+				r.q.DisableNotification()
+				return nil, 0
+			}
+		}
+		r.q.DisableNotification()
+	}
+
+	// Read the next set of descriptors, reusing b's backing array.
+	b, n := r.q.Dequeue(b[:0])
+	if len(b) != 0 {
+		return b, n
+	}
+
+	// Data isn't immediately available. Enable eventfd notifications.
+	r.q.EnableNotification()
+	for {
+		b, n = r.q.Dequeue(b)
+		if len(b) != 0 {
+			break
+		}
+
+		// Wait for notification; the result of the read is not checked.
+		var tmp [8]byte
+		rawfile.BlockingRead(r.eventFD, tmp[:])
+		if atomic.LoadUint32(stopRequested) != 0 {
+			r.q.DisableNotification()
+			return nil, 0
+		}
+	}
+	r.q.DisableNotification()
+
+	return b, n
+}
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
new file mode 100644
index 000000000..0374a2441
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -0,0 +1,289 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package sharedmem provides the implementation of data-link layer endpoints
+// backed by shared memory.
+//
+// Shared memory endpoints can be used in the networking stack by calling New()
+// to create a new endpoint, and then passing it as an argument to
+// Stack.CreateNIC().
+package sharedmem
+
+import (
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// QueueConfig holds all the file descriptors needed to describe a tx or rx
+// queue over shared memory. It is used when creating new shared memory
+// endpoints to describe tx and rx queues.
+type QueueConfig struct {
+	// DataFD is a file descriptor for the file that contains the data to
+	// be transmitted via this queue. Descriptors contain offsets within
+	// this file.
+	DataFD int
+
+	// EventFD is a file descriptor for the event that is signaled when
+	// data becomes available in this queue.
+	EventFD int
+
+	// TxPipeFD is a file descriptor for the tx pipe associated with the
+	// queue.
+	TxPipeFD int
+
+	// RxPipeFD is a file descriptor for the rx pipe associated with the
+	// queue.
+	RxPipeFD int
+
+	// SharedDataFD is a file descriptor for the file that contains shared
+	// state between the two ends of the queue. This data specifies, for
+	// example, whether EventFD signaling is enabled or disabled.
+	SharedDataFD int
+}
+
+type endpoint struct {
+	// mtu is the maximum frame size, ethernet header included (see MTU()).
+	mtu uint32
+
+	// bufferSize is the size of each individual buffer.
+	bufferSize uint32
+
+	// addr is the local link address; the default ethernet source address.
+	addr tcpip.LinkAddress
+
+	// rx is the receive queue.
+	rx rx
+
+	// stopRequested is to be accessed atomically only, and determines if
+	// the worker goroutines should stop.
+	stopRequested uint32
+
+	// completed is the wait group indicating that all workers have stopped.
+	completed sync.WaitGroup
+
+	// mu protects the following fields.
+	mu sync.Mutex
+
+	// tx is the transmit queue.
+	tx tx
+
+	// workerStarted specifies whether the worker goroutine was started.
+	workerStarted bool
+}
+
+// New builds an endpoint backed by the tx and rx shared-memory queues described
+// by the given configs; both queues are initialized with "bufferSize".
+func New(mtu, bufferSize uint32, addr tcpip.LinkAddress, tx, rx QueueConfig) (stack.LinkEndpoint, error) {
+	ep := new(endpoint)
+	ep.mtu = mtu
+	ep.bufferSize = bufferSize
+	ep.addr = addr
+
+	// Bring up the transmit side first; nothing to undo if it fails.
+	if txErr := ep.tx.init(bufferSize, &tx); txErr != nil {
+		return nil, txErr
+	}
+
+	// A failed receive init must release the already-initialized tx queue.
+	if rxErr := ep.rx.init(bufferSize, &rx); rxErr != nil {
+		ep.tx.cleanup()
+		return nil, rxErr
+	}
+	return ep, nil
+}
+
+// Close stops the endpoint; its queues are freed here or by the worker.
+func (e *endpoint) Close() {
+	// Raise the stop flag first, then signal the rx eventfd so a dispatch
+	// goroutine blocked on it wakes up and notices the flag.
+	atomic.StoreUint32(&e.stopRequested, 1)
+	syscall.Write(e.rx.eventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+	// Attach() refuses to start a worker once stopRequested is set, so the
+	// value sampled under the lock here is final.
+	e.mu.Lock()
+	started := e.workerStarted
+	e.mu.Unlock()
+	if started {
+		return // The worker cleans up the queues itself when it exits.
+	}
+	e.tx.cleanup()
+	e.rx.cleanup()
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It blocks until every worker
+// registered on the completed wait group has finished.
+func (e *endpoint) Wait() {
+	e.completed.Wait()
+}
+
+// Attach implements stack.LinkEndpoint.Attach. It starts the rx dispatch
+// goroutine at most once, and never after a stop has been requested.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	if e.workerStarted || atomic.LoadUint32(&e.stopRequested) != 0 {
+		return
+	}
+	e.workerStarted = true
+	e.completed.Add(1)
+	// Link endpoints are not savable. When transportation endpoints are
+	// saved, they stop sending outgoing packets and reject incoming ones.
+	go e.dispatchLoop(dispatcher) // S/R-SAFE: see above.
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached: was the worker started?
+func (e *endpoint) IsAttached() bool {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	return e.workerStarted
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the mtu given at
+// construction minus the ethernet header size.
+func (e *endpoint) MTU() uint32 {
+	return e.mtu - header.EthernetMinimumSize
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities; none are set.
+func (*endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return 0
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns
+// header.EthernetMinimumSize, the room WritePacket prepends for its header.
+func (*endpoint) MaxHeaderLength() uint16 {
+	return header.EthernetMinimumSize
+}
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the link
+// address that was passed to New().
+func (e *endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.addr
+}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket. It prepends an ethernet
+// header and hands the packet to the tx queue, or returns ErrWouldBlock.
+func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	// Reserve room for the ethernet header in front of the packet headers.
+	frame := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
+	pkt.LinkHeader = buffer.View(frame)
+
+	// The route's local link address, when set, overrides the endpoint's.
+	src := e.addr
+	if r.LocalLinkAddress != "" {
+		src = r.LocalLinkAddress
+	}
+	frame.Encode(&header.EthernetFields{
+		SrcAddr: src,
+		DstAddr: r.RemoteLinkAddress,
+		Type:    protocol,
+	})
+
+	payload := pkt.Data.ToView()
+
+	// The tx queue is guarded by mu.
+	e.mu.Lock()
+	sent := e.tx.transmit(pkt.Header.View(), payload)
+	e.mu.Unlock()
+	if sent {
+		return nil
+	}
+	return tcpip.ErrWouldBlock
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets. It always panics.
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	// Flatten the caller's views; the frame is sent as-is, with no header
+	// added by the endpoint.
+	raw := vv.ToView()
+
+	e.mu.Lock()
+	sent := e.tx.transmit(raw, buffer.View{})
+	e.mu.Unlock()
+	if sent {
+		return nil
+	}
+	return tcpip.ErrWouldBlock
+}
+
+// dispatchLoop reads packets from the rx queue in a loop and dispatches them
+// to the network stack. On exit it cleans up both queues and marks completed.
+func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
+	// Post the initial buffers, capped by how many fit in the rx data area.
+	limit := e.rx.q.PostedBuffersLimit()
+	if l := uint64(len(e.rx.data)) / uint64(e.bufferSize); limit > l {
+		limit = l
+	}
+	for i := uint64(0); i < limit; i++ {
+		b := queue.RxBuffer{
+			Offset: i * uint64(e.bufferSize),
+			Size:   e.bufferSize,
+			ID:     i,
+		}
+		if !e.rx.q.PostBuffers([]queue.RxBuffer{b}) {
+			log.Warningf("Unable to post %v-th buffer", i)
+		}
+	}
+
+	// Read in a loop until a stop is requested.
+	var rxb []queue.RxBuffer
+	for atomic.LoadUint32(&e.stopRequested) == 0 {
+		var n uint32
+		rxb, n = e.rx.postAndReceive(rxb, &e.stopRequested)
+
+		// Copy data from the shared area to its own buffer, then
+		// prepare to repost the buffer.
+		b := make([]byte, n)
+		offset := uint32(0)
+		for i := range rxb {
+			copy(b[offset:], e.rx.data[rxb[i].Offset:][:rxb[i].Size])
+			offset += rxb[i].Size
+
+			rxb[i].Size = e.bufferSize
+		}
+
+		if n < header.EthernetMinimumSize {
+			continue // Too short to contain an ethernet header.
+		}
+
+		// Send packet up the stack.
+		eth := header.Ethernet(b[:header.EthernetMinimumSize])
+		d.DeliverNetworkPacket(eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), &stack.PacketBuffer{
+			Data:       buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
+			LinkHeader: buffer.View(eth),
+		})
+	}
+
+	// Clean state: release both queues, then unblock Wait().
+	e.tx.cleanup()
+	e.rx.cleanup()
+
+	e.completed.Done()
+}
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
new file mode 100644
index 000000000..28a2e88ba
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -0,0 +1,812 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package sharedmem
+
+import (
+	"bytes"
+	"io/ioutil"
+	"math/rand"
+	"os"
+	"strings"
+	"syscall"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	localLinkAddr  = "\xde\xad\xbe\xef\x56\x78"
+	remoteLinkAddr = "\xde\xad\xbe\xef\x12\x34"
+
+	queueDataSize = 1024 * 1024 // Size of each queue's data file.
+	queuePipeSize = 4096        // Size of each tx/rx pipe file.
+)
+
+type queueBuffers struct {
+	data []byte  // Buffer obtained for the queue's DataFD.
+	rx   pipe.Tx // Initialized from RxPipeFD; the test pushes into it.
+	tx   pipe.Rx // Initialized from TxPipeFD; the test pulls from it.
+}
+
+func initQueue(t *testing.T, q *queueBuffers, c *QueueConfig) {
+	// mustGet returns the buffer getBuffer yields for fd, aborting the test
+	// on failure with the message shared by all three call sites.
+	mustGet := func(fd int) []byte {
+		buf, err := getBuffer(fd)
+		if err != nil {
+			t.Fatalf("getBuffer failed: %v", err)
+		}
+		return buf
+	}
+
+	// Prepare tx pipe.
+	q.tx.Init(mustGet(c.TxPipeFD))
+
+	// Prepare rx pipe.
+	q.rx.Init(mustGet(c.RxPipeFD))
+
+	// Get data slice; it is stored as-is so tests can read and write
+	// packet contents directly.
+	q.data = mustGet(c.DataFD)
+}
+
+func (q *queueBuffers) cleanup() {
+	for _, m := range [][]byte{q.tx.Bytes(), q.rx.Bytes(), q.data} {
+		syscall.Munmap(m) // Best effort; the result is ignored.
+	}
+}
+
+type packetInfo struct {
+	addr       tcpip.LinkAddress           // Remote link address of the delivery.
+	proto      tcpip.NetworkProtocolNumber // Network protocol of the delivery.
+	vv         buffer.VectorisedView       // Clone of the delivered packet data.
+	linkHeader buffer.View                 // Not populated by DeliverNetworkPacket.
+}
+
+type testContext struct {
+	t     *testing.T   // Test that owns this context.
+	ep    *endpoint    // Endpoint under test.
+	txCfg QueueConfig  // FDs backing the endpoint's tx queue.
+	rxCfg QueueConfig  // FDs backing the endpoint's rx queue.
+	txq   queueBuffers // Test-side view of the tx queue.
+	rxq   queueBuffers // Test-side view of the rx queue.
+
+	packetCh chan struct{} // Receives one value per delivered packet.
+	mu       sync.Mutex    // Guards packets.
+	packets  []packetInfo  // Packets recorded by DeliverNetworkPacket.
+}
+
+func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress) *testContext {
+	var err error
+	c := &testContext{
+		t:        t,
+		packetCh: make(chan struct{}, 1000000),
+	}
+	c.txCfg = createQueueFDs(t, queueSizes{
+		dataSize:       queueDataSize,
+		txPipeSize:     queuePipeSize,
+		rxPipeSize:     queuePipeSize,
+		sharedDataSize: 4096,
+	})
+
+	c.rxCfg = createQueueFDs(t, queueSizes{
+		dataSize:       queueDataSize,
+		txPipeSize:     queuePipeSize,
+		rxPipeSize:     queuePipeSize,
+		sharedDataSize: 4096,
+	})
+
+	initQueue(t, &c.txq, &c.txCfg)
+	initQueue(t, &c.rxq, &c.rxCfg)
+
+	ep, err := New(mtu, bufferSize, addr, c.txCfg, c.rxCfg)
+	if err != nil {
+		t.Fatalf("New failed: %v", err)
+	}
+
+	c.ep = ep.(*endpoint)
+	c.ep.Attach(c)
+
+	return c
+}
+
+func (c *testContext) DeliverNetworkPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	info := packetInfo{
+		addr:  remoteLinkAddr,
+		proto: proto,
+		vv:    pkt.Data.Clone(nil), // Private copy of the payload.
+	}
+	c.mu.Lock()
+	c.packets = append(c.packets, info)
+	c.mu.Unlock()
+	c.packetCh <- struct{}{} // Wake up waitForPackets.
+}
+
+func (c *testContext) cleanup() {
+	c.ep.Close() // Asks the dispatch goroutine to stop.
+	closeFDs(&c.txCfg)
+	closeFDs(&c.rxCfg)
+	c.txq.cleanup() // Unmaps the test-side view of the tx queue.
+	c.rxq.cleanup() // Unmaps the test-side view of the rx queue.
+}
+
+func (c *testContext) waitForPackets(n int, to <-chan time.Time, errorStr string) {
+	for i := 0; i < n; i++ {
+		select {
+		case <-c.packetCh: // One more packet was delivered.
+		case <-to:
+			c.t.Fatal(errorStr) // Plain message, not a format string.
+		}
+	}
+}
+
+func (c *testContext) pushRxCompletion(size uint32, bs []queue.RxBuffer) {
+	// Reserve space in the rx pipe for a completion that describes len(bs)
+	// buffers, then encode the header followed by one entry per buffer.
+	entry := c.rxq.rx.Push(queue.RxCompletionSize(len(bs)))
+	queue.EncodeRxCompletion(entry, size, 0)
+
+	for i, rb := range bs {
+		desc := queue.RxBuffer{Offset: rb.Offset, Size: rb.Size, ID: rb.ID}
+		queue.EncodeRxCompletionBuffer(entry, i, desc)
+	}
+}
+
+func randomFill(b []byte) {
+	for i := 0; i < len(b); i++ {
+		b[i] = byte(rand.Intn(256)) // One RNG draw per byte.
+	}
+}
+
+func shuffle(b []int) {
+	for last := len(b); last > 0; last-- {
+		pick := rand.Intn(last) // Uniform over slots [0, last).
+		b[last-1], b[pick] = b[pick], b[last-1]
+	}
+}
+
+func createFile(t *testing.T, size int64, initQueue bool) int {
+	tmpDir := os.Getenv("TEST_TMPDIR")
+	if tmpDir == "" {
+		tmpDir = os.Getenv("TMPDIR")
+	}
+	f, err := ioutil.TempFile(tmpDir, "sharedmem_test")
+	if err != nil {
+		t.Fatalf("TempFile failed: %v", err)
+	}
+	defer f.Close()
+	syscall.Unlink(f.Name()) // Drop the name right away; only open FDs keep the file.
+
+	if initQueue {
+		// Write the "slot-free" flag (top bit of the first 8 bytes) in the initial queue.
+		_, err := f.WriteAt([]byte{0, 0, 0, 0, 0, 0, 0, 0x80}, 0)
+		if err != nil {
+			t.Fatalf("WriteAt failed: %v", err)
+		}
+	}
+
+	fd, err := syscall.Dup(int(f.Fd())) // Dup so the fd survives the deferred f.Close().
+	if err != nil {
+		t.Fatalf("Dup failed: %v", err)
+	}
+
+	if err := syscall.Ftruncate(fd, size); err != nil {
+		syscall.Close(fd)
+		t.Fatalf("Ftruncate failed: %v", err)
+	}
+
+	return fd
+}
+
+func closeFDs(c *QueueConfig) {
+	// Close every descriptor of the config, in the original order.
+	fds := []int{c.DataFD, c.EventFD, c.TxPipeFD, c.RxPipeFD, c.SharedDataFD}
+	for _, fd := range fds {
+		syscall.Close(fd) // Best effort; the result is ignored.
+	}
+}
+
+type queueSizes struct {
+	dataSize       int64 // Size of the file behind DataFD.
+	txPipeSize     int64 // Size of the file behind TxPipeFD.
+	rxPipeSize     int64 // Size of the file behind RxPipeFD.
+	sharedDataSize int64 // Size of the file behind SharedDataFD.
+}
+
+func createQueueFDs(t *testing.T, s queueSizes) QueueConfig {
+	fd, _, err := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, 0, 0)
+	if err != 0 {
+		t.Fatalf("eventfd failed: %v", error(err))
+	}
+
+	return QueueConfig{
+		EventFD:      int(fd),
+		DataFD:       createFile(t, s.dataSize, false),
+		TxPipeFD:     createFile(t, s.txPipeSize, true),
+		RxPipeFD:     createFile(t, s.rxPipeSize, true),
+		SharedDataFD: createFile(t, s.sharedDataSize, false),
+	}
+}
+
+// TestSimpleSend sends 1000 packets with random header and payload sizes,
+// then checks that the right payload is received on the shared memory queues.
+func TestSimpleSend(t *testing.T) {
+	c := newTestContext(t, 20000, 1500, localLinkAddr)
+	defer c.cleanup()
+
+	// Prepare route.
+	r := stack.Route{
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+
+	for iters := 1000; iters > 0; iters-- {
+		func() {
+			// Prepare and send packet.
+			n := rand.Intn(10000)
+			hdr := buffer.NewPrependable(n + int(c.ep.MaxHeaderLength()))
+			hdrBuf := hdr.Prepend(n)
+			randomFill(hdrBuf)
+
+			n = rand.Intn(10000)
+			buf := buffer.NewView(n)
+			randomFill(buf)
+
+			proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
+			if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
+				Header: hdr,
+				Data:   buf.ToVectorisedView(),
+			}); err != nil {
+				t.Fatalf("WritePacket failed: %v", err)
+			}
+
+			// Receive packet: pull its descriptor and gather its buffers.
+			desc := c.txq.tx.Pull()
+			pi := queue.DecodeTxPacketHeader(desc)
+			if pi.Reserved != 0 {
+				t.Fatalf("Reserved value is non-zero: 0x%x", pi.Reserved)
+			}
+			contents := make([]byte, 0, pi.Size)
+			for i := 0; i < pi.BufferCount; i++ {
+				bi := queue.DecodeTxBufferHeader(desc, i)
+				contents = append(contents, c.txq.data[bi.Offset:][:bi.Size]...)
+			}
+			c.txq.tx.Flush()
+
+			defer func() {
+				// Tell the endpoint about the completion of the write.
+				b := c.txq.rx.Push(8)
+				queue.EncodeTxCompletion(b, pi.ID)
+				c.txq.rx.Flush()
+			}()
+
+			// Check the ethernet header.
+			ethTemplate := make(header.Ethernet, header.EthernetMinimumSize)
+			ethTemplate.Encode(&header.EthernetFields{
+				SrcAddr: localLinkAddr,
+				DstAddr: remoteLinkAddr,
+				Type:    proto,
+			})
+			if got := contents[:header.EthernetMinimumSize]; !bytes.Equal(got, []byte(ethTemplate)) {
+				t.Fatalf("Bad ethernet header in packet: got %x, want %x", got, ethTemplate)
+			}
+
+			// Compare contents skipping the ethernet header added by the
+			// endpoint.
+			merged := append(hdrBuf, buf...)
+			if uint32(len(contents)) < pi.Size {
+				t.Fatalf("Sum of buffers is less than packet size: %v < %v", len(contents), pi.Size)
+			}
+			contents = contents[:pi.Size][header.EthernetMinimumSize:]
+
+			if !bytes.Equal(contents, merged) {
+				t.Fatalf("Buffers are different: got %x (%v bytes), want %x (%v bytes)", contents, len(contents), merged, len(merged))
+			}
+		}()
+	}
+}
+
+// TestPreserveSrcAddressInSend calls WritePacket once with LocalLinkAddress
+// set in Route (using much of the same code as TestSimpleSend), then checks
+// that the encoded ethernet header received includes the correct SrcAddr.
+func TestPreserveSrcAddressInSend(t *testing.T) {
+	c := newTestContext(t, 20000, 1500, localLinkAddr)
+	defer c.cleanup()
+
+	newLocalLinkAddress := tcpip.LinkAddress(strings.Repeat("\xFE", 6)) // Six 0xFE bytes.
+	// Set both remote and local link address in route.
+	r := stack.Route{
+		RemoteLinkAddress: remoteLinkAddr,
+		LocalLinkAddress:  newLocalLinkAddress,
+	}
+
+	// WritePacket panics given a prependable with anything less than
+	// the minimum size of the ethernet header.
+	hdr := buffer.NewPrependable(header.EthernetMinimumSize)
+
+	proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
+	if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
+		Header: hdr,
+	}); err != nil {
+		t.Fatalf("WritePacket failed: %v", err)
+	}
+
+	// Receive packet: pull its descriptor and gather its buffers.
+	desc := c.txq.tx.Pull()
+	pi := queue.DecodeTxPacketHeader(desc)
+	if pi.Reserved != 0 {
+		t.Fatalf("Reserved value is non-zero: 0x%x", pi.Reserved)
+	}
+	contents := make([]byte, 0, pi.Size)
+	for i := 0; i < pi.BufferCount; i++ {
+		bi := queue.DecodeTxBufferHeader(desc, i)
+		contents = append(contents, c.txq.data[bi.Offset:][:bi.Size]...)
+	}
+	c.txq.tx.Flush()
+
+	defer func() {
+		// Tell the endpoint about the completion of the write.
+		b := c.txq.rx.Push(8)
+		queue.EncodeTxCompletion(b, pi.ID)
+		c.txq.rx.Flush()
+	}()
+
+	// Check that the ethernet header contains the expected SrcAddr.
+	ethTemplate := make(header.Ethernet, header.EthernetMinimumSize)
+	ethTemplate.Encode(&header.EthernetFields{
+		SrcAddr: newLocalLinkAddress,
+		DstAddr: remoteLinkAddr,
+		Type:    proto,
+	})
+	if got := contents[:header.EthernetMinimumSize]; !bytes.Equal(got, []byte(ethTemplate)) {
+		t.Fatalf("Bad ethernet header in packet: got %x, want %x", got, ethTemplate)
+	}
+}
+
+// TestFillTxQueue sends packets until the queue is full.
+func TestFillTxQueue(t *testing.T) {
+	c := newTestContext(t, 20000, 1500, localLinkAddr)
+	defer c.cleanup()
+
+	// Prepare to send a packet.
+	r := stack.Route{
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+
+	buf := buffer.NewView(100)
+
+	// Each packet uses no more than 40 bytes, so write that many packets
+	// until the tx queue is full.
+	ids := make(map[uint64]struct{})
+	for i := queuePipeSize / 40; i > 0; i-- {
+		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
+			t.Fatalf("WritePacket failed unexpectedly: %v", err)
+		}
+
+		// Check that they have different IDs.
+		desc := c.txq.tx.Pull()
+		pi := queue.DecodeTxPacketHeader(desc)
+		if _, ok := ids[pi.ID]; ok {
+			t.Fatalf("ID (%v) reused", pi.ID)
+		}
+		ids[pi.ID] = struct{}{}
+	}
+
+	// Next attempt to write must fail.
+	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   buf.ToVectorisedView(),
+	}); err != want {
+		t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+	}
+}
+
+// TestFillTxQueueAfterBadCompletion sends a bad completion, then sends packets
+// until the queue is full.
+func TestFillTxQueueAfterBadCompletion(t *testing.T) {
+	c := newTestContext(t, 20000, 1500, localLinkAddr)
+	defer c.cleanup()
+
+	// Send a bad completion: no packet has been written yet.
+	queue.EncodeTxCompletion(c.txq.rx.Push(8), 1)
+	c.txq.rx.Flush()
+
+	// Prepare to send a packet.
+	r := stack.Route{
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+
+	buf := buffer.NewView(100)
+
+	// Send two packets so that the id slice has at least two slots.
+	for i := 2; i > 0; i-- {
+		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
+			t.Fatalf("WritePacket failed unexpectedly: %v", err)
+		}
+	}
+
+	// Complete the two writes twice.
+	for i := 2; i > 0; i-- {
+		pi := queue.DecodeTxPacketHeader(c.txq.tx.Pull())
+
+		queue.EncodeTxCompletion(c.txq.rx.Push(8), pi.ID)
+		queue.EncodeTxCompletion(c.txq.rx.Push(8), pi.ID)
+		c.txq.rx.Flush()
+	}
+	c.txq.tx.Flush()
+
+	// Each packet uses no more than 40 bytes, so write that many packets
+	// until the tx queue is full.
+	ids := make(map[uint64]struct{})
+	for i := queuePipeSize / 40; i > 0; i-- {
+		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
+			t.Fatalf("WritePacket failed unexpectedly: %v", err)
+		}
+
+		// Check that they have different IDs.
+		desc := c.txq.tx.Pull()
+		pi := queue.DecodeTxPacketHeader(desc)
+		if _, ok := ids[pi.ID]; ok {
+			t.Fatalf("ID (%v) reused", pi.ID)
+		}
+		ids[pi.ID] = struct{}{}
+	}
+
+	// Next attempt to write must fail.
+	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+	if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   buf.ToVectorisedView(),
+	}); err != want {
+		t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+	}
+}
+
+// TestFillTxMemory sends packets until we run out of shared memory.
+func TestFillTxMemory(t *testing.T) {
+	const bufferSize = 1500
+	c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+	defer c.cleanup()
+
+	// Prepare to send a packet.
+	r := stack.Route{
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+
+	buf := buffer.NewView(100)
+
+	// Each packet uses up one buffer, so write as many as possible until
+	// we fill the memory.
+	ids := make(map[uint64]struct{})
+	for i := queueDataSize / bufferSize; i > 0; i-- {
+		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
+			t.Fatalf("WritePacket failed unexpectedly: %v", err)
+		}
+
+		// Check that they have different IDs.
+		desc := c.txq.tx.Pull()
+		pi := queue.DecodeTxPacketHeader(desc)
+		if _, ok := ids[pi.ID]; ok {
+			t.Fatalf("ID (%v) reused", pi.ID)
+		}
+		ids[pi.ID] = struct{}{}
+		c.txq.tx.Flush()
+	}
+
+	// Next attempt to write must fail.
+	hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+	err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   buf.ToVectorisedView(),
+	})
+	if want := tcpip.ErrWouldBlock; err != want {
+		t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+	}
+}
+
+// TestFillTxMemoryWithMultiBuffer sends packets until we run out of
+// shared memory for a 2-buffer packet, but still with room for a 1-buffer
+// packet.
+func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
+	const bufferSize = 1500
+	c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+	defer c.cleanup()
+
+	// Prepare to send a packet.
+	r := stack.Route{
+		RemoteLinkAddress: remoteLinkAddr,
+	}
+
+	buf := buffer.NewView(100)
+
+	// Each packet uses up one buffer, so write as many as possible
+	// until there is only one buffer left.
+	for i := queueDataSize/bufferSize - 1; i > 0; i-- {
+		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
+			t.Fatalf("WritePacket failed unexpectedly: %v", err)
+		}
+
+		// Pull the posted buffer.
+		c.txq.tx.Pull()
+		c.txq.tx.Flush()
+	}
+
+	// Attempt to write a two-buffer packet. It must fail.
+	{
+		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+		uu := buffer.NewView(bufferSize).ToVectorisedView()
+		if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   uu,
+		}); err != want {
+			t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+		}
+	}
+
+	// Attempt to write the one-buffer packet again. It must succeed.
+	{
+		hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+		if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   buf.ToVectorisedView(),
+		}); err != nil {
+			t.Fatalf("WritePacket failed unexpectedly: %v", err)
+		}
+	}
+}
+
+func pollPull(t *testing.T, p *pipe.Rx, to <-chan time.Time, errStr string) []byte {
+	t.Helper()
+
+	for {
+		b := p.Pull()
+		if b != nil {
+			return b
+		}
+
+		select {
+		case <-time.After(10 * time.Millisecond):
+		case <-to:
+			t.Fatal(errStr)
+		}
+	}
+}
+
+// TestSimpleReceive completes 1000 different receives with random payload and
+// random number of buffers. It checks that the contents match the expected
+// values.
+func TestSimpleReceive(t *testing.T) {
+	const bufferSize = 1500
+	c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+	defer c.cleanup()
+
+	// Check that buffers have been posted.
+	limit := c.ep.rx.q.PostedBuffersLimit()
+	for i := uint64(0); i < limit; i++ {
+		timeout := time.After(2 * time.Second)
+		bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for all buffers to be posted"))
+
+		if want := i * bufferSize; want != bi.Offset {
+			t.Fatalf("Bad posted offset: got %v, want %v", bi.Offset, want)
+		}
+
+		if want := i; want != bi.ID {
+			t.Fatalf("Bad posted ID: got %v, want %v", bi.ID, want)
+		}
+
+		if bufferSize != bi.Size {
+			t.Fatalf("Bad posted bufferSize: got %v, want %v", bi.Size, bufferSize)
+		}
+	}
+	c.rxq.tx.Flush()
+
+	// Create a slice with the indices 0..limit-1.
+	idx := make([]int, limit)
+	for i := range idx {
+		idx[i] = i
+	}
+
+	// Complete random packets 1000 times.
+	for iters := 1000; iters > 0; iters-- {
+		timeout := time.After(2 * time.Second)
+		// Prepare a random packet spread over 1 to 10 randomly chosen buffers.
+		shuffle(idx)
+		n := 1 + rand.Intn(10)
+		bufs := make([]queue.RxBuffer, n)
+		contents := make([]byte, bufferSize*n-rand.Intn(500))
+		randomFill(contents)
+		for i := range bufs {
+			j := idx[i]
+			bufs[i].Size = bufferSize
+			bufs[i].Offset = uint64(bufferSize * j)
+			bufs[i].ID = uint64(j)
+
+			copy(c.rxq.data[bufs[i].Offset:][:bufferSize], contents[i*bufferSize:])
+		}
+
+		// Push completion, then signal the rx eventfd.
+		c.pushRxCompletion(uint32(len(contents)), bufs)
+		c.rxq.rx.Flush()
+		syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+		// Wait for packet to be received, then check it.
+		c.waitForPackets(1, time.After(5*time.Second), "Timeout waiting for packet")
+		c.mu.Lock()
+		rcvd := []byte(c.packets[0].vv.ToView())
+		c.packets = c.packets[:0]
+		c.mu.Unlock()
+
+		if contents := contents[header.EthernetMinimumSize:]; !bytes.Equal(contents, rcvd) {
+			t.Fatalf("Unexpected buffer contents: got %x, want %x", rcvd, contents)
+		}
+
+		// Check that buffers have been reposted.
+		for i := range bufs {
+			bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for buffers to be reposted"))
+			if bi != bufs[i] {
+				t.Fatalf("Unexpected buffer reposted: got %x, want %x", bi, bufs[i])
+			}
+		}
+		c.rxq.tx.Flush()
+	}
+}
+
+// TestRxBuffersReposted tests that rx buffers get reposted after they have been
+// completed, both one at a time and in pairs.
+func TestRxBuffersReposted(t *testing.T) {
+	const bufferSize = 1500
+	c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+	defer c.cleanup()
+
+	// Receive all posted buffers.
+	limit := c.ep.rx.q.PostedBuffersLimit()
+	buffers := make([]queue.RxBuffer, 0, limit)
+	for i := limit; i > 0; i-- {
+		timeout := time.After(2 * time.Second)
+		buffers = append(buffers, queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for all buffers")))
+	}
+	c.rxq.tx.Flush()
+
+	// Check that all buffers are reposted when individually completed.
+	for i := range buffers {
+		timeout := time.After(2 * time.Second)
+		// Complete the buffer, then signal the rx eventfd.
+		c.pushRxCompletion(buffers[i].Size, buffers[i:][:1])
+		c.rxq.rx.Flush()
+		syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+		// Wait for it to be reposted.
+		bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for buffer to be reposted"))
+		if bi != buffers[i] {
+			t.Fatalf("Different buffer posted: got %v, want %v", bi, buffers[i])
+		}
+	}
+	c.rxq.tx.Flush()
+
+	// Check that all buffers are reposted when completed in pairs.
+	for i := 0; i < len(buffers)/2; i++ {
+		timeout := time.After(2 * time.Second)
+		// Complete with two buffers, then signal the rx eventfd.
+		c.pushRxCompletion(2*bufferSize, buffers[2*i:][:2])
+		c.rxq.rx.Flush()
+		syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+		// Wait for them to be reposted.
+		for j := 0; j < 2; j++ {
+			bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for buffer to be reposted"))
+			if bi != buffers[2*i+j] {
+				t.Fatalf("Different buffer posted: got %v, want %v", bi, buffers[2*i+j])
+			}
+		}
+	}
+	c.rxq.tx.Flush()
+}
+
+// TestReceivePostingIsFull checks that the endpoint will properly handle the
+// case when a received buffer cannot be immediately reposted because it hasn't
+// been pulled from the tx pipe yet.
+func TestReceivePostingIsFull(t *testing.T) {
+	const bufferSize = 1500
+	c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+	defer c.cleanup()
+
+	// Complete first posted buffer before flushing it from the tx pipe.
+	first := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for first buffer to be posted"))
+	c.pushRxCompletion(first.Size, []queue.RxBuffer{first})
+	c.rxq.rx.Flush()
+	syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+	// Check that packet is received.
+	c.waitForPackets(1, time.After(time.Second), "Timeout waiting for completed packet")
+
+	// Complete another buffer, still without flushing the tx pipe.
+	second := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for second buffer to be posted"))
+	c.pushRxCompletion(second.Size, []queue.RxBuffer{second})
+	c.rxq.rx.Flush()
+	syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+	// Check that no packet is received yet, as the worker is blocked trying
+	// to repost.
+	select {
+	case <-time.After(500 * time.Millisecond):
+	case <-c.packetCh:
+		t.Fatalf("Unexpected packet received")
+	}
+
+	// Flush tx queue, which will allow the first buffer to be reposted,
+	// and the second completion to be pulled.
+	c.rxq.tx.Flush()
+	syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+	// Check that second packet completes.
+	c.waitForPackets(1, time.After(time.Second), "Timeout waiting for second completed packet")
+}
+
+// TestCloseWhileWaitingToPost closes the endpoint while it is waiting to
+// repost a buffer, and checks that the worker backs out (Wait returns).
+func TestCloseWhileWaitingToPost(t *testing.T) {
+	const bufferSize = 1500
+	c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+	cleaned := false // Set once cleanup has run, so the deferred call skips it.
+	defer func() {
+		if !cleaned {
+			c.cleanup()
+		}
+	}()
+
+	// Complete first posted buffer before flushing it from the tx pipe.
+	bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for initial buffer to be posted"))
+	c.pushRxCompletion(bi.Size, []queue.RxBuffer{bi})
+	c.rxq.rx.Flush()
+	syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+	// Wait for packet to be indicated.
+	c.waitForPackets(1, time.After(time.Second), "Timeout waiting for completed packet")
+
+	// Cleanup and wait for worker to complete.
+	c.cleanup()
+	cleaned = true
+	c.ep.Wait()
+}
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go
new file mode 100644
index 000000000..f7e816a41
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sharedmem
+
+import (
+	"unsafe"
+)
+
+// sharedDataPointer reinterprets the first four bytes of the shared data
+// slice as a *uint32 so that they can be used in atomic operations.
+func sharedDataPointer(sharedData []byte) *uint32 {
+	return (*uint32)(unsafe.Pointer(&sharedData[0:4][0])) // the [0:4] reslice panics if the slice's capacity is below four bytes
+}
diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go
new file mode 100644
index 000000000..6b8d7859d
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/tx.go
@@ -0,0 +1,272 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sharedmem
+
+import (
+	"math"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+)
+
+const (
+	nilID = math.MaxUint64 // sentinel ID that marks the end of idManager's free list
+)
+
+// tx holds all state associated with a tx queue.
+type tx struct {
+	data []byte        // mapping of the data file; packet bytes are copied into it
+	q    queue.Tx      // queue built on the tx and rx pipe mappings
+	ids  idManager     // maps packet IDs to the buffer chains in flight
+	bufs bufferManager // allocator for fixed-size buffers inside data
+}
+
+// init maps the tx pipe, rx pipe and data regions described by c and sets up
+// the queue, id manager and buffer manager on top of them.
+//
+// File descriptors in c remain owned by the caller; init only maps their
+// contents. On failure every region mapped so far is unmapped again.
+func (t *tx) init(mtu uint32, c *QueueConfig) error {
+	// Map the three shared regions, undoing earlier mappings on error.
+	txMem, err := getBuffer(c.TxPipeFD)
+	if err != nil {
+		return err
+	}
+
+	rxMem, err := getBuffer(c.RxPipeFD)
+	if err != nil {
+		syscall.Munmap(txMem)
+		return err
+	}
+
+	dataMem, err := getBuffer(c.DataFD)
+	if err != nil {
+		syscall.Munmap(txMem)
+		syscall.Munmap(rxMem)
+		return err
+	}
+
+	// Wire up the queue and the allocators; every buffer is mtu bytes long.
+	t.data = dataMem
+	t.q.Init(txMem, rxMem)
+	t.ids.init()
+	t.bufs.init(0, len(dataMem), int(mtu))
+
+	return nil
+}
+
+// cleanup unmaps the two queue regions and the data region that init() mapped.
+// Calling it without a prior successful init() is not supported.
+func (t *tx) cleanup() {
+	first, second := t.q.Bytes()
+	for _, region := range [][]byte{first, second, t.data} {
+		syscall.Munmap(region)
+	}
+}
+
+// transmit copies a followed by b into shared buffers and enqueues them as one
+// packet. It returns false if buffers run out or the queue rejects the packet.
+func (t *tx) transmit(a, b []byte) bool {
+	// Pull completions from the tx queue and add their buffers back to the
+	// pool so that we can reuse them.
+	for {
+		id, ok := t.q.CompletedPacket()
+		if !ok {
+			break
+		}
+
+		if buf := t.ids.remove(id); buf != nil {
+			t.bufs.free(buf)
+		}
+	}
+
+	bSize := t.bufs.entrySize
+	total := uint32(len(a) + len(b))
+	bufCount := (total + bSize - 1) / bSize // total/bSize, rounded up
+
+	// Allocate enough buffers to hold all the data.
+	var buf *queue.TxBuffer
+	for i := bufCount; i != 0; i-- {
+		b := t.bufs.alloc() // NOTE: shadows the parameter b within this loop
+		if b == nil {
+			// Failed to get all buffers. Return to the pool
+			// whatever we had managed to get.
+			if buf != nil {
+				t.bufs.free(buf)
+			}
+			return false
+		}
+		b.Next = buf // prepend to the chain allocated so far
+		buf = b
+	}
+
+	// Copy data into allocated buffers, moving to the next buffer in the
+	// chain whenever the current one is full.
+	nBuf := buf
+	var dBuf []byte
+	for _, data := range [][]byte{a, b} {
+		for len(data) > 0 {
+			if len(dBuf) == 0 {
+				dBuf = t.data[nBuf.Offset:][:nBuf.Size]
+				nBuf = nBuf.Next
+			}
+			n := copy(dBuf, data)
+			data = data[n:]
+			dBuf = dBuf[n:]
+		}
+	}
+
+	// Get an id for this packet and send it out.
+	id := t.ids.add(buf)
+	if !t.q.Enqueue(id, total, bufCount, buf) {
+		t.ids.remove(id)
+		t.bufs.free(buf) // NOTE(review): buf is nil here when a and b are both empty
+		return false
+	}
+
+	return true
+}
+
+// getBuffer maps the whole file behind fd into memory as a shared, readable
+// and writable region, and returns that mapping.
+func getBuffer(fd int) ([]byte, error) {
+	var st syscall.Stat_t
+	if err := syscall.Fstat(fd, &st); err != nil {
+		return nil, err
+	}
+
+	// The mapping length is an int, so reject sizes that do not fit in one.
+	const maxInt = int64(^uint(0) >> 1)
+	if st.Size > maxInt {
+		return nil, syscall.EDOM
+	}
+	return syscall.Mmap(fd, 0, int(st.Size), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FILE)
+}
+
+// idDescriptor is used by idManager to either point to a tx buffer (in case
+// the ID is assigned) or to the next free element (if the ID is not assigned).
+type idDescriptor struct {
+	buf      *queue.TxBuffer // buffer bound to this ID; nil while the ID is free
+	nextFree uint64          // next ID on the free list; set when the ID is freed
+}
+
+// idManager is a manager of tx buffer identifiers. It assigns unique IDs to
+// tx buffers that are added to it; the IDs can only be reused after they have
+// been removed.
+//
+// The ID assignments are stored so that the tx buffers can be retrieved from
+// the IDs previously assigned to them.
+type idManager struct {
+	// ids is a slice containing all tx buffers. The ID is the index into
+	// this slice.
+	ids []idDescriptor
+
+	// freeList is the head of the list of free IDs, or nilID if it is empty.
+	freeList uint64
+}
+
+// init initializes the id manager with an empty free list.
+func (m *idManager) init() {
+	m.freeList = nilID
+}
+
+// add binds b to an ID and returns it, recycling a freed ID when one exists.
+func (m *idManager) add(b *queue.TxBuffer) uint64 {
+	if m.freeList == nilID {
+		// No recycled ID is available: grow the table by one entry.
+		id := uint64(len(m.ids))
+		m.ids = append(m.ids, idDescriptor{buf: b})
+		return id
+	}
+	// Pop the head of the free list and bind it to b.
+	id := m.freeList
+	m.ids[id].buf, m.freeList = b, m.ids[id].nextFree
+	return id
+}
+
+// remove unbinds ID i and returns the tx buffer that was bound to it, pushing
+// i onto the free list for reuse. It returns nil if i is out of range or has
+// no buffer bound to it.
+func (m *idManager) remove(i uint64) *queue.TxBuffer {
+	// Reject IDs that lie outside the table.
+	if uint64(len(m.ids)) <= i {
+		return nil
+	}
+
+	entry := &m.ids[i]
+	owner := entry.buf
+	if owner != nil {
+		// Unbind the buffer and make i the new head of the free list.
+		*entry = idDescriptor{nextFree: m.freeList}
+		m.freeList = i
+	}
+
+	// owner is nil when i was not assigned.
+	return owner
+}
+
+// bufferManager manages a buffer region broken up into smaller, equally sized
+// buffers. Smaller buffers can be allocated and freed.
+type bufferManager struct {
+	freeList  *queue.TxBuffer // chain of freed buffers ready for reuse
+	curOffset uint64          // offset of the next never-used buffer
+	limit     uint64          // offset at which the usable region ends
+	entrySize uint32          // size in bytes of every buffer
+}
+
+// init initializes the buffer manager to hand out entrySize-byte buffers.
+func (b *bufferManager) init(initialOffset, size, entrySize int) {
+	b.freeList = nil
+	b.curOffset = uint64(initialOffset)
+	b.limit = uint64(initialOffset + size/entrySize*entrySize) // whole entries only; a partial tail is never used
+	b.entrySize = uint32(entrySize)
+}
+
+// alloc hands out one buffer. Previously freed buffers are reused first; only
+// when none are left is a new one carved out of the never-used part of the
+// region. It returns nil once the region is exhausted.
+func (b *bufferManager) alloc() *queue.TxBuffer {
+	if head := b.freeList; head != nil {
+		// Reuse the descriptor at the head of the free list.
+		b.freeList, head.Next = head.Next, nil
+		return head
+	}
+
+	if b.curOffset >= b.limit {
+		// Nothing is left in the never-used range either.
+		return nil
+	}
+
+	// Carve a fresh descriptor out of the never-used range.
+	fresh := &queue.TxBuffer{
+		Offset: b.curOffset,
+		Size:   b.entrySize,
+	}
+	b.curOffset += uint64(b.entrySize)
+	return fresh
+}
+
+// free returns all buffers in the list headed by d to the buffer manager so
+// that they can be reused. A nil d is an empty list, so free does nothing.
+func (b *bufferManager) free(d *queue.TxBuffer) {
+	if d == nil {
+		return
+	}
+	// Find the last buffer, then splice the list in front of the free list.
+	last := d
+	for ; last.Next != nil; last = last.Next {
+	}
+	last.Next = b.freeList
+	b.freeList = d
+}
diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD
new file mode 100644
index 000000000..7cbc305e7
--- /dev/null
+++ b/pkg/tcpip/link/sniffer/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "sniffer",
+    srcs = [
+        "pcap.go",
+        "sniffer.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/nested",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go
new file mode 100644
index 000000000..c16c19647
--- /dev/null
+++ b/pkg/tcpip/link/sniffer/pcap.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sniffer
+
+import "time"
+
+type pcapHeader struct { // file-level pcap header, written once by writePCAPHeader
+	// MagicNumber is the file magic number.
+	MagicNumber uint32
+
+	// VersionMajor is the major version number.
+	VersionMajor uint16
+
+	// VersionMinor is the minor version number.
+	VersionMinor uint16
+
+	// Thiszone is the GMT to local correction, in seconds.
+	Thiszone int32
+
+	// Sigfigs is the accuracy of timestamps.
+	Sigfigs uint32
+
+	// Snaplen is the max length of captured packets, in octets.
+	Snaplen uint32
+
+	// Network is the data link type.
+	Network uint32
+}
+
+const pcapPacketHeaderLen = 16 // size in bytes of pcapPacketHeader: four uint32 fields
+
+type pcapPacketHeader struct { // per-packet pcap header, built by newPCAPPacketHeader
+	// Seconds is the timestamp seconds.
+	Seconds uint32
+
+	// Microseconds is the timestamp microseconds.
+	Microseconds uint32
+
+	// IncludedLength is the number of octets of packet saved in file.
+	IncludedLength uint32
+
+	// OriginalLength is the actual length of packet.
+	OriginalLength uint32
+}
+
+// newPCAPPacketHeader builds a per-packet header stamped with the current time.
+func newPCAPPacketHeader(incLen, orgLen uint32) pcapPacketHeader {
+	ts := time.Now()
+	hdr := pcapPacketHeader{IncludedLength: incLen, OriginalLength: orgLen}
+	// Split the timestamp into whole seconds and the microsecond remainder.
+	hdr.Seconds = uint32(ts.Unix())
+	hdr.Microseconds = uint32(ts.Nanosecond() / 1000)
+	return hdr
+}
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
new file mode 100644
index 000000000..d9cd4e83a
--- /dev/null
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -0,0 +1,394 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sniffer provides the implementation of data-link layer endpoints that
+// wrap another endpoint and logs inbound and outbound packets.
+//
+// Sniffer endpoints can be used in the networking stack by calling New(eID) to
+// create a new endpoint, where eID is the ID of the endpoint being wrapped,
+// and then passing it as an argument to Stack.CreateNIC().
+package sniffer
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"sync/atomic"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/nested"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// LogPackets is a flag used to enable (1) or disable (0) packet logging via
+// the log package. It only affects sniffers created without a pcap writer.
+//
+// LogPackets must be accessed atomically.
+var LogPackets uint32 = 1
+
+// LogPacketsToPCAP is a flag used to enable (1) or disable (0) logging packets
+// to a pcap writer. It only has an effect on sniffers that were created with
+// a writer.
+//
+// LogPacketsToPCAP must be accessed atomically.
+var LogPacketsToPCAP uint32 = 1
+
+type endpoint struct { // sniffer link endpoint; wraps the lower endpoint via nested.Endpoint
+	nested.Endpoint
+	writer     io.Writer // pcap destination; nil means packets go to the log package instead
+	maxPCAPLen uint32    // maximum number of bytes of each packet written to writer
+}
+
+var _ stack.GSOEndpoint = (*endpoint)(nil) // compile-time interface checks
+var _ stack.LinkEndpoint = (*endpoint)(nil)
+var _ stack.NetworkDispatcher = (*endpoint)(nil)
+
+// New creates a new sniffer link-layer endpoint. It wraps around another
+// endpoint and logs packets as they traverse the endpoint.
+func New(lower stack.LinkEndpoint) stack.LinkEndpoint {
+	sniffer := &endpoint{} // no writer: packets are logged through the log package
+	sniffer.Endpoint.Init(lower, sniffer)
+	return sniffer
+}
+
+func zoneOffset() (int32, error) { // returns the local time zone's offset from UTC, in seconds
+	loc, err := time.LoadLocation("Local")
+	if err != nil {
+		return 0, err
+	}
+	date := time.Date(0, 0, 0, 0, 0, 0, 0, loc) // NOTE(review): the offset is evaluated for a date near year 0, not for the current time — confirm this is intended
+	_, offset := date.Zone()
+	return int32(offset), nil
+}
+
+// writePCAPHeader writes the pcap file header, in big-endian order, to w.
+func writePCAPHeader(w io.Writer, maxLen uint32) error {
+	tz, err := zoneOffset()
+	if err != nil {
+		return err
+	}
+	// Values from https://wiki.wireshark.org/Development/LibpcapFileFormat
+	hdr := pcapHeader{
+		MagicNumber:  0xa1b2c3d4,
+		VersionMajor: 2,
+		VersionMinor: 4,
+		Thiszone:     tz,
+		Snaplen:      maxLen,
+		Network:      101, // LINKTYPE_RAW
+	}
+	return binary.Write(w, binary.BigEndian, hdr)
+}
+
+// NewWithWriter returns a sniffer link-layer endpoint wrapping lower that
+// records every packet passing through it to writer in the pcap format.
+//
+// The pcap file header is written to writer immediately; if that fails, the
+// error is returned and no endpoint is created. Endpoints created here do not
+// emit packets through the standard log package.
+//
+// snapLen caps how many bytes of each packet are saved: longer packets are
+// truncated to snapLen, shorter ones are saved whole.
+func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (stack.LinkEndpoint, error) {
+	err := writePCAPHeader(writer, snapLen)
+	if err != nil {
+		return nil, err
+	}
+
+	ep := new(endpoint)
+	ep.writer, ep.maxPCAPLen = writer, snapLen
+	ep.Endpoint.Init(lower, ep)
+	return ep, nil
+}
+
+// DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
+// called by the link-layer endpoint being wrapped when a packet arrives, and
+// dumps the packet before forwarding it to the actual dispatcher.
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dumpPacket("recv", nil, protocol, pkt) // no GSO information is passed for inbound packets
+	e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt)
+}
+
+func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) { // dumps pkt to the log package or to the pcap writer, never both
+	writer := e.writer
+	if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
+		logPacket(prefix, protocol, pkt, gso)
+	}
+	if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
+		totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
+		length := totalLength // bytes still to be written; capped at maxPCAPLen below
+		if max := int(e.maxPCAPLen); length > max {
+			length = max
+		}
+		if err := binary.Write(writer, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil {
+			panic(err) // NOTE(review): a failing pcap writer panics rather than returning an error
+		}
+		write := func(b []byte) { // writes at most length bytes of b and decrements length accordingly
+			if len(b) > length {
+				b = b[:length]
+			}
+			for len(b) != 0 {
+				n, err := writer.Write(b)
+				if err != nil {
+					panic(err)
+				}
+				b = b[n:]
+				length -= n
+			}
+		}
+		write(pkt.Header.View())
+		for _, view := range pkt.Data.Views() {
+			if length == 0 { // the snapshot length has been reached
+				break
+			}
+			write(view)
+		}
+	}
+}
+
+// WritePacket implements the stack.LinkEndpoint interface. It is called by
+// higher-level protocols to write packets; it just dumps the packet and
+// forwards the request to the lower endpoint.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	e.dumpPacket("send", gso, protocol, pkt)
+	return e.Endpoint.WritePacket(r, gso, protocol, pkt)
+}
+
+// WritePackets implements the stack.LinkEndpoint interface. It is called by
+// higher-level protocols to write packets; it dumps each packet in pkts and
+// then forwards the whole list to the lower endpoint.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.dumpPacket("send", gso, protocol, pkt)
+	}
+	return e.Endpoint.WritePackets(r, gso, pkts, protocol)
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	e.dumpPacket("send", nil, 0, &stack.PacketBuffer{ // no protocol is known for a raw packet, so 0 is passed
+		Data: vv,
+	})
+	return e.Endpoint.WriteRawPacket(vv)
+}
+
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) { // logs a one-line summary of pkt's network and transport headers
+	// Figure out the network layer info.
+	var transProto uint8
+	src := tcpip.Address("unknown")
+	dst := tcpip.Address("unknown")
+	id := 0
+	size := uint16(0)
+	var fragmentOffset uint16
+	var moreFragments bool
+
+	// Create a clone of pkt, including any headers if present. Avoid allocating
+	// backing memory for the clone.
+	views := [8]buffer.View{}
+	vv := buffer.NewVectorisedView(0, views[:0])
+	vv.AppendView(pkt.Header.View())
+	vv.Append(pkt.Data)
+
+	switch protocol {
+	case header.IPv4ProtocolNumber:
+		hdr, ok := vv.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			return
+		}
+		ipv4 := header.IPv4(hdr)
+		fragmentOffset = ipv4.FragmentOffset()
+		moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
+		src = ipv4.SourceAddress()
+		dst = ipv4.DestinationAddress()
+		transProto = ipv4.Protocol()
+		size = ipv4.TotalLength() - uint16(ipv4.HeaderLength()) // payload length; wraps around if TotalLength < HeaderLength
+		vv.TrimFront(int(ipv4.HeaderLength()))
+		id = int(ipv4.ID())
+
+	case header.IPv6ProtocolNumber:
+		hdr, ok := vv.PullUp(header.IPv6MinimumSize)
+		if !ok {
+			return
+		}
+		ipv6 := header.IPv6(hdr)
+		src = ipv6.SourceAddress()
+		dst = ipv6.DestinationAddress()
+		transProto = ipv6.NextHeader()
+		size = ipv6.PayloadLength()
+		vv.TrimFront(header.IPv6MinimumSize)
+
+	case header.ARPProtocolNumber:
+		hdr, ok := vv.PullUp(header.ARPSize)
+		if !ok {
+			return
+		}
+		vv.TrimFront(header.ARPSize)
+		arp := header.ARP(hdr)
+		log.Infof(
+			"%s arp %s (%s) -> %s (%s) valid:%t",
+			prefix,
+			tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
+			tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
+			arp.IsValid(),
+		)
+		return
+	default:
+		log.Infof("%s unknown network protocol", prefix)
+		return
+	}
+
+	// Figure out the transport layer info.
+	transName := "unknown"
+	srcPort := uint16(0)
+	dstPort := uint16(0)
+	details := ""
+	switch tcpip.TransportProtocolNumber(transProto) {
+	case header.ICMPv4ProtocolNumber:
+		transName = "icmp"
+		hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv4(hdr)
+		icmpType := "unknown"
+		if fragmentOffset == 0 { // the type is only decoded for unfragmented packets and first fragments
+			switch icmp.Type() {
+			case header.ICMPv4EchoReply:
+				icmpType = "echo reply"
+			case header.ICMPv4DstUnreachable:
+				icmpType = "destination unreachable"
+			case header.ICMPv4SrcQuench:
+				icmpType = "source quench"
+			case header.ICMPv4Redirect:
+				icmpType = "redirect"
+			case header.ICMPv4Echo:
+				icmpType = "echo"
+			case header.ICMPv4TimeExceeded:
+				icmpType = "time exceeded"
+			case header.ICMPv4ParamProblem:
+				icmpType = "param problem"
+			case header.ICMPv4Timestamp:
+				icmpType = "timestamp"
+			case header.ICMPv4TimestampReply:
+				icmpType = "timestamp reply"
+			case header.ICMPv4InfoRequest:
+				icmpType = "info request"
+			case header.ICMPv4InfoReply:
+				icmpType = "info reply"
+			}
+		}
+		log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		return
+
+	case header.ICMPv6ProtocolNumber:
+		transName = "icmp"
+		hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
+		if !ok {
+			break
+		}
+		icmp := header.ICMPv6(hdr)
+		icmpType := "unknown"
+		switch icmp.Type() {
+		case header.ICMPv6DstUnreachable:
+			icmpType = "destination unreachable"
+		case header.ICMPv6PacketTooBig:
+			icmpType = "packet too big"
+		case header.ICMPv6TimeExceeded:
+			icmpType = "time exceeded"
+		case header.ICMPv6ParamProblem:
+			icmpType = "param problem"
+		case header.ICMPv6EchoRequest:
+			icmpType = "echo request"
+		case header.ICMPv6EchoReply:
+			icmpType = "echo reply"
+		case header.ICMPv6RouterSolicit:
+			icmpType = "router solicit"
+		case header.ICMPv6RouterAdvert:
+			icmpType = "router advert"
+		case header.ICMPv6NeighborSolicit:
+			icmpType = "neighbor solicit"
+		case header.ICMPv6NeighborAdvert:
+			icmpType = "neighbor advert"
+		case header.ICMPv6RedirectMsg:
+			icmpType = "redirect message"
+		}
+		log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+		return
+
+	case header.UDPProtocolNumber:
+		transName = "udp"
+		hdr, ok := vv.PullUp(header.UDPMinimumSize)
+		if !ok {
+			break
+		}
+		udp := header.UDP(hdr)
+		if fragmentOffset == 0 {
+			srcPort = udp.SourcePort()
+			dstPort = udp.DestinationPort()
+			details = fmt.Sprintf("xsum: 0x%x", udp.Checksum())
+			size -= header.UDPMinimumSize // exclude the UDP header from the logged length
+		}
+
+	case header.TCPProtocolNumber:
+		transName = "tcp"
+		hdr, ok := vv.PullUp(header.TCPMinimumSize)
+		if !ok {
+			break
+		}
+		tcp := header.TCP(hdr)
+		if fragmentOffset == 0 {
+			offset := int(tcp.DataOffset())
+			if offset < header.TCPMinimumSize {
+				details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
+				break
+			}
+			if offset > vv.Size() && !moreFragments {
+				details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size())
+				break
+			}
+
+			srcPort = tcp.SourcePort()
+			dstPort = tcp.DestinationPort()
+			size -= uint16(offset)
+
+			// Render the low six TCP flag bits as letters, blanking out the
+			// letter of every bit that is not set.
+			flags := tcp.Flags()
+			flagsStr := []byte("FSRPAU")
+			for i := range flagsStr {
+				if flags&(1<<uint(i)) == 0 {
+					flagsStr[i] = ' '
+				}
+			}
+			details = fmt.Sprintf("flags:0x%02x (%s) seqnum: %d ack: %d win: %d xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
+			if flags&header.TCPFlagSyn != 0 {
+				details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0))
+			} else {
+				details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions())
+			}
+		}
+
+	default:
+		log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto)
+		return
+	}
+
+	if gso != nil {
+		details += fmt.Sprintf(" gso: %+v", gso)
+	}
+
+	log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
+}
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
new file mode 100644
index 000000000..e0db6cf54
--- /dev/null
+++ b/pkg/tcpip/link/tun/BUILD
@@ -0,0 +1,25 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "tun",
+    srcs = [
+        "device.go",
+        "protocol.go",
+        "tun_unsafe.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/refs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
new file mode 100644
index 000000000..6bc9033d0
--- /dev/null
+++ b/pkg/tcpip/link/tun/device.go
@@ -0,0 +1,358 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// defaultDevMtu is taken from Linux drivers/net/tun.c:tun_net_init().
+	defaultDevMtu = 1500
+
+	// Queue length for outbound packets, arriving at fd side for read. Overflow
+	// causes packet drops. gVisor implementation-specific.
+	defaultDevOutQueueLen = 1024
+)
+
+var zeroMAC [6]byte // all-zero link address; Write uses it as the remote when there is no Ethernet header
+
+// Device is an open /dev/net/tun file, optionally attached to a NIC.
+//
+// +stateify savable
+type Device struct {
+	waiter.Queue
+
+	mu           sync.RWMutex `state:"nosave"` // held when endpoint, notifyHandle and flags are written
+	endpoint     *tunEndpoint
+	notifyHandle *channel.NotificationHandle
+	flags        uint16
+}
+
+// beforeSave is invoked by stateify. It panics if a NIC is still attached.
+func (d *Device) beforeSave() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	// TODO(b/110961832): Restore the device to stack. At this moment, the stack
+	// is not savable.
+	if d.endpoint != nil {
+		panic("/dev/net/tun does not support save/restore when a device is associated with it.")
+	}
+}
+
+// Release implements fs.FileOperations.Release. It detaches d from its NIC.
+func (d *Device) Release() {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	if d.endpoint == nil {
+		return
+	}
+	// Unregister d, then drop this file's reference on the endpoint.
+	d.endpoint.RemoveNotify(d.notifyHandle)
+	d.endpoint.DecRef()
+	d.endpoint = nil
+}
+
+// SetIff services a TUNSETIFF ioctl(2) request, attaching d to a NIC.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+
+	if d.endpoint != nil { // d is already attached to a NIC
+		return syserror.EINVAL
+	}
+
+	// Exactly one of IFF_TUN and IFF_TAP must be set, and nothing unsupported.
+	isTun := flags&linux.IFF_TUN != 0
+	isTap := flags&linux.IFF_TAP != 0
+	supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
+	if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
+		return syserror.EINVAL
+	}
+
+	prefix := "tun" // used to generate a NIC name when name is empty
+	if isTap {
+		prefix = "tap"
+	}
+
+	linkCaps := stack.CapabilityNone
+	if isTap {
+		linkCaps |= stack.CapabilityResolutionRequired
+	}
+
+	endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps)
+	if err != nil {
+		return syserror.EINVAL // the specific error from attachOrCreateNIC is not propagated
+	}
+
+	d.endpoint = endpoint
+	d.notifyHandle = d.endpoint.AddNotify(d)
+	d.flags = flags
+	return nil
+}
+
+func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) { // returns a referenced endpoint for the NIC called name, creating the NIC if needed
+	for {
+		// 1. Try to attach to an existing NIC.
+		if name != "" {
+			if nic, found := s.GetNICByName(name); found {
+				endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+				if !ok {
+					// Not a NIC created by tun device.
+					return nil, syserror.EOPNOTSUPP
+				}
+				if !endpoint.TryIncRef() {
+					// Race detected: NIC got deleted in between.
+					continue
+				}
+				return endpoint, nil
+			}
+		}
+
+		// 2. Creating a new NIC.
+		id := tcpip.NICID(s.UniqueID())
+		endpoint := &tunEndpoint{
+			Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
+			stack:    s,
+			nicID:    id,
+			name:     name,
+		}
+		endpoint.Endpoint.LinkEPCapabilities = linkCaps
+		if endpoint.name == "" { // no name requested: derive one from the prefix and the NIC ID
+			endpoint.name = fmt.Sprintf("%s%d", prefix, id)
+		}
+		err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{
+			Name: endpoint.name,
+		})
+		switch err {
+		case nil:
+			return endpoint, nil
+		case tcpip.ErrDuplicateNICID:
+			// Race detected: A NIC has been created in between.
+			continue
+		default:
+			return nil, syserror.EINVAL
+		}
+	}
+}
+
+// Write injects one inbound packet into the network interface.
+func (d *Device) Write(data []byte) (int64, error) {
+	d.mu.RLock()
+	endpoint := d.endpoint
+	d.mu.RUnlock()
+	if endpoint == nil {
+		return 0, syserror.EBADFD
+	}
+	if !endpoint.IsAttached() {
+		return 0, syserror.EIO
+	}
+
+	dataLen := int64(len(data)) // reported as written even when the packet is dropped below
+
+	// Packet information.
+	var pktInfoHdr PacketInfoHeader
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		if len(data) < PacketInfoHeaderSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize])
+		data = data[PacketInfoHeaderSize:]
+	}
+
+	// Ethernet header (TAP only).
+	var ethHdr header.Ethernet
+	if d.hasFlags(linux.IFF_TAP) {
+		if len(data) < header.EthernetMinimumSize {
+			// Ignore bad packet.
+			return dataLen, nil
+		}
+		ethHdr = header.Ethernet(data[:header.EthernetMinimumSize])
+		data = data[header.EthernetMinimumSize:]
+	}
+
+	// Try to determine network protocol number, default zero. The packet
+	// information header, when present, wins over the Ethernet type.
+	var protocol tcpip.NetworkProtocolNumber
+	switch {
+	case pktInfoHdr != nil:
+		protocol = pktInfoHdr.Protocol()
+	case ethHdr != nil:
+		protocol = ethHdr.Type()
+	}
+
+	// Try to determine remote link address, default zero.
+	var remote tcpip.LinkAddress
+	switch {
+	case ethHdr != nil:
+		remote = ethHdr.SourceAddress()
+	default:
+		remote = tcpip.LinkAddress(zeroMAC[:])
+	}
+
+	pkt := &stack.PacketBuffer{
+		Data: buffer.View(data).ToVectorisedView(),
+	}
+	if ethHdr != nil {
+		pkt.LinkHeader = buffer.View(ethHdr)
+	}
+	endpoint.InjectLinkAddr(protocol, remote, pkt)
+	return dataLen, nil
+}
+
+// Read returns the next outgoing packet queued on the network interface.
+func (d *Device) Read() ([]byte, error) {
+	d.mu.RLock()
+	ep := d.endpoint
+	d.mu.RUnlock()
+	if ep == nil {
+		return nil, syserror.EBADFD
+	}
+
+	// Drain queued packets until one can be encoded for the fd side; packets
+	// that encodePkt rejects are dropped.
+	for {
+		info, ok := ep.Read()
+		if !ok {
+			// Nothing is queued.
+			return nil, syserror.ErrWouldBlock
+		}
+
+		if v, ok := d.encodePkt(&info); ok {
+			return v, nil
+		}
+	}
+}
+
+// encodePkt encodes a packet for the fd side. It returns false if it cannot.
+func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
+	var vv buffer.VectorisedView
+
+	// Packet information.
+	if !d.hasFlags(linux.IFF_NO_PI) {
+		hdr := make(PacketInfoHeader, PacketInfoHeaderSize)
+		hdr.Encode(&PacketInfoFields{
+			Protocol: info.Proto,
+		})
+		vv.AppendView(buffer.View(hdr))
+	}
+
+	// If the packet does not already have link layer header, and the route
+	// does not exist, we can't compute it. This is possibly a raw packet, tun
+	// device doesn't support this at the moment.
+	if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" {
+		return nil, false
+	}
+
+	// Ethernet header (TAP only).
+	if d.hasFlags(linux.IFF_TAP) {
+		// Add ethernet header if not provided.
+		if info.Pkt.LinkHeader == nil {
+			hdr := &header.EthernetFields{
+				SrcAddr: info.Route.LocalLinkAddress,
+				DstAddr: info.Route.RemoteLinkAddress,
+				Type:    info.Proto,
+			}
+			if hdr.SrcAddr == "" {
+				hdr.SrcAddr = d.endpoint.LinkAddress() // NOTE(review): reads d.endpoint without holding d.mu — confirm it cannot race with Release
+			}
+
+			eth := make(header.Ethernet, header.EthernetMinimumSize)
+			eth.Encode(hdr)
+			vv.AppendView(buffer.View(eth))
+		} else {
+			vv.AppendView(info.Pkt.LinkHeader)
+		}
+	}
+
+	// Append upper headers, skipping the first len(LinkHeader) bytes.
+	vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):]))
+	// Append data payload.
+	vv.Append(info.Pkt.Data)
+
+	return vv.ToView(), true
+}
+
+// Name reports the name of the network interface d is attached to, or the
+// empty string when d is not attached to one.
+func (d *Device) Name() string {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	if d.endpoint == nil {
+		return ""
+	}
+	return d.endpoint.name
+}
+
+// Flags returns the flags stored for d by SetIff. Zero value if unset.
+func (d *Device) Flags() uint16 {
+	d.mu.RLock()
+	defer d.mu.RUnlock()
+	return d.flags
+}
+
+func (d *Device) hasFlags(flags uint16) bool { // reports whether every bit of flags is set in d.flags; does not take d.mu
+	return d.flags&flags == flags
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if mask&waiter.EventIn != 0 {
+		d.mu.RLock()
+		endpoint := d.endpoint
+		d.mu.RUnlock()
+		if endpoint != nil && endpoint.NumQueued() == 0 {
+			mask &= ^waiter.EventIn // no packet is queued, so d is not readable
+		}
+	}
+	return mask & (waiter.EventIn | waiter.EventOut) // only EventIn and EventOut are ever reported
+}
+
+// WriteNotify implements channel.Notification.WriteNotify.
+func (d *Device) WriteNotify() {
+	d.Notify(waiter.EventIn) // notify d's waiters of EventIn
+}
+
+// tunEndpoint is the link endpoint for the NIC created by the tun device.
+//
+// It is ref-counted as multiple open files can attach to the same NIC.
+// The last owner is responsible for deleting the NIC.
+type tunEndpoint struct {
+	*channel.Endpoint
+
+	refs.AtomicRefCount
+
+	stack *stack.Stack
+	nicID tcpip.NICID
+	name  string
+}
+
+// DecRef decrements the refcount of e and removes the NIC once it drops to 0.
+func (e *tunEndpoint) DecRef() {
+	e.DecRefWithDestructor(func() {
+		e.stack.RemoveNIC(e.nicID)
+	})
+}
diff --git a/pkg/tcpip/link/tun/protocol.go b/pkg/tcpip/link/tun/protocol.go
new file mode 100644
index 000000000..89d9d91a9
--- /dev/null
+++ b/pkg/tcpip/link/tun/protocol.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// PacketInfoHeaderSize is the size in bytes of the packet information header.
+	PacketInfoHeaderSize = 4
+
+	offsetFlags    = 0 // byte offset of the flags field within the header
+	offsetProtocol = 2 // byte offset of the protocol field within the header
+)
+
+// PacketInfoFields contains the fields sent over the wire if the IFF_NO_PI
+// flag is not set.
+type PacketInfoFields struct {
+	Flags    uint16
+	Protocol tcpip.NetworkProtocolNumber
+}
+
+// PacketInfoHeader is the wire form of the packet information sent when
+// IFF_NO_PI is not set. Its methods index up to PacketInfoHeaderSize bytes.
+type PacketInfoHeader []byte
+
+// Encode writes f's Flags and Protocol into h as big-endian 16-bit values.
+func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
+	binary.BigEndian.PutUint16(h[offsetFlags:offsetFlags+2], f.Flags)
+	binary.BigEndian.PutUint16(h[offsetProtocol:offsetProtocol+2], uint16(f.Protocol))
+}
+
+// Flags returns the flags field of h, decoded as a big-endian uint16.
+func (h PacketInfoHeader) Flags() uint16 {
+	return binary.BigEndian.Uint16(h[offsetFlags:])
+}
+
+// Protocol returns the protocol field of h, decoded as a big-endian uint16.
+func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
+}
diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go
new file mode 100644
index 000000000..09ca9b527
--- /dev/null
+++ b/pkg/tcpip/link/tun/tun_unsafe.go
@@ -0,0 +1,63 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package tun contains methods to open TAP and TUN devices.
+package tun
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// Open opens the specified TUN device (IFF_TUN|IFF_NO_PI), sets it to
+// non-blocking mode, and returns its file descriptor.
+func Open(name string) (int, error) {
+	return open(name, syscall.IFF_TUN|syscall.IFF_NO_PI)
+}
+
+// OpenTAP opens the specified TAP device (IFF_TAP|IFF_NO_PI), sets it to
+// non-blocking mode, and returns its file descriptor.
+func OpenTAP(name string) (int, error) {
+	return open(name, syscall.IFF_TAP|syscall.IFF_NO_PI)
+}
+
+// open binds a new /dev/net/tun descriptor to the interface called name using
+// flags and makes it non-blocking. Names too long for the request get EINVAL.
+func open(name string, flags uint16) (int, error) {
+	var ifr struct {
+		name  [16]byte
+		flags uint16
+		_     [22]byte
+	}
+	if len(name) >= len(ifr.name) { // keep room for the terminating NUL
+		return -1, syscall.EINVAL
+	}
+	copy(ifr.name[:], name)
+	ifr.flags = flags
+	fd, err := syscall.Open("/dev/net/tun", syscall.O_RDWR, 0)
+	if err != nil {
+		return -1, err
+	}
+	if _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TUNSETIFF, uintptr(unsafe.Pointer(&ifr))); errno != 0 {
+		syscall.Close(fd)
+		return -1, errno
+	}
+	if err := syscall.SetNonblock(fd, true); err != nil {
+		syscall.Close(fd)
+		return -1, err
+	}
+	return fd, nil
+}
diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD
new file mode 100644
index 000000000..0956d2c65
--- /dev/null
+++ b/pkg/tcpip/link/waitable/BUILD
@@ -0,0 +1,30 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "waitable",
+    srcs = [
+        "waitable.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/gate",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
+    ],
+)
+
+go_test(
+    name = "waitable_test",
+    srcs = [
+        "waitable_test.go",
+    ],
+    library = ":waitable",
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/stack",
+    ],
+)
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
new file mode 100644
index 000000000..949b3f2b2
--- /dev/null
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -0,0 +1,149 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package waitable provides the implementation of data-link layer endpoints
+// that wrap other endpoints, and can wait for inflight calls to WritePacket or
+// DeliverNetworkPacket to finish (and new ones to be prevented).
+//
+// Waitable endpoints can be used in the networking stack by calling New(eID) to
+// create a new endpoint, where eID is the ID of the endpoint being wrapped,
+// and then passing it as an argument to Stack.CreateNIC().
+package waitable
+
+import (
+	"gvisor.dev/gvisor/pkg/gate"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// Endpoint is a waitable link-layer endpoint.
+type Endpoint struct {
+	dispatchGate gate.Gate               // entered around each inbound dispatch
+	dispatcher   stack.NetworkDispatcher // set by Attach; receives inbound packets
+
+	writeGate gate.Gate          // entered around each write to lower
+	lower     stack.LinkEndpoint // the wrapped endpoint
+}
+
+// New returns a waitable link-layer endpoint wrapped around lower. The
+// wrapper lets the caller block new write/dispatch calls and wait for the
+// inflight ones to finish before returning.
+//
+// Only lower is populated here; the dispatcher is stored later by Attach.
+func New(lower stack.LinkEndpoint) *Endpoint {
+	return &Endpoint{lower: lower}
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket.
+// The wrapped link-layer endpoint calls it when a packet arrives. The packet
+// is handed to the attached dispatcher only while the dispatch gate can still
+// be entered, i.e. until WaitDispatch closes it; otherwise it is dropped.
+//
+// Wait does not close the gate.
+func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	if e.dispatchGate.Enter() {
+		e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
+		e.dispatchGate.Leave()
+	}
+}
+
+// Attach implements stack.LinkEndpoint.Attach. It saves the dispatcher and
+// registers with the lower endpoint as its dispatcher so that "e" is called
+// for inbound packets.
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.dispatcher = dispatcher // stored before e is registered with lower
+	e.lower.Attach(e)
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached; true if a dispatcher is set.
+func (e *Endpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the lower endpoint's MTU
+// unchanged.
+func (e *Endpoint) MTU() uint32 {
+	return e.lower.MTU()
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. It returns the
+// lower endpoint's capabilities unchanged.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.lower.Capabilities()
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns
+// the lower endpoint's value unchanged; the wrapper adds no header of its own.
+func (e *Endpoint) MaxHeaderLength() uint16 {
+	return e.lower.MaxHeaderLength()
+}
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the lower
+// endpoint's link address unchanged.
+func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
+	return e.lower.LinkAddress()
+}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket. It is called by
+// higher-level protocols to write packets. The packet is forwarded to the
+// lower endpoint only while the write gate can be entered; once WaitWrite has
+// closed it the packet is dropped and nil is returned.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	var err *tcpip.Error
+	if e.writeGate.Enter() {
+		err = e.lower.WritePacket(r, gso, protocol, pkt)
+		e.writeGate.Leave()
+	}
+	return err
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
+// higher-level protocols to write packets. The batch is forwarded to the
+// lower endpoint only while the write gate can be entered; once WaitWrite has
+// closed it nothing is sent, yet pkts.Len() and a nil error are returned.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	if e.writeGate.Enter() {
+		n, err := e.lower.WritePackets(r, gso, pkts, protocol)
+		e.writeGate.Leave()
+		return n, err
+	}
+	return pkts.Len(), nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket. vv reaches the
+// lower endpoint only while the write gate can be entered; else nil is returned.
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	var err *tcpip.Error
+	if e.writeGate.Enter() {
+		err = e.lower.WriteRawPacket(vv)
+		e.writeGate.Leave()
+	}
+	return err
+}
+
+// WaitWrite prevents new WritePacket, WritePackets and WriteRawPacket calls
+// from reaching the lower endpoint, and waits for inflight ones to finish.
+func (e *Endpoint) WaitWrite() {
+	e.writeGate.Close()
+}
+
+// WaitDispatch closes the dispatch gate: it prevents new DeliverNetworkPacket
+// calls from reaching the dispatcher and waits for inflight ones to finish.
+func (e *Endpoint) WaitDispatch() {
+	e.dispatchGate.Close()
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It is a no-op and closes no gate.
+func (e *Endpoint) Wait() {}
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
new file mode 100644
index 000000000..63bf40562
--- /dev/null
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -0,0 +1,173 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package waitable
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type countedEndpoint struct {
+	dispatchCount int
+	writeCount    int
+	attachCount   int
+	// Canned values returned as-is by the matching getter methods.
+	mtu          uint32
+	capabilities stack.LinkEndpointCapabilities
+	hdrLen       uint16
+	linkAddr     tcpip.LinkAddress
+	// dispatcher is the value most recently passed to Attach.
+	dispatcher stack.NetworkDispatcher
+}
+
+func (e *countedEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	e.dispatchCount++ // the packet itself is ignored; only the call is counted
+}
+
+func (e *countedEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+	e.attachCount++           // lets tests verify that Attach was forwarded
+	e.dispatcher = dispatcher // kept so tests can inject inbound packets through it
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *countedEndpoint) IsAttached() bool {
+	return e.dispatcher != nil // true once Attach stored a non-nil dispatcher
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the mtu field configured
+// on e.
+func (e *countedEndpoint) MTU() uint32 { return e.mtu }
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. It returns the
+// capabilities field configured on e.
+func (e *countedEndpoint) Capabilities() stack.LinkEndpointCapabilities { return e.capabilities }
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns
+// the hdrLen field configured on e.
+func (e *countedEndpoint) MaxHeaderLength() uint16 { return e.hdrLen }
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the
+// linkAddr field configured on e.
+func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress { return e.linkAddr }
+
+func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	e.writeCount++ // the packet is discarded; only the call is counted
+	return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	e.writeCount += pkts.Len() // every packet in the list counts as one write
+	return pkts.Len(), nil
+}
+
+func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+	e.writeCount++ // raw writes share the counter used by WritePacket
+	return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It is a no-op.
+func (*countedEndpoint) Wait() {}
+
+func TestWaitWrite(t *testing.T) {
+	ep := &countedEndpoint{}
+	wep := New(ep)
+	// ep counts the writes that make it through the waitable wrapper wep.
+	// Write and check that it goes through.
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
+	if want := 1; ep.writeCount != want {
+		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
+	}
+
+	// Closing the dispatch gate must not affect writes: this one must go through.
+	wep.WaitDispatch()
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
+	if want := 2; ep.writeCount != want {
+		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
+	}
+
+	// Close the write gate, then try to write. It must not reach ep.
+	wep.WaitWrite()
+	wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
+	if want := 2; ep.writeCount != want {
+		t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
+	}
+}
+
+func TestWaitDispatch(t *testing.T) {
+	ep := &countedEndpoint{}
+	wep := New(ep)
+	// ep serves both as the wrapped endpoint and as the final dispatcher.
+	// Check that attach happens.
+	wep.Attach(ep)
+	if want := 1; ep.attachCount != want {
+		t.Fatalf("Unexpected attachCount: got=%v, want=%v", ep.attachCount, want)
+	}
+
+	// Dispatch via the dispatcher ep was given (wep) and check it goes through.
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
+	if want := 1; ep.dispatchCount != want {
+		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
+	}
+
+	// Closing the write gate must not affect dispatches: this one must go through.
+	wep.WaitWrite()
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
+	if want := 2; ep.dispatchCount != want {
+		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
+	}
+
+	// Close the dispatch gate, then try to dispatch. It must not go through.
+	wep.WaitDispatch()
+	ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
+	if want := 2; ep.dispatchCount != want {
+		t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
+	}
+}
+
+func TestOtherMethods(t *testing.T) {
+	const (
+		mtu          = 0xdead
+		capabilities = 0xbeef
+		hdrLen       = 0x1234
+		linkAddr     = "test address"
+	)
+	ep := &countedEndpoint{
+		mtu:          mtu,
+		capabilities: capabilities,
+		hdrLen:       hdrLen,
+		linkAddr:     linkAddr,
+	}
+	wep := New(ep)
+	// Each getter on wep must return the value configured on the wrapped ep.
+	if v := wep.MTU(); v != mtu {
+		t.Fatalf("Unexpected mtu: got=%v, want=%v", v, mtu)
+	}
+
+	if v := wep.Capabilities(); v != capabilities {
+		t.Fatalf("Unexpected capabilities: got=%v, want=%v", v, capabilities)
+	}
+
+	if v := wep.MaxHeaderLength(); v != hdrLen {
+		t.Fatalf("Unexpected MaxHeaderLength: got=%v, want=%v", v, hdrLen)
+	}
+
+	if v := wep.LinkAddress(); v != linkAddr {
+		t.Fatalf("Unexpected LinkAddress: got=%q, want=%q", v, linkAddr)
+	}
+}
diff --git a/pkg/tcpip/network/BUILD b/pkg/tcpip/network/BUILD
new file mode 100644
index 000000000..6a4839fb8
--- /dev/null
+++ b/pkg/tcpip/network/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_test")
+
+package(licenses = ["notice"])
+
+go_test(
+    name = "ip_test",
+    size = "small",
+    srcs = [
+        "ip_test.go",
+    ],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+    ],
+)
diff --git a/pkg/tcpip/network/arp/BUILD b/pkg/tcpip/network/arp/BUILD
new file mode 100644
index 000000000..eddf7b725
--- /dev/null
+++ b/pkg/tcpip/network/arp/BUILD
@@ -0,0 +1,32 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "arp",
+    srcs = ["arp.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+    ],
+)
+
+go_test(
+    name = "arp_test",
+    size = "small",
+    srcs = ["arp_test.go"],
+    deps = [
+        ":arp",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
+    ],
+)
diff --git a/pkg/tcpip/network/arp/arp.go b/pkg/tcpip/network/arp/arp.go
new file mode 100644
index 000000000..7f27a840d
--- /dev/null
+++ b/pkg/tcpip/network/arp/arp.go
@@ -0,0 +1,224 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package arp implements the ARP network protocol. It is used to resolve
+// IPv4 addresses into link-local MAC addresses, and advertises IPv4
+// addresses of its stack with the local network.
+//
+// To use it in the networking stack, pass arp.NewProtocol() as one of the
+// network protocols when calling stack.New. Then add an "arp" address to every
+// NIC on the stack that should respond to ARP requests. That is:
+//
+//	if err := s.AddAddress(1, arp.ProtocolNumber, "arp"); err != nil {
+//		// handle err
+//	}
+package arp
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	// ProtocolNumber is the ARP protocol number.
+	ProtocolNumber = header.ARPProtocolNumber
+
+	// ProtocolAddress is the only local address NewEndpoint accepts.
+	ProtocolAddress = tcpip.Address("arp")
+)
+
+// endpoint implements stack.NetworkEndpoint.
+type endpoint struct {
+	protocol      *protocol              // supplies NetworkProtocolNumber
+	nicID         tcpip.NICID            // reported by NICID
+	linkEP        stack.LinkEndpoint     // used to send ARP replies
+	linkAddrCache stack.LinkAddressCache // updated from received ARP packets
+}
+
+// DefaultTTL implements stack.NetworkEndpoint.DefaultTTL; it is unused for ARP.
+func (e *endpoint) DefaultTTL() uint8 {
+	return 0
+}
+
+// MTU returns the link endpoint's MTU minus e.MaxHeaderLength().
+func (e *endpoint) MTU() uint32 {
+	return e.linkEP.MTU() - uint32(e.MaxHeaderLength())
+}
+
+// NICID returns the ID of the NIC this endpoint was created for, as passed
+// to NewEndpoint.
+func (e *endpoint) NICID() tcpip.NICID { return e.nicID }
+
+// Capabilities returns the capabilities reported by the underlying link
+// endpoint.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities { return e.linkEP.Capabilities() }
+
+// ID returns a newly allocated endpoint ID built from ProtocolAddress; it is
+// the same for every ARP endpoint.
+func (e *endpoint) ID() *stack.NetworkEndpointID { return &stack.NetworkEndpointID{ProtocolAddress} }
+
+// PrefixLen always returns 0, the same value the protocol reports from
+// DefaultPrefixLen.
+func (e *endpoint) PrefixLen() int { return 0 }
+
+// MaxHeaderLength returns the link endpoint's maximum header length plus
+// header.ARPSize.
+func (e *endpoint) MaxHeaderLength() uint16 { return e.linkEP.MaxHeaderLength() + header.ARPSize }
+
+func (e *endpoint) Close() {}
+
+func (e *endpoint) WritePacket(*stack.Route, *stack.GSO, stack.NetworkHeaderParams, *stack.PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported // all write methods of this endpoint are unsupported
+}
+
+// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
+func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return e.protocol.Number() // i.e. ProtocolNumber
+}
+
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, stack.NetworkHeaderParams) (int, *tcpip.Error) {
+	return 0, tcpip.ErrNotSupported // nothing is written
+}
+
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported // all write methods of this endpoint are unsupported
+}
+
+// HandlePacket processes a received ARP packet. A request whose target passes
+// CheckLocalAddress is answered with an ARP reply; answered requests and all
+// replies record the sender's address pair in the link address cache.
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	arpHdr := header.ARP(pkt.NetworkHeader)
+	if !arpHdr.IsValid() {
+		return
+	}
+
+	switch arpHdr.Op() {
+	case header.ARPRequest:
+		target := tcpip.Address(arpHdr.ProtocolAddressTarget())
+		if e.linkAddrCache.CheckLocalAddress(e.nicID, header.IPv4ProtocolNumber, target) == 0 {
+			return // we have no useful answer, ignore the request
+		}
+		hdr := buffer.NewPrependable(int(e.linkEP.MaxHeaderLength()) + header.ARPSize)
+		reply := header.ARP(hdr.Prepend(header.ARPSize))
+		reply.SetIPv4OverEthernet()
+		reply.SetOp(header.ARPReply)
+		copy(reply.HardwareAddressSender(), r.LocalLinkAddress[:])
+		copy(reply.ProtocolAddressSender(), arpHdr.ProtocolAddressTarget())
+		copy(reply.HardwareAddressTarget(), arpHdr.HardwareAddressSender())
+		copy(reply.ProtocolAddressTarget(), arpHdr.ProtocolAddressSender())
+		e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{Header: hdr})
+		fallthrough // also fill the cache from requests
+	case header.ARPReply:
+		sender := tcpip.Address(arpHdr.ProtocolAddressSender())
+		e.linkAddrCache.AddLinkAddress(e.nicID, sender, tcpip.LinkAddress(arpHdr.HardwareAddressSender()))
+	}
+}
+
+// protocol implements stack.NetworkProtocol and stack.LinkAddressResolver.
+// It carries no state.
+type protocol struct{}
+
+func (p *protocol) Number() tcpip.NetworkProtocolNumber { return ProtocolNumber }
+func (p *protocol) MinimumPacketSize() int              { return header.ARPSize }
+func (p *protocol) DefaultPrefixLen() int               { return 0 }
+
+// ParseAddresses returns v's ARP sender protocol address and ProtocolAddress.
+func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	return tcpip.Address(header.ARP(v).ProtocolAddressSender()), ProtocolAddress
+}
+
+// NewEndpoint creates an ARP endpoint for nicID that sends through sender.
+// Only ProtocolAddress is accepted as the local address; anything else yields
+// tcpip.ErrBadLocalAddress. dispatcher and st are not used.
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, sender stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
+	if addrWithPrefix.Address != ProtocolAddress {
+		return nil, tcpip.ErrBadLocalAddress
+	}
+	ep := &endpoint{protocol: p, nicID: nicID, linkEP: sender}
+	ep.linkAddrCache = linkAddrCache
+	return ep, nil
+}
+
+// LinkAddressProtocol implements stack.LinkAddressResolver.LinkAddressProtocol.
+func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return header.IPv4ProtocolNumber // ARP resolves IPv4 addresses
+}
+
+// LinkAddressRequest implements stack.LinkAddressResolver.LinkAddressRequest.
+// It writes an ARP request for addr to linkEP, addressed to broadcastMAC, with
+// localAddr and linkEP's own link address filled in as the sender.
+func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error {
+	hdr := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.ARPSize)
+	req := header.ARP(hdr.Prepend(header.ARPSize))
+	req.SetIPv4OverEthernet()
+	req.SetOp(header.ARPRequest)
+	copy(req.HardwareAddressSender(), linkEP.LinkAddress())
+	copy(req.ProtocolAddressSender(), localAddr)
+	copy(req.ProtocolAddressTarget(), addr)
+
+	// The route only carries the destination link address.
+	route := &stack.Route{RemoteLinkAddress: broadcastMAC}
+	return linkEP.WritePacket(route, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{
+		Header: hdr,
+	})
+}
+
+// ResolveStaticAddress implements stack.LinkAddressResolver.ResolveStaticAddress.
+func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	switch {
+	case addr == header.IPv4Broadcast:
+		return broadcastMAC, true
+	case header.IsV4MulticastAddress(addr):
+		return header.EthernetAddressFromMulticastIPv4Address(addr), true
+	}
+	return tcpip.LinkAddress([]byte(nil)), false
+}
+
+// SetOption implements stack.NetworkProtocol.SetOption.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption // no option is recognised
+}
+
+// Option implements stack.NetworkProtocol.Option.
+func (*protocol) Option(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption // no option is recognised
+}
+
+// Close implements stack.NetworkProtocol.Close. It is a no-op.
+func (*protocol) Close() {}
+
+// Wait implements stack.NetworkProtocol.Wait. It is a no-op.
+func (*protocol) Wait() {}
+
+// Parse implements stack.NetworkProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.ARPSize)
+	if !ok {
+		return 0, false, false // PullUp could not provide header.ARPSize bytes
+	}
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(header.ARPSize)
+	return 0, false, true // no transport protocol or header is reported for ARP
+}
+
+var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff})
+
+// NewProtocol returns an ARP network protocol.
+func NewProtocol() stack.NetworkProtocol {
+	return &protocol{}
+}
diff --git a/pkg/tcpip/network/arp/arp_test.go b/pkg/tcpip/network/arp/arp_test.go
new file mode 100644
index 000000000..66e67429c
--- /dev/null
+++ b/pkg/tcpip/network/arp/arp_test.go
@@ -0,0 +1,146 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package arp_test
+
+import (
+	"context"
+	"strconv"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+)
+
+const (
+	stackLinkAddr = tcpip.LinkAddress("\x0a\x0a\x0b\x0b\x0c\x0c")
+	stackAddr1    = tcpip.Address("\x0a\x00\x00\x01")
+	stackAddr2    = tcpip.Address("\x0a\x00\x00\x02")
+	stackAddrBad  = tcpip.Address("\x0a\x00\x00\x03") // never added to the stack
+)
+
+type testContext struct {
+	t      *testing.T
+	linkEP *channel.Endpoint
+	s      *stack.Stack
+}
+
+func newTestContext(t *testing.T) *testContext {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), arp.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol4()},
+	})
+
+	const defaultMTU = 65536
+	ep := channel.New(256, defaultMTU, stackLinkAddr)
+	wep := stack.LinkEndpoint(ep)
+	// In verbose mode the NIC is created on a sniffer wrapping ep.
+	if testing.Verbose() {
+		wep = sniffer.New(ep)
+	}
+	if err := s.CreateNIC(1, wep); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+	// NIC 1 gets two IPv4 addresses plus the ARP protocol address.
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr1); err != nil {
+		t.Fatalf("AddAddress for ipv4 failed: %v", err)
+	}
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr2); err != nil {
+		t.Fatalf("AddAddress for ipv4 failed: %v", err)
+	}
+	if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+		t.Fatalf("AddAddress for arp failed: %v", err)
+	}
+	// A single route sends every IPv4 destination through NIC 1.
+	s.SetRouteTable([]tcpip.Route{{
+		Destination: header.IPv4EmptySubnet,
+		NIC:         1,
+	}})
+	// The context keeps the raw channel endpoint, not the sniffer wrapper.
+	return &testContext{
+		t:      t,
+		s:      s,
+		linkEP: ep,
+	}
+}
+
+func (c *testContext) cleanup() {
+	c.linkEP.Close()
+}
+
+func TestDirectRequest(t *testing.T) {
+	c := newTestContext(t)
+	defer c.cleanup()
+
+	const senderMAC = "\x01\x02\x03\x04\x05\x06"
+	const senderIPv4 = "\x0a\x00\x00\x02"
+	// v holds the ARP request; h is a header view over the same bytes.
+	v := make(buffer.View, header.ARPSize)
+	h := header.ARP(v)
+	h.SetIPv4OverEthernet()
+	h.SetOp(header.ARPRequest)
+	copy(h.HardwareAddressSender(), senderMAC)
+	copy(h.ProtocolAddressSender(), senderIPv4)
+	// inject sets the request's target address and delivers it as inbound ARP.
+	inject := func(addr tcpip.Address) {
+		copy(h.ProtocolAddressTarget(), addr)
+		c.linkEP.InjectInbound(arp.ProtocolNumber, &stack.PacketBuffer{
+			Data: v.ToVectorisedView(),
+		})
+	}
+	// Requests for both configured addresses must each produce a valid reply.
+	for i, address := range []tcpip.Address{stackAddr1, stackAddr2} {
+		t.Run(strconv.Itoa(i), func(t *testing.T) {
+			inject(address)
+			pi, _ := c.linkEP.ReadContext(context.Background())
+			if pi.Proto != arp.ProtocolNumber {
+				t.Fatalf("expected ARP response, got network protocol number %d", pi.Proto)
+			}
+			rep := header.ARP(pi.Pkt.Header.View())
+			if !rep.IsValid() {
+				t.Fatalf("invalid ARP response pi.Pkt.Header.UsedLength()=%d", pi.Pkt.Header.UsedLength())
+			}
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressSender()), stackLinkAddr; got != want {
+				t.Errorf("got HardwareAddressSender = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.Address(rep.ProtocolAddressSender()), tcpip.Address(h.ProtocolAddressTarget()); got != want {
+				t.Errorf("got ProtocolAddressSender = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.LinkAddress(rep.HardwareAddressTarget()), tcpip.LinkAddress(h.HardwareAddressSender()); got != want {
+				t.Errorf("got HardwareAddressTarget = %s, want = %s", got, want)
+			}
+			if got, want := tcpip.Address(rep.ProtocolAddressTarget()), tcpip.Address(h.ProtocolAddressSender()); got != want {
+				t.Errorf("got ProtocolAddressTarget = %s, want = %s", got, want)
+			}
+		})
+	}
+	// A request for an address the stack does not own must go unanswered.
+	inject(stackAddrBad)
+	// Sleep tests are gross, but this will only potentially flake
+	// if there's a bug. If there is no bug this will reliably
+	// succeed.
+	ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+	defer cancel()
+	if pkt, ok := c.linkEP.ReadContext(ctx); ok {
+		t.Errorf("stackAddrBad: unexpected packet sent, Proto=%v", pkt.Proto)
+	}
+}
diff --git a/pkg/tcpip/network/fragmentation/BUILD b/pkg/tcpip/network/fragmentation/BUILD
new file mode 100644
index 000000000..d1c728ccf
--- /dev/null
+++ b/pkg/tcpip/network/fragmentation/BUILD
@@ -0,0 +1,45 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "reassembler_list",
+    out = "reassembler_list.go",
+    package = "fragmentation",
+    prefix = "reassembler",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*reassembler",
+        "Linker": "*reassembler",
+    },
+)
+
+go_library(
+    name = "fragmentation",
+    srcs = [
+        "frag_heap.go",
+        "fragmentation.go",
+        "reassembler.go",
+        "reassembler_list.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+    ],
+)
+
+go_test(
+    name = "fragmentation_test",
+    size = "small",
+    srcs = [
+        "frag_heap_test.go",
+        "fragmentation_test.go",
+        "reassembler_test.go",
+    ],
+    library = ":fragmentation",
+    deps = ["//pkg/tcpip/buffer"],
+)
diff --git a/pkg/tcpip/network/fragmentation/frag_heap.go b/pkg/tcpip/network/fragmentation/frag_heap.go
new file mode 100644
index 000000000..0b570d25a
--- /dev/null
+++ b/pkg/tcpip/network/fragmentation/frag_heap.go
@@ -0,0 +1,77 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fragmentation
+
+import (
+	"container/heap"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+type fragment struct {
+	offset uint16                // position of vv in the packet, compared against summed vv sizes
+	vv     buffer.VectorisedView // payload carried by this fragment
+}
+
+// fragHeap is a min-heap of fragments keyed by offset. It implements
+// heap.Interface and is meant to be driven through container/heap.
+type fragHeap []fragment
+
+func (h *fragHeap) Len() int { return len(*h) }
+
+func (h *fragHeap) Less(i, j int) bool {
+	frags := *h
+	return frags[i].offset < frags[j].offset
+}
+
+func (h *fragHeap) Swap(i, j int) {
+	frags := *h
+	frags[i], frags[j] = frags[j], frags[i]
+}
+
+func (h *fragHeap) Push(x interface{}) { *h = append(*h, x.(fragment)) }
+
+// Pop removes and returns the last element, as container/heap expects.
+func (h *fragHeap) Pop() interface{} {
+	last := len(*h) - 1
+	item := (*h)[last]
+	*h = (*h)[:last]
+	return item
+}
+
+// reassemble pops the fragments in offset order and joins them into a single
+// VectorisedView. An empty heap, a non-zero first offset or a gap is an error.
+func (h *fragHeap) reassemble() (buffer.VectorisedView, error) {
+	if h.Len() == 0 {
+		return buffer.VectorisedView{}, fmt.Errorf("no fragments to reassemble")
+	}
+	curr := heap.Pop(h).(fragment)
+	if curr.offset != 0 {
+		return buffer.VectorisedView{}, fmt.Errorf("offset of the first packet is != 0 (%d)", curr.offset)
+	}
+	views, size := curr.vv.Views(), curr.vv.Size()
+	for h.Len() > 0 {
+		next := heap.Pop(h).(fragment)
+		if int(next.offset) < size {
+			next.vv.TrimFront(size - int(next.offset)) // drop bytes already covered
+		} else if int(next.offset) > size {
+			return buffer.VectorisedView{}, fmt.Errorf("packet has a hole, expected offset %d, got %d", size, next.offset)
+		}
+		size += next.vv.Size()
+		views = append(views, next.vv.Views()...)
+	}
+	return buffer.NewVectorisedView(size, views), nil
+}
diff --git a/pkg/tcpip/network/fragmentation/frag_heap_test.go b/pkg/tcpip/network/fragmentation/frag_heap_test.go
new file mode 100644
index 000000000..9ececcb9f
--- /dev/null
+++ b/pkg/tcpip/network/fragmentation/frag_heap_test.go
@@ -0,0 +1,126 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fragmentation
+
+import (
+	"container/heap"
+	"reflect"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+var reassambleTestCases = []struct {
+	comment string
+	in      []fragment
+	want    buffer.VectorisedView
+}{
+	{
+		comment: "Non-overlapping in-order",
+		in: []fragment{
+			{offset: 0, vv: vv(1, "0")},
+			{offset: 1, vv: vv(1, "1")},
+		},
+		want: vv(2, "0", "1"),
+	},
+	{
+		comment: "Non-overlapping out-of-order",
+		in: []fragment{
+			{offset: 1, vv: vv(1, "1")},
+			{offset: 0, vv: vv(1, "0")},
+		},
+		want: vv(2, "0", "1"),
+	},
+	{
+		comment: "Duplicated packets",
+		in: []fragment{
+			{offset: 0, vv: vv(1, "0")},
+			{offset: 0, vv: vv(1, "0")},
+		},
+		want: vv(1, "0"),
+	},
+	{
+		comment: "Overlapping in-order",
+		in: []fragment{
+			{offset: 0, vv: vv(2, "01")},
+			{offset: 1, vv: vv(2, "12")},
+		},
+		want: vv(3, "01", "2"),
+	},
+	{
+		comment: "Overlapping out-of-order",
+		in: []fragment{
+			{offset: 1, vv: vv(2, "12")},
+			{offset: 0, vv: vv(2, "01")},
+		},
+		want: vv(3, "01", "2"),
+	},
+	{
+		comment: "Overlapping subset in-order",
+		in: []fragment{
+			{offset: 0, vv: vv(3, "012")},
+			{offset: 1, vv: vv(1, "1")},
+		},
+		want: vv(3, "012"),
+	},
+	{
+		comment: "Overlapping subset out-of-order",
+		in: []fragment{
+			{offset: 1, vv: vv(1, "1")},
+			{offset: 0, vv: vv(3, "012")},
+		},
+		want: vv(3, "012"),
+	},
+}
+
+// TestReassamble pushes the fragments of each case onto a heap and checks
+// that reassemble joins them into the expected view.
+func TestReassamble(t *testing.T) {
+	for _, tc := range reassambleTestCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			frags := make(fragHeap, 0, 8)
+			heap.Init(&frags)
+			for i := range tc.in {
+				heap.Push(&frags, tc.in[i])
+			}
+			if got, err := frags.reassemble(); err != nil {
+				t.Fatal(err)
+			} else if !reflect.DeepEqual(got, tc.want) {
+				t.Errorf("got reassemble(%+v) = %v, want = %v", tc.in, got, tc.want)
+			}
+		})
+	}
+}
+
+// TestReassambleFailsForNonZeroOffset: the lowest offset must be 0.
+func TestReassambleFailsForNonZeroOffset(t *testing.T) {
+	frags := make(fragHeap, 0, 8)
+	heap.Init(&frags)
+	heap.Push(&frags, fragment{offset: 1, vv: vv(1, "0")})
+	if _, err := frags.reassemble(); err == nil {
+		t.Errorf("reassemble() did not fail when the first packet had offset != 0")
+	}
+}
+
+// TestReassambleFailsForHoles: a gap between fragments must be an error.
+func TestReassambleFailsForHoles(t *testing.T) {
+	frags := make(fragHeap, 0, 8)
+	heap.Init(&frags)
+	heap.Push(&frags, fragment{offset: 0, vv: vv(1, "0")})
+	heap.Push(&frags, fragment{offset: 2, vv: vv(1, "1")})
+	if _, err := frags.reassemble(); err == nil {
+		t.Errorf("reassemble() did not fail when there was a hole in the packet")
+	}
+}
diff --git a/pkg/tcpip/network/fragmentation/fragmentation.go b/pkg/tcpip/network/fragmentation/fragmentation.go
new file mode 100644
index 000000000..2982450f8
--- /dev/null
+++ b/pkg/tcpip/network/fragmentation/fragmentation.go
@@ -0,0 +1,144 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fragmentation contains the implementation of IP fragmentation.
+// It is based on RFC 791 and RFC 815.
+package fragmentation
+
+import (
+	"fmt"
+	"log"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// DefaultReassembleTimeout is based on the linux stack: net.ipv4.ipfrag_time.
+const DefaultReassembleTimeout = 30 * time.Second
+
+// HighFragThreshold is the threshold at which we start trimming old
+// fragmented packets. Linux uses a default value of 4 MB. See
+// net.ipv4.ipfrag_high_thresh for more information.
+const HighFragThreshold = 4 << 20 // 4MB
+
+// LowFragThreshold is the threshold we shrink down to once we start dropping
+// older fragmented packets. It's important that we keep enough room for newer
+// packets to be re-assembled. Hence, this needs to be sufficiently lower than
+// HighFragThreshold. Linux uses a default value of 3 MB. See
+// net.ipv4.ipfrag_low_thresh for more information.
+const LowFragThreshold = 3 << 20 // 3MB
+
+// Fragmentation is the main structure that other modules
+// of the stack should use to implement IP Fragmentation.
+type Fragmentation struct {
+	mu           sync.Mutex              // held by Process around accesses to reassemblers, rList and size
+	highLimit    int                     // eviction starts once size exceeds this
+	lowLimit     int                     // eviction stops once size is no larger than this
+	reassemblers map[uint32]*reassembler // in-progress reassemblers keyed by fragmentation id
+	rList        reassemblerList         // same reassemblers, newest at the front, evicted from the back
+	size         int                     // total size of the fragments held by all reassemblers
+	timeout      time.Duration           // age after which Process discards a reassembler
+}
+
+// NewFragmentation creates a new Fragmentation.
+//
+// highMemoryLimit specifies the limit on the memory consumed
+// by the fragments stored by Fragmentation (overhead of internal data-structures
+// is not accounted). Fragments are dropped when the limit is exceeded.
+//
+// lowMemoryLimit specifies the level that dropping fragments brings memory
+// usage back down to after highMemoryLimit has been exceeded.
+//
+// reassemblingTimeout specifies the maximum time allowed to reassemble a packet.
+// Fragments are lazily evicted only when a new packet with an
+// already existing fragmentation-id arrives after the timeout.
+func NewFragmentation(highMemoryLimit, lowMemoryLimit int, reassemblingTimeout time.Duration) *Fragmentation {
+	low := lowMemoryLimit
+	if low >= highMemoryLimit {
+		low = highMemoryLimit
+	}
+	// The lower bound is applied last so it wins over a negative highMemoryLimit.
+	if low < 0 {
+		low = 0
+	}
+	return &Fragmentation{
+		highLimit:    highMemoryLimit,
+		lowLimit:     low,
+		timeout:      reassemblingTimeout,
+		reassemblers: make(map[uint32]*reassembler),
+	}
+}
+
+// Process processes an incoming fragment belonging to an ID and returns a
+// complete packet and true once all the fragments of that ID have been received.
+func (f *Fragmentation) Process(id uint32, first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, error) {
+	f.mu.Lock()
+	r, ok := f.reassemblers[id]
+	if ok && r.tooOld(f.timeout) {
+		// This is very likely to be an id-collision or someone performing a slow-rate attack.
+		f.release(r)
+		ok = false
+	}
+	if !ok {
+		r = newReassembler(id)
+		f.reassemblers[id] = r
+		f.rList.PushFront(r)
+	}
+	f.mu.Unlock()
+
+	res, done, consumed, err := r.process(first, last, more, vv)
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	// Account for the stored bytes even on failure: r.size already includes
+	// them, so releasing r without this would subtract bytes never added.
+	f.size += consumed
+	if err != nil {
+		// We probably got an invalid sequence of fragments. Just
+		// discard the reassembler and move on.
+		f.release(r)
+		return buffer.VectorisedView{}, false, fmt.Errorf("fragmentation processing error: %v", err)
+	}
+	if done {
+		f.release(r)
+	}
+	// Evict reassemblers if we are consuming more memory than highLimit until
+	// we reach lowLimit.
+	if f.size > f.highLimit {
+		for f.size > f.lowLimit {
+			tail := f.rList.Back()
+			if tail == nil {
+				break
+			}
+			f.release(tail)
+		}
+	}
+	return res, done, nil
+}
+
+// release removes r from the map and the list and subtracts its size from
+// f.size. It does nothing if r was already marked as done. Every caller in
+// this file holds f.mu.
+func (f *Fragmentation) release(r *reassembler) {
+	if alreadyDone := r.checkDoneOrMark(); alreadyDone {
+		// Released earlier; doing it again would subtract r.size twice.
+		return
+	}
+	delete(f.reassemblers, r.id)
+	f.rList.Remove(r)
+	if f.size -= r.size; f.size < 0 {
+		log.Printf("memory counter < 0 (%d), this is an accounting bug that requires investigation", f.size)
+		f.size = 0
+	}
+}
diff --git a/pkg/tcpip/network/fragmentation/fragmentation_test.go b/pkg/tcpip/network/fragmentation/fragmentation_test.go
new file mode 100644
index 000000000..72c0f53be
--- /dev/null
+++ b/pkg/tcpip/network/fragmentation/fragmentation_test.go
@@ -0,0 +1,165 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fragmentation
+
+import (
+	"reflect"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// vv builds a VectorisedView of the given size holding one view per piece.
+// It is a helper shared by the tests of this package.
+func vv(size int, pieces ...string) buffer.VectorisedView {
+	views := make([]buffer.View, 0, len(pieces))
+	for _, piece := range pieces {
+		views = append(views, buffer.View(piece))
+	}
+	return buffer.NewVectorisedView(size, views)
+}
+
+type processInput struct {
+	id    uint32
+	first uint16
+	last  uint16
+	more  bool
+	vv    buffer.VectorisedView
+}
+
+type processOutput struct {
+	vv   buffer.VectorisedView
+	done bool
+}
+
+var processTestCases = []struct {
+	comment string
+	in      []processInput
+	out     []processOutput
+}{
+	{
+		comment: "One ID",
+		in: []processInput{
+			{id: 0, first: 0, last: 1, more: true, vv: vv(2, "01")},
+			{id: 0, first: 2, last: 3, more: false, vv: vv(2, "23")},
+		},
+		out: []processOutput{
+			{vv: buffer.VectorisedView{}, done: false},
+			{vv: vv(4, "01", "23"), done: true},
+		},
+	},
+	{
+		comment: "Two IDs",
+		in: []processInput{
+			{id: 0, first: 0, last: 1, more: true, vv: vv(2, "01")},
+			{id: 1, first: 0, last: 1, more: true, vv: vv(2, "ab")},
+			{id: 1, first: 2, last: 3, more: false, vv: vv(2, "cd")},
+			{id: 0, first: 2, last: 3, more: false, vv: vv(2, "23")},
+		},
+		out: []processOutput{
+			{vv: buffer.VectorisedView{}, done: false},
+			{vv: buffer.VectorisedView{}, done: false},
+			{vv: vv(4, "ab", "cd"), done: true},
+			{vv: vv(4, "01", "23"), done: true},
+		},
+	},
+}
+
+// TestFragmentationProcess checks Process's outputs and cleanup for each case.
+func TestFragmentationProcess(t *testing.T) {
+	for _, tc := range processTestCases {
+		t.Run(tc.comment, func(t *testing.T) {
+			f := NewFragmentation(1024, 512, DefaultReassembleTimeout)
+			for i, in := range tc.in {
+				want := tc.out[i]
+				vv, done, err := f.Process(in.id, in.first, in.last, in.more, in.vv)
+				if err != nil {
+					t.Fatalf("f.Process(%+v, %+d, %+d, %t, %+v) failed: %v", in.id, in.first, in.last, in.more, in.vv, err)
+				}
+				if !reflect.DeepEqual(vv, want.vv) {
+					t.Errorf("got Process(%d) = %+v, want = %+v", i, vv, want.vv)
+				}
+				if done != want.done {
+					t.Errorf("got Process(%d) = %+v, want = %+v", i, done, want.done)
+				}
+				if _, ok := f.reassemblers[in.id]; ok && want.done {
+					t.Errorf("Process(%d) did not remove buffer from reassemblers", i)
+				}
+				for n := f.rList.Front(); want.done && n != nil; n = n.Next() {
+					if n.id == in.id {
+						t.Errorf("Process(%d) did not remove buffer from rList", i)
+					}
+				}
+			}
+		})
+	}
+}
+
+func TestReassemblingTimeout(t *testing.T) {
+	timeout := time.Millisecond
+	f := NewFragmentation(1024, 512, timeout)
+	// Send first fragment with id = 0, first = 0, last = 0, and more = true.
+	f.Process(0, 0, 0, true, vv(1, "0"))
+	// Sleep more than the timeout.
+	time.Sleep(2 * timeout)
+	// Send another fragment that completes a packet.
+	// However, no packet should be reassembled because the fragment arrived after the timeout.
+	_, done, err := f.Process(0, 1, 1, false, vv(1, "1"))
+	if err != nil {
+		t.Fatalf("f.Process(0, 1, 1, false, vv(1, \"1\")) failed: %v", err)
+	}
+	if done {
+		t.Errorf("Fragmentation does not respect the reassembling timeout.")
+	}
+}
+
+// TestMemoryLimits checks that going over the high limit evicts the oldest
+// reassemblers while the most recent one is kept.
+func TestMemoryLimits(t *testing.T) {
+	f := NewFragmentation(3, 1, DefaultReassembleTimeout)
+
+	// Send the first fragment of ids 0, 1, 2 and 3, in that order, each
+	// carrying a payload of size 1. The fragment with id = 3 takes the
+	// total over the high limit, which should cause id = 0 and id = 1 to
+	// be evicted.
+	for id, payload := range []string{"0", "1", "2", "3"} {
+		f.Process(uint32(id), 0, 0, true, vv(1, payload))
+	}
+
+	if _, present := f.reassemblers[0]; present {
+		t.Errorf("Memory limits are not respected: id=0 has not been evicted.")
+	}
+	if _, present := f.reassemblers[1]; present {
+		t.Errorf("Memory limits are not respected: id=1 has not been evicted.")
+	}
+	if _, present := f.reassemblers[3]; !present {
+		t.Errorf("Implementation of memory limits is wrong: id=3 is not present.")
+	}
+}
+
+// TestMemoryLimitsIgnoresDuplicates checks that receiving the same fragment
+// twice is only accounted for once in f.size.
+func TestMemoryLimitsIgnoresDuplicates(t *testing.T) {
+	f := NewFragmentation(1, 0, DefaultReassembleTimeout)
+	// Send the first fragment with id = 0, then the very same one again.
+	for i := 0; i < 2; i++ {
+		f.Process(0, 0, 0, true, vv(1, "0"))
+	}
+
+	if got, want := f.size, 1; got != want {
+		t.Errorf("Wrong size, duplicates are not handled correctly: got=%d, want=%d.", got, want)
+	}
+}
diff --git a/pkg/tcpip/network/fragmentation/reassembler.go b/pkg/tcpip/network/fragmentation/reassembler.go
new file mode 100644
index 000000000..0a83d81f2
--- /dev/null
+++ b/pkg/tcpip/network/fragmentation/reassembler.go
@@ -0,0 +1,118 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fragmentation
+
+import (
+	"container/heap"
+	"fmt"
+	"math"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+type hole struct {
+	first   uint16 // first offset of the missing range (inclusive)
+	last    uint16 // last offset of the missing range (inclusive)
+	deleted bool   // set once a fragment overlaps the hole; holes are never removed
+}
+
+type reassembler struct {
+	reassemblerEntry
+	id           uint32     // fragmentation id passed to newReassembler
+	size         int        // sum of the sizes of the fragments stored so far
+	mu           sync.Mutex // held by process and checkDoneOrMark
+	holes        []hole     // every hole ever created, deleted ones included
+	deleted      int        // number of entries in holes marked as deleted
+	heap         fragHeap   // fragments that filled at least part of a hole
+	done         bool       // set by checkDoneOrMark when the reassembler is released
+	creationTime time.Time  // set at construction; tooOld measures age from it
+}
+
+// newReassembler returns a reassembler for id, stamped with the current
+// time, whose only hole spans the whole offset range [0, math.MaxUint16],
+// i.e. nothing has been received yet.
+func newReassembler(id uint32) *reassembler {
+	// Room for 16 holes up front; the single initial one covers everything.
+	holes := make([]hole, 1, 16)
+	holes[0] = hole{first: 0, last: math.MaxUint16}
+	return &reassembler{
+		id:           id,
+		holes:        holes,
+		heap:         make(fragHeap, 0, 8),
+		creationTime: time.Now(),
+	}
+}
+
+// updateHoles splits every live hole overlapping [first, last] and reports
+// whether at least one hole was hit. Only holes present on entry are visited.
+func (r *reassembler) updateHoles(first, last uint16, more bool) bool {
+	filled := false
+	for i, cur := range r.holes {
+		if cur.deleted || cur.last < first || cur.first > last {
+			continue
+		}
+		filled = true
+		r.holes[i].deleted = true
+		r.deleted++
+		if cur.first < first {
+			r.holes = append(r.holes, hole{first: cur.first, last: first - 1})
+		}
+		if more && cur.last > last {
+			r.holes = append(r.holes, hole{first: last + 1, last: cur.last})
+		}
+	}
+	return filled
+}
+
+// process stores the fragment [first, last] if it fills a hole and returns the
+// reassembled packet, whether it is complete, the bytes stored, and any error.
+func (r *reassembler) process(first, last uint16, more bool, vv buffer.VectorisedView) (buffer.VectorisedView, bool, int, error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	consumed := 0
+	if r.done {
+		// Already released by a concurrent goroutine; nothing to do.
+		return buffer.VectorisedView{}, false, consumed, nil
+	}
+	if r.updateHoles(first, last, more) {
+		heap.Push(&r.heap, fragment{offset: first, vv: vv.Clone(nil)})
+		consumed = vv.Size()
+		r.size += consumed
+	}
+	// Not ready while holes remain. An empty heap with no holes left means the
+	// packet was already reassembled but not yet released; don't pop it again.
+	if r.deleted < len(r.holes) || r.heap.Len() == 0 {
+		return buffer.VectorisedView{}, false, consumed, nil
+	}
+	res, err := r.heap.reassemble()
+	if err != nil {
+		return buffer.VectorisedView{}, false, consumed, fmt.Errorf("fragment reassembly failed: %v", err)
+	}
+	return res, true, consumed, nil
+}
+
+// tooOld reports whether strictly more than timeout has elapsed since r was
+// created.
+func (r *reassembler) tooOld(timeout time.Duration) bool { return time.Since(r.creationTime) > timeout }
+
+// checkDoneOrMark marks r as done and reports whether it was done already.
+func (r *reassembler) checkDoneOrMark() (wasDone bool) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	wasDone, r.done = r.done, true
+	return wasDone
+}
diff --git a/pkg/tcpip/network/fragmentation/reassembler_test.go b/pkg/tcpip/network/fragmentation/reassembler_test.go
new file mode 100644
index 000000000..7eee0710d
--- /dev/null
+++ b/pkg/tcpip/network/fragmentation/reassembler_test.go
@@ -0,0 +1,105 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fragmentation
+
+import (
+	"math"
+	"reflect"
+	"testing"
+)
+
+type updateHolesInput struct {
+	first uint16
+	last  uint16
+	more  bool
+}
+
+var holesTestCases = []struct {
+	comment string
+	in      []updateHolesInput
+	want    []hole
+}{
+	{
+		comment: "No fragments. Expected holes: {[0 -> inf]}.",
+		in:      []updateHolesInput{},
+		want:    []hole{{first: 0, last: math.MaxUint16, deleted: false}},
+	},
+	{
+		comment: "One fragment at beginning. Expected holes: {[2, inf]}.",
+		in:      []updateHolesInput{{first: 0, last: 1, more: true}},
+		want: []hole{
+			{first: 0, last: math.MaxUint16, deleted: true},
+			{first: 2, last: math.MaxUint16, deleted: false},
+		},
+	},
+	{
+		comment: "One fragment in the middle. Expected holes: {[0, 0], [3, inf]}.",
+		in:      []updateHolesInput{{first: 1, last: 2, more: true}},
+		want: []hole{
+			{first: 0, last: math.MaxUint16, deleted: true},
+			{first: 0, last: 0, deleted: false},
+			{first: 3, last: math.MaxUint16, deleted: false},
+		},
+	},
+	{
+		comment: "One fragment at the end. Expected holes: {[0, 0]}.",
+		in:      []updateHolesInput{{first: 1, last: 2, more: false}},
+		want: []hole{
+			{first: 0, last: math.MaxUint16, deleted: true},
+			{first: 0, last: 0, deleted: false},
+		},
+	},
+	{
+		comment: "One fragment completing a packet. Expected holes: {}.",
+		in:      []updateHolesInput{{first: 0, last: 1, more: false}},
+		want: []hole{
+			{first: 0, last: math.MaxUint16, deleted: true},
+		},
+	},
+	{
+		comment: "Two non-overlapping fragments completing a packet. Expected holes: {}.",
+		in: []updateHolesInput{
+			{first: 0, last: 1, more: true},
+			{first: 2, last: 3, more: false},
+		},
+		want: []hole{
+			{first: 0, last: math.MaxUint16, deleted: true},
+			{first: 2, last: math.MaxUint16, deleted: true},
+		},
+	},
+	{
+		comment: "Two overlapping fragments completing a packet. Expected holes: {}.",
+		in: []updateHolesInput{
+			{first: 0, last: 2, more: true},
+			{first: 2, last: 3, more: false},
+		},
+		want: []hole{
+			{first: 0, last: math.MaxUint16, deleted: true},
+			{first: 3, last: math.MaxUint16, deleted: true},
+		},
+	},
+}
+
+func TestUpdateHoles(t *testing.T) {
+	for _, c := range holesTestCases { // each case starts from a fresh reassembler
+		r := newReassembler(0)
+		for _, in := range c.in {
+			r.updateHoles(in.first, in.last, in.more)
+		}
+		if !reflect.DeepEqual(r.holes, c.want) {
+			t.Errorf("Test \"%s\" produced unexpected holes. Got %v. Want %v", c.comment, r.holes, c.want)
+		}
+	}
+}
diff --git a/pkg/tcpip/network/hash/BUILD b/pkg/tcpip/network/hash/BUILD
new file mode 100644
index 000000000..872165866
--- /dev/null
+++ b/pkg/tcpip/network/hash/BUILD
@@ -0,0 +1,13 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "hash",
+    srcs = ["hash.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/rand",
+        "//pkg/tcpip/header",
+    ],
+)
diff --git a/pkg/tcpip/network/hash/hash.go b/pkg/tcpip/network/hash/hash.go
new file mode 100644
index 000000000..8f65713c5
--- /dev/null
+++ b/pkg/tcpip/network/hash/hash.go
@@ -0,0 +1,93 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package hash contains utility functions for hashing.
+package hash
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+var hashIV = RandN32(1)[0] // random seed drawn once at package init and fed to every fragment hash
+
+// RandN32 generates a slice of n cryptographic random 32-bit numbers, panicking on read failure.
+func RandN32(n int) []uint32 {
+	raw := make([]byte, 4*n)
+	if _, err := rand.Read(raw); err != nil {
+		panic("unable to get random numbers: " + err.Error())
+	}
+	out := make([]uint32, 0, n)
+	for ; len(raw) >= 4; raw = raw[4:] {
+		out = append(out, binary.LittleEndian.Uint32(raw))
+	}
+	return out
+}
+
+// Hash3Words mixes the three 32-bit words a, b and c, seeded by initval, and
+// returns the resulting 32-bit hash. Adapted from the Jenkins hash in linux.
+func Hash3Words(a, b, c, initval uint32) uint32 {
+	const iv = 0xdeadbeef + (3 << 2)
+	initval += iv
+	// Offset every input word by the same seeded value before mixing.
+	a += initval
+	b += initval
+	c += initval
+	// Mixing rounds; the order matters and uint32 arithmetic wraps around.
+	c ^= b
+	c -= rol32(b, 14)
+	a ^= c
+	a -= rol32(c, 11)
+	b ^= a
+	b -= rol32(a, 25)
+	c ^= b
+	c -= rol32(b, 16)
+	a ^= c
+	a -= rol32(c, 4)
+	b ^= a
+	b -= rol32(a, 14)
+	c ^= b
+	c -= rol32(b, 24)
+
+	return c
+}
+
+// IPv4FragmentHash hashes the fields RFC 791 uses to match fragments: the
+// identification, the protocol, and the source and destination addresses.
+func IPv4FragmentHash(h header.IPv4) uint32 {
+	idProto := uint32(h.ID())<<16 | uint32(h.Protocol())
+	src, dst := h.SourceAddress(), h.DestinationAddress()
+	srcWord := uint32(src[0]) | uint32(src[1])<<8 | uint32(src[2])<<16 | uint32(src[3])<<24
+	dstWord := uint32(dst[0]) | uint32(dst[1])<<8 | uint32(dst[2])<<16 | uint32(dst[3])<<24
+	return Hash3Words(idProto, srcWord, dstWord, hashIV)
+}
+
+// IPv6FragmentHash computes the hash of the IPv6 fragment from id and the
+// first four bytes of the source and destination addresses.
+// Unlike IPv4, the protocol is not used to compute the hash; RFC 2460
+// (sec 4.5) is not very sharp on this aspect. As a reference, Linux also
+// ignores the protocol to compute the hash (inet6_hash_frag).
+func IPv6FragmentHash(h header.IPv6, id uint32) uint32 {
+	src := h.SourceAddress()
+	srcWord := uint32(src[0]) | uint32(src[1])<<8 | uint32(src[2])<<16 | uint32(src[3])<<24
+	dst := h.DestinationAddress()
+	dstWord := uint32(dst[0]) | uint32(dst[1])<<8 | uint32(dst[2])<<16 | uint32(dst[3])<<24
+	return Hash3Words(id, srcWord, dstWord, hashIV)
+}
+
+// rol32 rotates v left by shift bits. The right-shift count is reduced modulo
+// 32, so a shift of 0 returns v unchanged.
+func rol32(v, shift uint32) uint32 { return v<<shift | v>>(-shift&31) }
diff --git a/pkg/tcpip/network/ip_test.go b/pkg/tcpip/network/ip_test.go
new file mode 100644
index 000000000..7c8fb3e0a
--- /dev/null
+++ b/pkg/tcpip/network/ip_test.go
@@ -0,0 +1,673 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ip_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+)
+
+const (
+	localIpv4Addr      = "\x0a\x00\x00\x01"
+	localIpv4PrefixLen = 24 // same prefix length as ipv4SubnetMask
+	remoteIpv4Addr     = "\x0a\x00\x00\x02"
+	ipv4SubnetAddr     = "\x0a\x00\x00\x00"
+	ipv4SubnetMask     = "\xff\xff\xff\x00"
+	ipv4Gateway        = "\x0a\x00\x00\x03"
+	localIpv6Addr      = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	localIpv6PrefixLen = 120 // same prefix length as ipv6SubnetMask
+	remoteIpv6Addr     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	ipv6SubnetAddr     = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+	ipv6SubnetMask     = "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x00"
+	ipv6Gateway        = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03"
+)
+
+// testObject implements two interfaces: LinkEndpoint and TransportDispatcher.
+// The former is used to pretend that it's a link endpoint so that we can
+// inspect packets written by the network endpoints. The latter is used to
+// pretend that it's the network stack so that it can inspect incoming packets
+// that have been handled by the network endpoints.
+//
+// Packets are checked by comparing their fields/values against the expected
+// values stored in the test object itself.
+type testObject struct {
+	t        *testing.T
+	protocol tcpip.TransportProtocolNumber
+	contents []byte
+	srcAddr  tcpip.Address
+	dstAddr  tcpip.Address
+	v4       bool              // selects IPv4 (true) or IPv6 (false) header parsing in WritePacket
+	typ      stack.ControlType // expected control type in DeliverTransportControlPacket
+	extra    uint32            // expected extra value in DeliverTransportControlPacket
+
+	dataCalls    int // number of completed DeliverTransportPacket calls
+	controlCalls int // number of completed DeliverTransportControlPacket calls
+}
+
+// checkValues compares the transport protocol, payload and src & dst addresses
+// of a packet against the expectations stored in t. Protocol and address
+// mismatches are reported as errors; payload mismatches abort the test.
+func (t *testObject) checkValues(protocol tcpip.TransportProtocolNumber, vv buffer.VectorisedView, srcAddr, dstAddr tcpip.Address) {
+	payload := vv.ToView()
+	if t.protocol != protocol {
+		t.t.Errorf("protocol = %v, want %v", protocol, t.protocol)
+	}
+
+	if t.srcAddr != srcAddr {
+		t.t.Errorf("srcAddr = %v, want %v", srcAddr, t.srcAddr)
+	}
+
+	if t.dstAddr != dstAddr {
+		t.t.Errorf("dstAddr = %v, want %v", dstAddr, t.dstAddr)
+	}
+
+	if got, want := len(payload), len(t.contents); got != want {
+		t.t.Fatalf("len(payload) = %v, want %v", got, want)
+	}
+
+	for i, want := range t.contents {
+		if got := payload[i]; got != want {
+			t.t.Fatalf("payload[%v] = %v, want %v", i, got, want)
+		}
+	}
+}
+
+// DeliverTransportPacket is called by network endpoints after parsing incoming
+// packets. The test object verifies the parsed results against its expected
+// values and then counts the call in dataCalls.
+func (t *testObject) DeliverTransportPacket(r *stack.Route, protocol tcpip.TransportProtocolNumber, pkt *stack.PacketBuffer) {
+	t.checkValues(protocol, pkt.Data, r.RemoteAddress, r.LocalAddress)
+	t.dataCalls++
+}
+
+// DeliverTransportControlPacket is called by network endpoints after parsing
+// incoming control (ICMP) packets. The test object verifies the packet, the
+// control type and the extra value, then counts the call in controlCalls.
+func (t *testObject) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	t.checkValues(trans, pkt.Data, remote, local) // remote is checked as source, local as destination
+	if typ != t.typ {
+		t.t.Errorf("typ = %v, want %v", typ, t.typ)
+	}
+	if extra != t.extra {
+		t.t.Errorf("extra = %v, want %v", extra, t.extra)
+	}
+	t.controlCalls++
+}
+
+// Attach is a no-op that is only implemented to satisfy the LinkEndpoint interface.
+func (*testObject) Attach(stack.NetworkDispatcher) {}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached. It always reports true.
+func (*testObject) IsAttached() bool {
+	return true
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It just returns a constant that
+// matches the linux loopback MTU.
+func (*testObject) MTU() uint32 {
+	return 65536
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. No capability is set.
+func (*testObject) Capabilities() stack.LinkEndpointCapabilities {
+	return 0
+}
+
+// MaxHeaderLength is only implemented to satisfy the LinkEndpoint interface; it reports 0.
+func (*testObject) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint, which is always empty.
+func (*testObject) LinkAddress() tcpip.LinkAddress {
+	return ""
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It is a no-op.
+func (*testObject) Wait() {}
+
+// WritePacket is called by network endpoints after producing a packet and
+// writing it to the link endpoint. The test object parses the network header
+// (IPv4 or IPv6, depending on t.v4) and verifies the packet is as expected.
+func (t *testObject) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	var (
+		transProto tcpip.TransportProtocolNumber
+		src, dst   tcpip.Address
+	)
+
+	netHdr := pkt.Header.View()
+	if t.v4 {
+		ip := header.IPv4(netHdr)
+		transProto = tcpip.TransportProtocolNumber(ip.Protocol())
+		src, dst = ip.SourceAddress(), ip.DestinationAddress()
+	} else {
+		ip := header.IPv6(netHdr)
+		transProto = tcpip.TransportProtocolNumber(ip.NextHeader())
+		src, dst = ip.SourceAddress(), ip.DestinationAddress()
+	}
+
+	t.checkValues(transProto, pkt.Data, src, dst)
+	return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets. It always panics.
+func (*testObject) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+func (*testObject) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+	return tcpip.ErrNotSupported // raw writes are rejected by this fake endpoint
+}
+
+// buildIPv4Route returns a route from local to remote on a fresh IPv4 stack, or the first setup error.
+func buildIPv4Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
+	})
+	if err := s.CreateNIC(1, loopback.New()); err != nil {
+		return stack.Route{}, err
+	}
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, local); err != nil {
+		return stack.Route{}, err
+	}
+	s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, Gateway: ipv4Gateway, NIC: 1}})
+	return s.FindRoute(1, local, remote, ipv4.ProtocolNumber, false /* multicastLoop */)
+}
+
+// buildIPv6Route returns a route from local to remote on a fresh IPv6 stack, or the first setup error.
+func buildIPv6Route(local, remote tcpip.Address) (stack.Route, *tcpip.Error) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()},
+	})
+	if err := s.CreateNIC(1, loopback.New()); err != nil {
+		return stack.Route{}, err
+	}
+	if err := s.AddAddress(1, ipv6.ProtocolNumber, local); err != nil {
+		return stack.Route{}, err
+	}
+	s.SetRouteTable([]tcpip.Route{{Destination: header.IPv6EmptySubnet, Gateway: ipv6Gateway, NIC: 1}})
+	return s.FindRoute(1, local, remote, ipv6.ProtocolNumber, false /* multicastLoop */)
+}
+
+// buildDummyStack returns a stack with only IPv6, UDP and TCP registered and no NICs.
+func buildDummyStack() *stack.Stack {
+	network := []stack.NetworkProtocol{ipv6.NewProtocol()}
+	transport := []stack.TransportProtocol{udp.NewProtocol(), tcp.NewProtocol()}
+	return stack.New(stack.Options{NetworkProtocols: network, TransportProtocols: transport})
+}
+
+func TestIPv4Send(t *testing.T) {
+	o := testObject{t: t, v4: true}
+	proto := ipv4.NewProtocol()
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, nil, &o, buildDummyStack())
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	// Allocate and initialize the payload view.
+	payload := buffer.NewView(100)
+	for i := 0; i < len(payload); i++ {
+		payload[i] = uint8(i)
+	}
+
+	// Allocate the header buffer.
+	hdr := buffer.NewPrependable(int(ep.MaxHeaderLength()))
+
+	// Issue the write.
+	o.protocol = 123
+	o.srcAddr = localIpv4Addr
+	o.dstAddr = remoteIpv4Addr
+	o.contents = payload
+
+	r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr)
+	if err != nil {
+		t.Fatalf("could not find route: %v", err)
+	}
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: 123,
+		TTL:      123,
+		TOS:      stack.DefaultTOS,
+	}, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   payload.ToVectorisedView(),
+	}); err != nil {
+		t.Fatalf("WritePacket failed: %v", err)
+	}
+}
+
+func TestIPv4Receive(t *testing.T) {
+	o := testObject{t: t, v4: true}
+	proto := ipv4.NewProtocol()
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	totalLen := header.IPv4MinimumSize + 30
+	view := buffer.NewView(totalLen)
+	ip := header.IPv4(view)
+	ip.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TotalLength: uint16(totalLen),
+		TTL:         20,
+		Protocol:    10,
+		SrcAddr:     remoteIpv4Addr,
+		DstAddr:     localIpv4Addr,
+	})
+
+	// Make payload be non-zero.
+	for i := header.IPv4MinimumSize; i < totalLen; i++ {
+		view[i] = uint8(i)
+	}
+
+	// Give packet to ipv4 endpoint, dispatcher will validate that it's ok.
+	o.protocol = 10
+	o.srcAddr = remoteIpv4Addr
+	o.dstAddr = localIpv4Addr
+	o.contents = view[header.IPv4MinimumSize:totalLen]
+
+	r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr)
+	if err != nil {
+		t.Fatalf("could not find route: %v", err)
+	}
+	pkt := stack.PacketBuffer{Data: view.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
+	if o.dataCalls != 1 { // the call count is reported in decimal
+		t.Fatalf("Bad number of data calls: got %d, want 1", o.dataCalls)
+	}
+}
+
+func TestIPv4ReceiveControl(t *testing.T) {
+	const mtu = 0xbeef - header.IPv4MinimumSize
+	cases := []struct {
+		name           string
+		expectedCount  int
+		fragmentOffset uint16
+		code           uint8
+		expectedTyp    stack.ControlType
+		expectedExtra  uint32
+		trunc          int
+	}{
+		{"FragmentationNeeded", 1, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, 0},
+		{"Truncated (10 bytes missing)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, 10},
+		{"Truncated (missing IPv4 header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.IPv4MinimumSize + 8},
+		{"Truncated (missing 'extra info')", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, 4 + header.IPv4MinimumSize + 8},
+		{"Truncated (missing ICMP header)", 0, 0, header.ICMPv4FragmentationNeeded, stack.ControlPacketTooBig, mtu, header.ICMPv4MinimumSize + header.IPv4MinimumSize + 8},
+		{"Port unreachable", 1, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0},
+		{"Non-zero fragment offset", 0, 100, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 0},
+		{"Zero-length packet", 0, 0, header.ICMPv4PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv4MinimumSize + header.ICMPv4MinimumSize + 8},
+	}
+	r, err := buildIPv4Route(localIpv4Addr, "\x0a\x00\x00\xbb") // same address as the outer header's SrcAddr below
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			o := testObject{t: t}
+			proto := ipv4.NewProtocol()
+			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
+			if err != nil {
+				t.Fatalf("NewEndpoint failed: %v", err)
+			}
+			defer ep.Close()
+
+			const dataOffset = header.IPv4MinimumSize*2 + header.ICMPv4MinimumSize
+			view := buffer.NewView(dataOffset + 8)
+
+			// Create the outer IPv4 header; its total length reflects the truncation.
+			ip := header.IPv4(view)
+			ip.Encode(&header.IPv4Fields{
+				IHL:         header.IPv4MinimumSize,
+				TotalLength: uint16(len(view) - c.trunc),
+				TTL:         20,
+				Protocol:    uint8(header.ICMPv4ProtocolNumber),
+				SrcAddr:     "\x0a\x00\x00\xbb",
+				DstAddr:     localIpv4Addr,
+			})
+
+			// Create the ICMP header.
+			icmp := header.ICMPv4(view[header.IPv4MinimumSize:])
+			icmp.SetType(header.ICMPv4DstUnreachable)
+			icmp.SetCode(c.code)
+			icmp.SetIdent(0xdead)
+			icmp.SetSequence(0xbeef) // presumably read back as the MTU for FragmentationNeeded (see mtu above) - confirm
+
+			// Create the inner IPv4 header (the packet that triggered the ICMP error).
+			ip = header.IPv4(view[header.IPv4MinimumSize+header.ICMPv4MinimumSize:])
+			ip.Encode(&header.IPv4Fields{
+				IHL:            header.IPv4MinimumSize,
+				TotalLength:    100,
+				TTL:            20,
+				Protocol:       10,
+				FragmentOffset: c.fragmentOffset,
+				SrcAddr:        localIpv4Addr,
+				DstAddr:        remoteIpv4Addr,
+			})
+
+			// Make payload be non-zero.
+			for i := dataOffset; i < len(view); i++ {
+				view[i] = uint8(i)
+			}
+
+			// Give packet to IPv4 endpoint, dispatcher will validate that
+			// it's ok.
+			o.protocol = 10
+			o.srcAddr = remoteIpv4Addr
+			o.dstAddr = localIpv4Addr
+			o.contents = view[dataOffset:]
+			o.typ = c.expectedTyp
+			o.extra = c.expectedExtra
+
+			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv4MinimumSize))
+			if want := c.expectedCount; o.controlCalls != want {
+				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
+			}
+		})
+	}
+}
+
+func TestIPv4FragmentationReceive(t *testing.T) {
+	o := testObject{t: t, v4: true}
+	proto := ipv4.NewProtocol()
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv4Addr, localIpv4PrefixLen}, nil, &o, nil, buildDummyStack())
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	totalLen := header.IPv4MinimumSize + 24
+
+	frag1 := buffer.NewView(totalLen)
+	ip1 := header.IPv4(frag1)
+	ip1.Encode(&header.IPv4Fields{
+		IHL:            header.IPv4MinimumSize,
+		TotalLength:    uint16(totalLen),
+		TTL:            20,
+		Protocol:       10,
+		FragmentOffset: 0,
+		Flags:          header.IPv4FlagMoreFragments,
+		SrcAddr:        remoteIpv4Addr,
+		DstAddr:        localIpv4Addr,
+	})
+	// Make payload be non-zero.
+	for i := header.IPv4MinimumSize; i < totalLen; i++ {
+		frag1[i] = uint8(i)
+	}
+
+	frag2 := buffer.NewView(totalLen)
+	ip2 := header.IPv4(frag2)
+	ip2.Encode(&header.IPv4Fields{
+		IHL:            header.IPv4MinimumSize,
+		TotalLength:    uint16(totalLen),
+		TTL:            20,
+		Protocol:       10,
+		FragmentOffset: 24, // starts right after the 24 payload bytes of frag1
+		SrcAddr:        remoteIpv4Addr,
+		DstAddr:        localIpv4Addr,
+	})
+	// Make payload be non-zero.
+	for i := header.IPv4MinimumSize; i < totalLen; i++ {
+		frag2[i] = uint8(i)
+	}
+
+	// Give packet to ipv4 endpoint, dispatcher will validate that it's ok.
+	o.protocol = 10
+	o.srcAddr = remoteIpv4Addr
+	o.dstAddr = localIpv4Addr
+	o.contents = append(frag1[header.IPv4MinimumSize:totalLen], frag2[header.IPv4MinimumSize:totalLen]...)
+
+	r, err := buildIPv4Route(localIpv4Addr, remoteIpv4Addr)
+	if err != nil {
+		t.Fatalf("could not find route: %v", err)
+	}
+
+	// Send first segment; nothing may be delivered until reassembly completes.
+	pkt := stack.PacketBuffer{Data: frag1.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
+	if o.dataCalls != 0 {
+		t.Fatalf("Bad number of data calls: got %d, want 0", o.dataCalls)
+	}
+
+	// Send second segment; the reassembled packet must be delivered exactly once.
+	pkt = stack.PacketBuffer{Data: frag2.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
+	if o.dataCalls != 1 {
+		t.Fatalf("Bad number of data calls: got %d, want 1", o.dataCalls)
+	}
+}
+
+func TestIPv6Send(t *testing.T) {
+	o := testObject{t: t}
+	proto := ipv6.NewProtocol()
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, nil, &o, buildDummyStack())
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	// Allocate and initialize the payload view.
+	payload := buffer.NewView(100)
+	for i := 0; i < len(payload); i++ {
+		payload[i] = uint8(i)
+	}
+
+	// Allocate the header buffer.
+	hdr := buffer.NewPrependable(int(ep.MaxHeaderLength()))
+
+	// Issue the write.
+	o.protocol = 123
+	o.srcAddr = localIpv6Addr
+	o.dstAddr = remoteIpv6Addr
+	o.contents = payload
+
+	r, err := buildIPv6Route(localIpv6Addr, remoteIpv6Addr)
+	if err != nil {
+		t.Fatalf("could not find route: %v", err)
+	}
+	if err := ep.WritePacket(&r, nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: 123,
+		TTL:      123,
+		TOS:      stack.DefaultTOS,
+	}, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   payload.ToVectorisedView(),
+	}); err != nil {
+		t.Fatalf("WritePacket failed: %v", err)
+	}
+}
+
+func TestIPv6Receive(t *testing.T) {
+	o := testObject{t: t}
+	proto := ipv6.NewProtocol()
+	ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil, buildDummyStack())
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	totalLen := header.IPv6MinimumSize + 30
+	view := buffer.NewView(totalLen)
+	ip := header.IPv6(view)
+	ip.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(totalLen - header.IPv6MinimumSize),
+		NextHeader:    10,
+		HopLimit:      20,
+		SrcAddr:       remoteIpv6Addr,
+		DstAddr:       localIpv6Addr,
+	})
+
+	// Make payload be non-zero.
+	for i := header.IPv6MinimumSize; i < totalLen; i++ {
+		view[i] = uint8(i)
+	}
+
+	// Give packet to ipv6 endpoint, dispatcher will validate that it's ok.
+	o.protocol = 10
+	o.srcAddr = remoteIpv6Addr
+	o.dstAddr = localIpv6Addr
+	o.contents = view[header.IPv6MinimumSize:totalLen]
+
+	r, err := buildIPv6Route(localIpv6Addr, remoteIpv6Addr)
+	if err != nil {
+		t.Fatalf("could not find route: %v", err)
+	}
+
+	pkt := stack.PacketBuffer{Data: view.ToVectorisedView()}
+	proto.Parse(&pkt)
+	ep.HandlePacket(&r, &pkt)
+	if o.dataCalls != 1 { // the call count is reported in decimal
+		t.Fatalf("Bad number of data calls: got %d, want 1", o.dataCalls)
+	}
+}
+
+func TestIPv6ReceiveControl(t *testing.T) {
+	newUint16 := func(v uint16) *uint16 { return &v }
+
+	const mtu = 0xffff
+	const outerSrcAddr = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xaa"
+	cases := []struct {
+		name           string
+		expectedCount  int
+		fragmentOffset *uint16
+		typ            header.ICMPv6Type
+		code           uint8
+		expectedTyp    stack.ControlType
+		expectedExtra  uint32
+		trunc          int
+	}{
+		{"PacketTooBig", 1, nil, header.ICMPv6PacketTooBig, 0, stack.ControlPacketTooBig, mtu, 0},
+		{"Truncated (10 bytes missing)", 0, nil, header.ICMPv6PacketTooBig, 0, stack.ControlPacketTooBig, mtu, 10},
+		{"Truncated (missing IPv6 header)", 0, nil, header.ICMPv6PacketTooBig, 0, stack.ControlPacketTooBig, mtu, header.IPv6MinimumSize + 8},
+		{"Truncated PacketTooBig (missing 'extra info')", 0, nil, header.ICMPv6PacketTooBig, 0, stack.ControlPacketTooBig, mtu, 4 + header.IPv6MinimumSize + 8},
+		{"Truncated (missing ICMP header)", 0, nil, header.ICMPv6PacketTooBig, 0, stack.ControlPacketTooBig, mtu, header.ICMPv6PacketTooBigMinimumSize + header.IPv6MinimumSize + 8},
+		{"Port unreachable", 1, nil, header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 0},
+		{"Truncated DstUnreachable (missing 'extra info')", 0, nil, header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 4 + header.IPv6MinimumSize + 8},
+		{"Fragmented, zero offset", 1, newUint16(0), header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 0},
+		{"Non-zero fragment offset", 0, newUint16(100), header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 0},
+		{"Zero-length packet", 0, nil, header.ICMPv6DstUnreachable, header.ICMPv6PortUnreachable, stack.ControlPortUnreachable, 0, 2*header.IPv6MinimumSize + header.ICMPv6DstUnreachableMinimumSize + 8},
+	}
+	r, err := buildIPv6Route(
+		localIpv6Addr,
+		outerSrcAddr,
+	)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			o := testObject{t: t}
+			proto := ipv6.NewProtocol()
+			ep, err := proto.NewEndpoint(1, tcpip.AddressWithPrefix{localIpv6Addr, localIpv6PrefixLen}, nil, &o, nil, buildDummyStack())
+			if err != nil {
+				t.Fatalf("NewEndpoint failed: %v", err)
+			}
+
+			defer ep.Close()
+
+			dataOffset := header.IPv6MinimumSize*2 + header.ICMPv6MinimumSize
+			if c.fragmentOffset != nil {
+				dataOffset += header.IPv6FragmentHeaderSize
+			}
+			view := buffer.NewView(dataOffset + 8)
+
+			// Create the outer IPv6 header; its payload length reflects the truncation.
+			ip := header.IPv6(view)
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(len(view) - header.IPv6MinimumSize - c.trunc),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      20,
+				SrcAddr:       outerSrcAddr,
+				DstAddr:       localIpv6Addr,
+			})
+
+			// Create the ICMP header.
+			icmp := header.ICMPv6(view[header.IPv6MinimumSize:])
+			icmp.SetType(c.typ)
+			icmp.SetCode(c.code)
+			icmp.SetIdent(0xdead)
+			icmp.SetSequence(0xbeef)
+
+			// Create the inner IPv6 header (the packet that triggered the ICMP error).
+			ip = header.IPv6(view[header.IPv6MinimumSize+header.ICMPv6PayloadOffset:])
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: 100,
+				NextHeader:    10,
+				HopLimit:      20,
+				SrcAddr:       localIpv6Addr,
+				DstAddr:       remoteIpv6Addr,
+			})
+
+			// Build the fragmentation header if needed.
+			if c.fragmentOffset != nil {
+				ip.SetNextHeader(header.IPv6FragmentHeader)
+				frag := header.IPv6Fragment(view[2*header.IPv6MinimumSize+header.ICMPv6MinimumSize:])
+				frag.Encode(&header.IPv6FragmentFields{
+					NextHeader:     10,
+					FragmentOffset: *c.fragmentOffset,
+					M:              true,
+					Identification: 0x12345678,
+				})
+			}
+
+			// Make payload be non-zero.
+			for i := dataOffset; i < len(view); i++ {
+				view[i] = uint8(i)
+			}
+
+			// Give packet to IPv6 endpoint, dispatcher will validate that
+			// it's ok.
+			o.protocol = 10
+			o.srcAddr = remoteIpv6Addr
+			o.dstAddr = localIpv6Addr
+			o.contents = view[dataOffset:]
+			o.typ = c.expectedTyp
+			o.extra = c.expectedExtra
+
+			// Set ICMPv6 checksum; this must happen after the whole view is filled in.
+			icmp.SetChecksum(header.ICMPv6Checksum(icmp, outerSrcAddr, localIpv6Addr, buffer.VectorisedView{}))
+
+			ep.HandlePacket(&r, truncatedPacket(view, c.trunc, header.IPv6MinimumSize))
+			if want := c.expectedCount; o.controlCalls != want {
+				t.Fatalf("Bad number of control calls for %q case: got %v, want %v", c.name, o.controlCalls, want)
+			}
+		})
+	}
+}
+
+// truncatedPacket builds a PacketBuffer from view with its last trunc bytes
+// removed. When what remains can hold a network header of netHdrLen bytes,
+// those leading bytes become the packet's NetworkHeader and the rest its Data.
+// Otherwise everything that remains becomes Data.
+func truncatedPacket(view buffer.View, trunc, netHdrLen int) *stack.PacketBuffer {
+	rest := view[:len(view)-trunc]
+	pkt := &stack.PacketBuffer{}
+	if len(rest) >= netHdrLen {
+		// Split off the leading netHdrLen bytes as the network header.
+		pkt.NetworkHeader, rest = rest[:netHdrLen], rest[netHdrLen:]
+	}
+	pkt.Data = rest.ToVectorisedView()
+	return pkt
+}
diff --git a/pkg/tcpip/network/ipv4/BUILD b/pkg/tcpip/network/ipv4/BUILD
new file mode 100644
index 000000000..78420d6e6
--- /dev/null
+++ b/pkg/tcpip/network/ipv4/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "ipv4",
+    srcs = [
+        "icmp.go",
+        "ipv4.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/network/fragmentation",
+        "//pkg/tcpip/network/hash",
+        "//pkg/tcpip/stack",
+    ],
+)
+
+go_test(
+    name = "ipv4_test",
+    size = "small",
+    srcs = ["ipv4_test.go"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/tcpip/network/ipv4/icmp.go b/pkg/tcpip/network/ipv4/icmp.go
new file mode 100644
index 000000000..1b67aa066
--- /dev/null
+++ b/pkg/tcpip/network/ipv4/icmp.go
@@ -0,0 +1,167 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv4
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// handleControl handles the case when an ICMP packet contains the headers of
+// the original packet that caused the ICMP one to be sent. This information is
+// used to find out which transport endpoint must be notified about the ICMP
+// packet. Packets that cannot be attributed to this endpoint are dropped.
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return // the minimal embedded IPv4 header could not be pulled up
+	}
+	hdr := header.IPv4(h)
+
+	// We don't use IsValid() here because ICMP only requires that the IP
+	// header plus 8 bytes of the transport header be included. So it's
+	// likely that it is truncated, which would cause IsValid to return
+	// false.
+	//
+	// Drop the packet if the original source address doesn't match the
+	// endpoint's address; the basic IPv4 header was pulled up above.
+	if hdr.SourceAddress() != e.id.LocalAddress {
+		return
+	}
+
+	hlen := int(hdr.HeaderLength())
+	if pkt.Data.Size() < hlen || hdr.FragmentOffset() != 0 {
+		// We won't be able to handle this if it doesn't contain the
+		// full IPv4 header, or if it's a fragment not at offset 0
+		// (because it won't have the transport header).
+		return
+	}
+
+	// Skip the ip header, then deliver control message.
+	pkt.Data.TrimFront(hlen)
+	p := hdr.TransportProtocol()
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, hdr.DestinationAddress(), ProtocolNumber, p, typ, extra, pkt)
+}
+
+func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer) {
+	stats := r.Stats()
+	received := stats.ICMP.V4PacketsReceived
+	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+	// full explanation.
+	v, ok := pkt.Data.PullUp(header.ICMPv4MinimumSize)
+	if !ok {
+		received.Invalid.Increment()
+		return
+	}
+	h := header.ICMPv4(v)
+
+	// TODO(b/112892170): Meaningfully handle all ICMP types.
+	switch h.Type() {
+	case header.ICMPv4Echo:
+		received.Echo.Increment()
+
+		// Only send a reply if the checksum is valid.
+		wantChecksum := h.Checksum()
+		// Reset the checksum field to 0 so that we can calculate the
+		// proper checksum. We'll have to restore it before we hand the
+		// packet off.
+		h.SetChecksum(0)
+		gotChecksum := ^header.ChecksumVV(pkt.Data, 0 /* initial */)
+		if gotChecksum != wantChecksum {
+			// It's possible that a raw socket expects to receive this.
+			h.SetChecksum(wantChecksum)
+			e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
+			received.Invalid.Increment()
+			return
+		}
+
+		// It's possible that a raw socket expects to receive this.
+		h.SetChecksum(wantChecksum)
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, &stack.PacketBuffer{
+			Data:          pkt.Data.Clone(nil),
+			NetworkHeader: append(buffer.View(nil), pkt.NetworkHeader...),
+		})
+
+		vv := pkt.Data.Clone(nil)
+		vv.TrimFront(header.ICMPv4MinimumSize)
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize)
+		pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) // NOTE: shadows the pkt parameter for the rest of this case
+		copy(pkt, h)
+		pkt.SetType(header.ICMPv4EchoReply)
+		pkt.SetChecksum(0)
+		pkt.SetChecksum(^header.Checksum(pkt, header.ChecksumVV(vv, 0)))
+		sent := stats.ICMP.V4PacketsSent
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
+			Protocol: header.ICMPv4ProtocolNumber,
+			TTL:      r.DefaultTTL(),
+			TOS:      stack.DefaultTOS,
+		}, &stack.PacketBuffer{
+			Header:          hdr,
+			Data:            vv,
+			TransportHeader: buffer.View(pkt),
+		}); err != nil {
+			sent.Dropped.Increment()
+			return
+		}
+		sent.EchoReply.Increment()
+
+	case header.ICMPv4EchoReply:
+		received.EchoReply.Increment()
+
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv4ProtocolNumber, pkt)
+
+	case header.ICMPv4DstUnreachable:
+		received.DstUnreachable.Increment()
+
+		pkt.Data.TrimFront(header.ICMPv4MinimumSize) // leave only the data that follows the ICMP header
+		switch h.Code() {
+		case header.ICMPv4PortUnreachable:
+			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
+
+		case header.ICMPv4FragmentationNeeded:
+			mtu := uint32(h.MTU())
+			e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
+		}
+
+	case header.ICMPv4SrcQuench:
+		received.SrcQuench.Increment()
+
+	case header.ICMPv4Redirect:
+		received.Redirect.Increment()
+
+	case header.ICMPv4TimeExceeded:
+		received.TimeExceeded.Increment()
+
+	case header.ICMPv4ParamProblem:
+		received.ParamProblem.Increment()
+
+	case header.ICMPv4Timestamp:
+		received.Timestamp.Increment()
+
+	case header.ICMPv4TimestampReply:
+		received.TimestampReply.Increment()
+
+	case header.ICMPv4InfoRequest:
+		received.InfoRequest.Increment()
+
+	case header.ICMPv4InfoReply:
+		received.InfoReply.Increment()
+
+	default:
+		received.Invalid.Increment()
+	}
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4.go b/pkg/tcpip/network/ipv4/ipv4.go
new file mode 100644
index 000000000..b1776e5ee
--- /dev/null
+++ b/pkg/tcpip/network/ipv4/ipv4.go
@@ -0,0 +1,594 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ipv4 contains the implementation of the ipv4 network protocol. To use
+// it in the networking stack, this package must be added to the project, and
+// activated on the stack by passing ipv4.NewProtocol() as one of the network
+// protocols when calling stack.New(). Then endpoints can be created by passing
+// ipv4.ProtocolNumber as the network protocol number when calling
+// Stack.NewEndpoint().
+package ipv4
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
+	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	// ProtocolNumber is the ipv4 protocol number.
+	ProtocolNumber = header.IPv4ProtocolNumber
+
+	// MaxTotalSize is the maximum size that can be encoded in the 16-bit
+	// TotalLength field of the ipv4 header.
+	MaxTotalSize = 0xffff
+
+	// DefaultTTL is the default time-to-live a new protocol starts with.
+	DefaultTTL = 64
+
+	// buckets is the number of IP ID counters that routes are hashed onto.
+	buckets = 2048
+)
+
+type endpoint struct {
+	nicID         tcpip.NICID                  // ID of the owning NIC; see NICID.
+	id            stack.NetworkEndpointID      // holds the endpoint's local address; see ID.
+	prefixLen     int                          // subnet prefix length in bits; see PrefixLen.
+	linkEP        stack.LinkEndpoint           // link endpoint that outbound packets are written to.
+	dispatcher    stack.TransportDispatcher    // receives inbound packets from HandlePacket.
+	fragmentation *fragmentation.Fragmentation // reassembles inbound fragments.
+	protocol      *protocol                    // owning protocol: IP ID counters, hash IV, default TTL.
+	stack         *stack.Stack                 // used for iptables and NIC/endpoint lookups.
+}
+
+// NewEndpoint returns an ipv4 endpoint bound to addrWithPrefix on NIC nicID.
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
+	reassembler := fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout)
+	ep := &endpoint{
+		nicID:         nicID,
+		id:            stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen:     addrWithPrefix.PrefixLen,
+		linkEP:        linkEP,
+		dispatcher:    dispatcher,
+		fragmentation: reassembler,
+		protocol:      p,
+		stack:         st,
+	}
+	return ep, nil
+}
+
+// DefaultTTL returns the owning protocol's current default time-to-live.
+func (e *endpoint) DefaultTTL() uint8 {
+	return e.protocol.DefaultTTL()
+}
+
+// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU,
+// capped at MaxTotalSize, minus the minimum IPv4 header size.
+func (e *endpoint) MTU() uint32 {
+	return calculateMTU(e.linkEP.MTU())
+}
+
+// Capabilities implements stack.NetworkEndpoint.Capabilities via the link endpoint.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.linkEP.Capabilities()
+}
+
+// NICID returns the ID of the NIC this endpoint was created on.
+func (e *endpoint) NICID() tcpip.NICID {
+	return e.nicID
+}
+
+// ID returns a pointer to the endpoint's own ID, which holds its local address.
+func (e *endpoint) ID() *stack.NetworkEndpointID {
+	return &e.id
+}
+
+// PrefixLen returns the subnet prefix length, in bits, of the endpoint's address.
+func (e *endpoint) PrefixLen() int {
+	return e.prefixLen
+}
+
+// MaxHeaderLength returns the link endpoint's maximum header length plus the
+// minimum IPv4 header size (room for IP options is not included).
+func (e *endpoint) MaxHeaderLength() uint16 {
+	return e.linkEP.MaxHeaderLength() + header.IPv4MinimumSize
+}
+
+// GSOMaxSize returns the link's maximum GSO packet size, or 0 without GSO.
+func (e *endpoint) GSOMaxSize() uint32 {
+	if gsoEP, ok := e.linkEP.(stack.GSOEndpoint); ok {
+		return gsoEP.GSOMaxSize()
+	}
+	return 0
+}
+
+// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
+func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return e.protocol.Number() // always ProtocolNumber
+}
+
+// writePacketFragments splits pkt into fragments of at most mtu bytes (IP
+// header and options included) and hands each to e.linkEP.WritePacket,
+// returning the first error. It assumes that the IP header is entirely in
+// pkt.Header (which may hold more than the IP header) and that the packet's
+// stated length matches the length of the header+payload. This does not
+// support the DontFragment IP flag.
+func (e *endpoint) writePacketFragments(r *stack.Route, gso *stack.GSO, mtu int, pkt *stack.PacketBuffer) *tcpip.Error {
+	// This packet is too big, it needs to be fragmented.
+	ip := header.IPv4(pkt.Header.View())
+	flags := ip.Flags()
+
+	// Update mtu to take into account the header, which will exist in all
+	// fragments anyway.
+	innerMTU := mtu - int(ip.HeaderLength())
+
+	// Round the MTU down to align to 8 bytes. Then calculate the number of
+	// fragments. Calculate fragment sizes as in RFC791.
+	innerMTU &^= 7
+	n := (int(ip.PayloadLength()) + innerMTU - 1) / innerMTU
+
+	outerMTU := innerMTU + int(ip.HeaderLength())
+	offset := ip.FragmentOffset()
+	originalAvailableLength := pkt.Header.AvailableLength() // headroom reproduced in every later fragment
+	for i := 0; i < n; i++ {
+		// Where possible, the first fragment that is sent has the same
+		// pkt.Header.UsedLength() as the input packet. The link-layer
+		// endpoint may depend on this for looking at, eg, L4 headers.
+		h := ip
+		if i > 0 {
+			pkt.Header = buffer.NewPrependable(int(ip.HeaderLength()) + originalAvailableLength)
+			h = header.IPv4(pkt.Header.Prepend(int(ip.HeaderLength())))
+			copy(h, ip[:ip.HeaderLength()])
+		}
+		if i != n-1 {
+			h.SetTotalLength(uint16(outerMTU))
+			h.SetFlagsFragmentOffset(flags|header.IPv4FlagMoreFragments, offset)
+		} else {
+			h.SetTotalLength(uint16(h.HeaderLength()) + uint16(pkt.Data.Size()))
+			h.SetFlagsFragmentOffset(flags, offset)
+		}
+		h.SetChecksum(0)
+		h.SetChecksum(^h.CalculateChecksum())
+		offset += uint16(innerMTU) // the next fragment starts innerMTU bytes further on
+		if i > 0 {
+			newPayload := pkt.Data.Clone(nil)
+			newPayload.CapLength(innerMTU)
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
+				Header:        pkt.Header,
+				Data:          newPayload,
+				NetworkHeader: buffer.View(h),
+			}); err != nil {
+				return err
+			}
+			r.Stats().IP.PacketsSent.Increment()
+			pkt.Data.TrimFront(newPayload.Size()) // drop what was just sent
+			continue
+		}
+		// Special handling for the first fragment because it comes
+		// from the header.
+		if outerMTU >= pkt.Header.UsedLength() {
+			// This fragment can fit all of pkt.Header and possibly
+			// some of pkt.Data, too.
+			newPayload := pkt.Data.Clone(nil)
+			newPayloadLength := outerMTU - pkt.Header.UsedLength()
+			newPayload.CapLength(newPayloadLength)
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
+				Header:        pkt.Header,
+				Data:          newPayload,
+				NetworkHeader: buffer.View(h),
+			}); err != nil {
+				return err
+			}
+			r.Stats().IP.PacketsSent.Increment()
+			pkt.Data.TrimFront(newPayloadLength)
+		} else {
+			// The fragment is too small to fit all of pkt.Header.
+			startOfHdr := pkt.Header
+			startOfHdr.TrimBack(pkt.Header.UsedLength() - outerMTU)
+			emptyVV := buffer.NewVectorisedView(0, []buffer.View{})
+			if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, &stack.PacketBuffer{
+				Header:        startOfHdr,
+				Data:          emptyVV,
+				NetworkHeader: buffer.View(h),
+			}); err != nil {
+				return err
+			}
+			r.Stats().IP.PacketsSent.Increment()
+			// Add the unused bytes of pkt.Header into the pkt.Data
+			// that remains to be sent.
+			restOfHdr := pkt.Header.View()[outerMTU:]
+			tmp := buffer.NewVectorisedView(len(restOfHdr), []buffer.View{buffer.NewViewFromBytes(restOfHdr)})
+			tmp.Append(pkt.Data)
+			pkt.Data = tmp
+		}
+	}
+	return nil
+}
+
+// addIPHeader prepends a minimum-size IPv4 header to hdr and returns it.
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv4 {
+	ipHdr := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+	// RFC 6864 section 4.3 mandates uniqueness of ID values for non-atomic
+	// datagrams. Since the DF bit is never being set here, all datagrams
+	// are non-atomic and need an ID.
+	bucket := hashRoute(r, params.Protocol, e.protocol.hashIV) % buckets
+	ipHdr.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TotalLength: uint16(hdr.UsedLength() + payloadSize),
+		ID:          uint16(atomic.AddUint32(&e.protocol.ids[bucket], 1)),
+		TTL:         params.TTL,
+		TOS:         params.TOS,
+		Protocol:    uint8(params.Protocol),
+		SrcAddr:     r.LocalAddress,
+		DstAddr:     r.RemoteAddress,
+	})
+	ipHdr.SetChecksum(^ipHdr.CalculateChecksum())
+	return ipHdr
+}
+
+// WritePacket prepends an IPv4 header to pkt and sends it over route r.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
+	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
+	pkt.NetworkHeader = buffer.View(ip)
+
+	nicName := e.stack.FindNICNameFromID(e.NICID())
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	ipt := e.stack.IPTables()
+	if ok := ipt.Check(stack.Output, pkt, gso, r, "", nicName); !ok {
+		// iptables is telling us to drop the packet.
+		return nil
+	}
+
+	// If the packet is manipulated as per NAT Output rules, handle packet
+	// based on destination address and do not send the packet to link layer.
+	// TODO(gvisor.dev/issue/170): We should do this for every packet, rather than
+	// only NATted packets, but removing this check short circuits broadcasts
+	// before they are sent out to other hosts.
+	if pkt.NatDone {
+		netHeader := header.IPv4(pkt.NetworkHeader)
+		ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, netHeader.DestinationAddress())
+		if err == nil {
+			route := r.ReverseRoute(netHeader.SourceAddress(), netHeader.DestinationAddress())
+			ep.HandlePacket(&route, pkt)
+			return nil
+		}
+	}
+
+	if r.Loop&stack.PacketLoop != 0 {
+		loopedR := r.MakeLoopedRoute()
+		e.HandlePacket(&loopedR, pkt)
+		loopedR.Release()
+	}
+	if r.Loop&stack.PacketOut == 0 {
+		return nil
+	}
+	if pkt.Header.UsedLength()+pkt.Data.Size() > int(e.linkEP.MTU()) && (gso == nil || gso.Type == stack.GSONone) {
+		return e.writePacketFragments(r, gso, int(e.linkEP.MTU()), pkt) // counts PacketsSent per fragment itself
+	}
+	if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+		return err
+	}
+	r.Stats().IP.PacketsSent.Increment()
+	return nil
+}
+
+// WritePackets implements stack.NetworkEndpoint.WritePackets. It returns how
+// many packets were accepted, stopping at the first link-layer error.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+	if r.Loop&stack.PacketLoop != 0 {
+		panic("multiple packets in local loop")
+	}
+	if r.Loop&stack.PacketOut == 0 {
+		return pkts.Len(), nil
+	}
+
+	// Every packet gets its IPv4 header before iptables sees the batch.
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		pkt.NetworkHeader = buffer.View(e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params))
+	}
+
+	// iptables filtering. All packets that reach here are locally
+	// generated.
+	nicName := e.stack.FindNICNameFromID(e.NICID())
+	ipt := e.stack.IPTables()
+	dropped, natPkts := ipt.CheckPackets(stack.Output, pkts, gso, r, nicName)
+	if len(dropped) == 0 && len(natPkts) == 0 {
+		// Fast path: nothing was dropped or NATted, so the whole batch
+		// can go through the faster WritePackets API directly.
+		sent, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+		r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+		return sent, err
+	}
+
+	// Slow path: some packets need special treatment, so degrade to
+	// emitting one packet at a time.
+	sent := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if _, isDropped := dropped[pkt]; isDropped {
+			continue
+		}
+		if _, isNAT := natPkts[pkt]; isNAT {
+			// A NATted packet goes to the network endpoint found for its
+			// destination address, if there is one, instead of the link.
+			ipHdr := header.IPv4(pkt.NetworkHeader)
+			if ep, err := e.stack.FindNetworkEndpoint(header.IPv4ProtocolNumber, ipHdr.DestinationAddress()); err == nil {
+				route := r.ReverseRoute(ipHdr.SourceAddress(), ipHdr.DestinationAddress())
+				ep.HandlePacket(&route, pkt)
+				sent++
+				continue
+			}
+		}
+		if err := e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt); err != nil {
+			r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+			return sent, err
+		}
+		sent++
+	}
+	r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+	return sent, nil
+}
+
+// WriteHeaderIncludedPacket writes pkt, whose Data already starts with an IPv4
+// header, through route r after fixing up that header in place.
+func (e *endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
+	// The packet already has an IP header, but there are a few required
+	// checks.
+	h, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return tcpip.ErrInvalidOptionValue
+	}
+	ip := header.IPv4(h)
+	if !ip.IsValid(pkt.Data.Size()) {
+		return tcpip.ErrInvalidOptionValue
+	}
+
+	// Always set the total length.
+	ip.SetTotalLength(uint16(pkt.Data.Size()))
+
+	// Set the source address when zero.
+	if ip.SourceAddress() == tcpip.Address(([]byte{0, 0, 0, 0})) {
+		ip.SetSourceAddress(r.LocalAddress)
+	}
+
+	// Set the destination. If the packet already included a destination,
+	// it will be part of the route.
+	ip.SetDestinationAddress(r.RemoteAddress)
+
+	// Set the packet ID when zero.
+	if ip.ID() == 0 {
+		// RFC 6864 section 4.3 mandates uniqueness of ID values for
+		// non-atomic datagrams, so assign an ID to all such datagrams
+		// according to the definition given in RFC 6864 section 4.
+		if ip.Flags()&header.IPv4FlagDontFragment == 0 || ip.Flags()&header.IPv4FlagMoreFragments != 0 || ip.FragmentOffset() > 0 {
+			ip.SetID(uint16(atomic.AddUint32(&e.protocol.ids[hashRoute(r, 0 /* protocol */, e.protocol.hashIV)%buckets], 1)))
+		}
+	}
+
+	// Always set the checksum.
+	ip.SetChecksum(0)
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	if r.Loop&stack.PacketLoop != 0 {
+		e.HandlePacket(r, pkt.Clone())
+	}
+	if r.Loop&stack.PacketOut == 0 {
+		return nil
+	}
+
+	r.Stats().IP.PacketsSent.Increment() // counted before the link-layer write is attempted
+
+	ip = ip[:ip.HeaderLength()] // NOTE(review): h was pulled up with only IPv4MinimumSize bytes; confirm this reslice is safe when options are present.
+	pkt.Header = buffer.NewPrependableFromView(buffer.View(ip))
+	pkt.Data.TrimFront(int(ip.HeaderLength()))
+	return e.linkEP.WritePacket(r, nil /* gso */, ProtocolNumber, pkt)
+}
+
+// HandlePacket is called by the link layer when new ipv4 packets arrive for
+// this endpoint. Invalid, filtered and not-yet-reassembled packets stop here.
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.IPv4(pkt.NetworkHeader)
+	if !h.IsValid(pkt.Data.Size() + len(pkt.NetworkHeader) + len(pkt.TransportHeader)) {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
+		return
+	}
+
+	// iptables filtering. All packets that reach here are intended for
+	// this machine and will not be forwarded.
+	ipt := e.stack.IPTables()
+	if ok := ipt.Check(stack.Input, pkt, nil, nil, "", ""); !ok {
+		// iptables is telling us to drop the packet.
+		return
+	}
+
+	if h.More() || h.FragmentOffset() != 0 {
+		if pkt.Data.Size()+len(pkt.TransportHeader) == 0 {
+			// Drop the packet as it's marked as a fragment but has
+			// no payload.
+			r.Stats().IP.MalformedPacketsReceived.Increment()
+			r.Stats().IP.MalformedFragmentsReceived.Increment()
+			return
+		}
+		// The packet is a fragment, let's try to reassemble it.
+		last := h.FragmentOffset() + uint16(pkt.Data.Size()) - 1
+		// Drop the packet if the fragmentOffset is incorrect. i.e the
+		// combination of fragmentOffset and pkt.Data.Size() causes a
+		// wrap around resulting in last being less than the offset.
+		if last < h.FragmentOffset() {
+			r.Stats().IP.MalformedPacketsReceived.Increment()
+			r.Stats().IP.MalformedFragmentsReceived.Increment()
+			return
+		}
+		var ready bool
+		var err error
+		pkt.Data, ready, err = e.fragmentation.Process(hash.IPv4FragmentHash(h), h.FragmentOffset(), last, h.More(), pkt.Data)
+		if err != nil {
+			r.Stats().IP.MalformedPacketsReceived.Increment()
+			r.Stats().IP.MalformedFragmentsReceived.Increment()
+			return
+		}
+		if !ready {
+			return // not ready for delivery yet
+		}
+	}
+	p := h.TransportProtocol()
+	if p == header.ICMPv4ProtocolNumber {
+		pkt.NetworkHeader.CapLength(int(h.HeaderLength()))
+		e.handleICMP(r, pkt) // ICMP is handled here and skips the PacketsDelivered count below
+		return
+	}
+	r.Stats().IP.PacketsDelivered.Increment()
+	e.dispatcher.DeliverTransportPacket(r, p, pkt)
+}
+
+// Close is the endpoint's cleanup hook; it currently does nothing.
+func (e *endpoint) Close() {}
+
+type protocol struct {
+	ids    []uint32 // IP ID counters, one per bucket, incremented atomically
+	hashIV uint32   // random value mixed into hashRoute when picking a bucket
+
+	// defaultTTL is the current default TTL for the protocol. Only the
+	// uint8 portion of it is meaningful and it must be accessed
+	// atomically.
+	defaultTTL uint32
+}
+
+// Number returns ProtocolNumber, the ipv4 network protocol number.
+func (p *protocol) Number() tcpip.NetworkProtocolNumber {
+	return ProtocolNumber
+}
+
+// MinimumPacketSize returns the minimum valid ipv4 packet size: a bare header.
+func (p *protocol) MinimumPacketSize() int {
+	return header.IPv4MinimumSize
+}
+
+// DefaultPrefixLen returns the default IPv4 prefix length: every address bit.
+func (p *protocol) DefaultPrefixLen() int {
+	return header.IPv4AddressSize * 8
+}
+
+// ParseAddresses implements NetworkProtocol.ParseAddresses for an IPv4 header.
+func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	ipHdr := header.IPv4(v)
+	return ipHdr.SourceAddress(), ipHdr.DestinationAddress()
+}
+
+// SetOption implements NetworkProtocol.SetOption.
+func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case tcpip.DefaultTTLOption:
+		p.SetDefaultTTL(uint8(v))
+		return nil
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// Option implements NetworkProtocol.Option.
+func (p *protocol) Option(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case *tcpip.DefaultTTLOption:
+		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
+		return nil
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// SetDefaultTTL atomically sets the default TTL used by this protocol's endpoints.
+func (p *protocol) SetDefaultTTL(ttl uint8) {
+	atomic.StoreUint32(&p.defaultTTL, uint32(ttl))
+}
+
+// DefaultTTL atomically loads the default TTL used by this protocol's endpoints.
+func (p *protocol) DefaultTTL() uint8 {
+	return uint8(atomic.LoadUint32(&p.defaultTTL))
+}
+
+// Close implements stack.NetworkProtocol.Close. It is a no-op.
+func (*protocol) Close() {}
+
+// Wait implements stack.NetworkProtocol.Wait. It is a no-op.
+func (*protocol) Wait() {}
+
+// Parse implements stack.NetworkProtocol.Parse. It moves the IPv4 header
+// (options included, when pkt.Data holds them) from pkt.Data into
+// pkt.NetworkHeader and caps pkt.Data to the header's stated total length.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.IPv4MinimumSize)
+	if !ok {
+		return 0, false, false
+	}
+	ipHdr := header.IPv4(hdr)
+
+	// If there are options, pull those into hdr as well.
+	if hdrLen := int(ipHdr.HeaderLength()); hdrLen > header.IPv4MinimumSize && hdrLen <= pkt.Data.Size() {
+		hdr, ok = pkt.Data.PullUp(hdrLen)
+		if !ok {
+			panic(fmt.Sprintf("There are only %d bytes in pkt.Data, but there should be at least %d", pkt.Data.Size(), hdrLen))
+		}
+		ipHdr = header.IPv4(hdr)
+	}
+
+	// If this is a fragment, the caller is told not to bother parsing the
+	// transport header.
+	isFragment := ipHdr.More() || ipHdr.FragmentOffset() != 0
+
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	pkt.Data.CapLength(int(ipHdr.TotalLength()) - len(hdr))
+	return ipHdr.TransportProtocol(), !isFragment, true
+}
+
+// calculateMTU returns the network-layer payload MTU for link-layer payload mtu:
+// mtu capped at MaxTotalSize, less the minimum IPv4 header size.
+func calculateMTU(mtu uint32) uint32 {
+	if mtu > MaxTotalSize {
+		mtu = MaxTotalSize
+	}
+	return mtu - header.IPv4MinimumSize // NOTE(review): wraps around (uint32) when mtu < header.IPv4MinimumSize, e.g. an ICMP-reported MTU of 0.
+}
+
+// hashRoute hashes route r for IP ID bucket selection. The inputs are the
+// source & destination addresses, the transport protocol number, and a random
+// initial value (generated once on initialization).
+func hashRoute(r *stack.Route, protocol tcpip.TransportProtocolNumber, hashIV uint32) uint32 {
+	// Pack the first four bytes of each address into a word, low byte first.
+	src, dst := r.LocalAddress, r.RemoteAddress
+	srcWord := uint32(src[0]) | uint32(src[1])<<8 | uint32(src[2])<<16 | uint32(src[3])<<24
+	dstWord := uint32(dst[0]) | uint32(dst[1])<<8 | uint32(dst[2])<<16 | uint32(dst[3])<<24
+	return hash.Hash3Words(srcWord, dstWord, uint32(protocol), hashIV)
+}
+
+// NewProtocol returns an IPv4 network protocol.
+func NewProtocol() stack.NetworkProtocol {
+	// Draw all the randomness at once: one word per ID bucket, plus a
+	// final word used as the route hash's initial value.
+	random := hash.RandN32(1 + buckets)
+	p := &protocol{
+		ids:        make([]uint32, buckets),
+		hashIV:     random[buckets],
+		defaultTTL: DefaultTTL,
+	}
+	copy(p.ids, random)
+	return p
+}
diff --git a/pkg/tcpip/network/ipv4/ipv4_test.go b/pkg/tcpip/network/ipv4/ipv4_test.go
new file mode 100644
index 000000000..11e579c4b
--- /dev/null
+++ b/pkg/tcpip/network/ipv4/ipv4_test.go
@@ -0,0 +1,745 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv4_test
+
+import (
+	"bytes"
+	"encoding/hex"
+	"math/rand"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func TestExcludeBroadcast(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+	})
+
+	const defaultMTU = 65536
+	ep := stack.LinkEndpoint(channel.New(256, defaultMTU, ""))
+	if testing.Verbose() {
+		ep = sniffer.New(ep)
+	}
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+
+	s.SetRouteTable([]tcpip.Route{{
+		Destination: header.IPv4EmptySubnet,
+		NIC:         1,
+	}})
+
+	randomAddr := tcpip.FullAddress{NIC: 1, Addr: "\x0a\x00\x00\x01", Port: 53}
+
+	var wq waiter.Queue
+	t.Run("WithoutPrimaryAddress", func(t *testing.T) { // no address has been added to NIC 1 yet
+		ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer ep.Close()
+
+		// Cannot connect using a broadcast address as the source.
+		if err := ep.Connect(randomAddr); err != tcpip.ErrNoRoute {
+			t.Errorf("got ep.Connect(...) = %v, want = %v", err, tcpip.ErrNoRoute)
+		}
+
+		// However, we can bind to a broadcast address to listen.
+		if err := ep.Bind(tcpip.FullAddress{Addr: header.IPv4Broadcast, Port: 53, NIC: 1}); err != nil {
+			t.Errorf("Bind failed: %v", err)
+		}
+	})
+
+	t.Run("WithPrimaryAddress", func(t *testing.T) { // reuses the stack s set up above
+		ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+		if err != nil {
+			t.Fatal(err)
+		}
+		defer ep.Close()
+
+		// Add a valid primary endpoint address, now we can connect.
+		if err := s.AddAddress(1, ipv4.ProtocolNumber, "\x0a\x00\x00\x02"); err != nil {
+			t.Fatalf("AddAddress failed: %v", err)
+		}
+		if err := ep.Connect(randomAddr); err != nil {
+			t.Errorf("Connect failed: %v", err)
+		}
+	})
+}
+
+// makeHdrAndPayload generates a randomized packet. hdrLength is how many bytes
+// are already used in the header before WritePacket; extraLength is how much
+// spare room the header keeps for prepending. The payload is built from one
+// random View per entry of viewSizes, in order.
+func makeHdrAndPayload(hdrLength int, extraLength int, viewSizes []int) (buffer.Prependable, buffer.VectorisedView) {
+	hdr := buffer.NewPrependable(hdrLength + extraLength)
+	hdr.Prepend(hdrLength)
+	rand.Read(hdr.View())
+
+	// Fill each view with random bytes and track the combined size.
+	var views []buffer.View
+	total := 0
+	for _, size := range viewSizes {
+		v := buffer.NewView(size)
+		rand.Read(v)
+		views = append(views, v)
+		total += size
+	}
+	return hdr, buffer.NewVectorisedView(total, views)
+}
+
+// compareFragments checks the fragments in packets against the source packet:
+// each header must be valid and the payloads must reassemble to the original.
+func compareFragments(t *testing.T, packets []*stack.PacketBuffer, sourcePacketInfo *stack.PacketBuffer, mtu uint32) {
+	t.Helper()
+	// Make a complete array of the sourcePacketInfo packet.
+	source := header.IPv4(packets[0].Header.View()[:header.IPv4MinimumSize])
+	source = append(source, sourcePacketInfo.Header.View()...)
+	source = append(source, sourcePacketInfo.Data.ToView()...)
+
+	// Make a copy of the IP header, which will be modified in some fields to make
+	// an expected header.
+	sourceCopy := header.IPv4(append(buffer.View(nil), source[:source.HeaderLength()]...))
+	sourceCopy.SetChecksum(0)
+	sourceCopy.SetFlagsFragmentOffset(0, 0)
+	sourceCopy.SetTotalLength(0)
+	var offset uint16
+	// Build up an array of the bytes sent.
+	var reassembledPayload []byte
+	for i, packet := range packets {
+		// Confirm that the packet is valid.
+		allBytes := packet.Header.View().ToVectorisedView()
+		allBytes.Append(packet.Data)
+		ip := header.IPv4(allBytes.ToView())
+		if !ip.IsValid(len(ip)) {
+			t.Errorf("IP packet is invalid:\n%s", hex.Dump(ip))
+		}
+		if got, want := ip.CalculateChecksum(), uint16(0xffff); got != want {
+			t.Errorf("ip.CalculateChecksum() got %#x, want %#x", got, want)
+		}
+		if got, want := len(ip), int(mtu); got > want {
+			t.Errorf("fragment is too large, got %d want %d", got, want)
+		}
+		if got, want := packet.Header.UsedLength(), sourcePacketInfo.Header.UsedLength()+header.IPv4MinimumSize; i == 0 && want < int(mtu) && got != want {
+			t.Errorf("first fragment hdr parts should have unmodified length if possible: got %d, want %d", got, want)
+		}
+		if got, want := packet.Header.AvailableLength(), sourcePacketInfo.Header.AvailableLength()-header.IPv4MinimumSize; got != want {
+			t.Errorf("fragment #%d should have the same available space for prepending as source: got %d, want %d", i, got, want)
+		}
+		if i < len(packets)-1 {
+			sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()|header.IPv4FlagMoreFragments, offset)
+		} else {
+			sourceCopy.SetFlagsFragmentOffset(sourceCopy.Flags()&^header.IPv4FlagMoreFragments, offset)
+		}
+		reassembledPayload = append(reassembledPayload, ip.Payload()...)
+		offset += ip.TotalLength() - uint16(ip.HeaderLength())
+		// Give the expected header this fragment's length and a fresh checksum
+		// so that the two headers can be compared byte for byte.
+		sourceCopy.SetTotalLength(uint16(len(ip)))
+		sourceCopy.SetChecksum(0)
+		sourceCopy.SetChecksum(^sourceCopy.CalculateChecksum())
+		if !bytes.Equal(ip[:ip.HeaderLength()], sourceCopy[:sourceCopy.HeaderLength()]) {
+			t.Errorf("ip[:ip.HeaderLength()] got:\n%s\nwant:\n%s", hex.Dump(ip[:ip.HeaderLength()]), hex.Dump(sourceCopy[:sourceCopy.HeaderLength()]))
+		}
+	}
+	expected := source[source.HeaderLength():]
+	if !bytes.Equal(reassembledPayload, expected) {
+		t.Errorf("reassembledPayload got:\n%s\nwant:\n%s", hex.Dump(reassembledPayload), hex.Dump(expected))
+	}
+}
+
+type errorChannel struct {
+	*channel.Endpoint                              // embedded channel endpoint created by newErrorChannel
+	Ch                    chan *stack.PacketBuffer // packets handed to WritePacket
+	packetCollectorErrors []*tcpip.Error           // errors WritePacket has yet to return, in order
+}
+
+// newErrorChannel creates a new errorChannel endpoint whose Ch buffers up to
+// size packets. Each call to WritePacket returns the next error from
+// packetCollectorErrors until the list is empty, and nil from then on.
+func newErrorChannel(size int, mtu uint32, linkAddr tcpip.LinkAddress, packetCollectorErrors []*tcpip.Error) *errorChannel {
+	ec := new(errorChannel)
+	ec.Endpoint = channel.New(size, mtu, linkAddr)
+	ec.Ch = make(chan *stack.PacketBuffer, size)
+	ec.packetCollectorErrors = packetCollectorErrors
+	return ec
+}
+
+// Drain removes all outbound packets from the channel and returns how many
+// were removed. It never blocks.
+func (e *errorChannel) Drain() int {
+	// The loop counter doubles as the result: it advances once per receive.
+	for drained := 0; ; drained++ {
+		select {
+		case <-e.Ch:
+		default:
+			return drained
+		}
+	}
+}
+
+// WritePacket queues pkt on e.Ch (silently dropping it when the channel is
+// full) and returns the next scripted error, or nil once they have run out.
+func (e *errorChannel) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+	select {
+	case e.Ch <- pkt:
+	default:
+	}
+	if len(e.packetCollectorErrors) == 0 {
+		return nil
+	}
+	next := e.packetCollectorErrors[0]
+	e.packetCollectorErrors = e.packetCollectorErrors[1:]
+	return next
+}
+
+type context struct {
+	stack.Route          // route returned by buildContext; embedded
+	linkEP *errorChannel // link endpoint behind the NIC that buildContext creates
+}
+
+// buildContext creates a stack with one NIC (ID 1) backed by an errorChannel
+// of the given mtu, and returns a route from src to dst through that NIC.
+func buildContext(t *testing.T, packetCollectorErrors []*tcpip.Error, mtu uint32) context {
+	t.Helper()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+	})
+	ep := newErrorChannel(100 /* Enough for all tests. */, mtu, "", packetCollectorErrors)
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+	const src, dst = "\x10\x00\x00\x01", "\x10\x00\x00\x02"
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, src); err != nil {
+		t.Fatalf("AddAddress failed: %v", err)
+	}
+	{
+		subnet, err := tcpip.NewSubnet(dst, tcpip.AddressMask(header.IPv4Broadcast))
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 1}})
+	}
+	r, err := s.FindRoute(0, src, dst, ipv4.ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("s.FindRoute got %v, want %v", err, nil)
+	}
+	return context{
+		Route:  r,
+		linkEP: ep,
+	}
+}
+
+// TestFragmentation checks how WritePacket splits packets for a range of MTUs.
+func TestFragmentation(t *testing.T) {
+	var manyPayloadViewsSizes [1000]int
+	for i := range manyPayloadViewsSizes {
+		manyPayloadViewsSizes[i] = 7
+	}
+	testCases := []struct {
+		description       string
+		mtu               uint32
+		gso               *stack.GSO
+		hdrLength         int
+		extraLength       int
+		payloadViewsSizes []int
+		expectedFrags     int
+	}{
+		{"NoFragmentation", 2000, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 1},
+		{"NoFragmentationWithBigHeader", 2000, &stack.GSO{}, 16, header.IPv4MinimumSize, []int{1000}, 1},
+		{"Fragmented", 800, &stack.GSO{}, 0, header.IPv4MinimumSize, []int{1000}, 2},
+		{"FragmentedWithGsoNil", 800, nil, 0, header.IPv4MinimumSize, []int{1000}, 2},
+		{"FragmentedWithManyViews", 300, &stack.GSO{}, 0, header.IPv4MinimumSize, manyPayloadViewsSizes[:], 25},
+		{"FragmentedWithManyViewsAndPrependableBytes", 300, &stack.GSO{}, 0, header.IPv4MinimumSize + 55, manyPayloadViewsSizes[:], 25},
+		{"FragmentedWithBigHeader", 800, &stack.GSO{}, 20, header.IPv4MinimumSize, []int{1000}, 2},
+		{"FragmentedWithBigHeaderAndPrependableBytes", 800, &stack.GSO{}, 20, header.IPv4MinimumSize + 66, []int{1000}, 2},
+		{"FragmentedWithMTUSmallerThanHeaderAndPrependableBytes", 300, &stack.GSO{}, 1000, header.IPv4MinimumSize + 77, []int{500}, 6},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			hdr, payload := makeHdrAndPayload(tc.hdrLength, tc.extraLength, tc.payloadViewsSizes)
+			source := &stack.PacketBuffer{
+				Header: hdr,
+				// Keep a copy of the payload: WritePacket modifies the one it is given.
+				Data: payload.Clone(nil),
+			}
+			c := buildContext(t, nil, tc.mtu)
+			err := c.Route.WritePacket(tc.gso, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      42,
+				TOS:      stack.DefaultTOS,
+			}, &stack.PacketBuffer{
+				Header: hdr,
+				Data:   payload,
+			})
+			if err != nil {
+				t.Errorf("err got %v, want %v", err, nil)
+			}
+
+			var frags []*stack.PacketBuffer
+			for drained := false; !drained; {
+				select {
+				case frag := <-c.linkEP.Ch:
+					frags = append(frags, frag)
+				default:
+					drained = true
+				}
+			}
+
+			if got, want := len(frags), tc.expectedFrags; got != want {
+				t.Errorf("len(result) got %d, want %d", got, want)
+			}
+			if got, want := len(frags), int(c.Route.Stats().IP.PacketsSent.Value()); got != want {
+				t.Errorf("no errors yet len(result) got %d, want %d", got, want)
+			}
+			compareFragments(t, frags, source, tc.mtu)
+		})
+	}
+}
+
+// TestFragmentationErrors verifies that WritePacket surfaces the last of the
+// errors handed to buildContext.
+func TestFragmentationErrors(t *testing.T) {
+	cases := []struct {
+		name          string
+		mtu           uint32
+		hdrLen        int
+		viewSizes     []int
+		collectorErrs []*tcpip.Error
+	}{
+		{"NoFrag", 2000, 0, []int{1000}, []*tcpip.Error{tcpip.ErrAborted}},
+		{"ErrorOnFirstFrag", 500, 0, []int{1000}, []*tcpip.Error{tcpip.ErrAborted}},
+		{"ErrorOnSecondFrag", 500, 0, []int{1000}, []*tcpip.Error{nil, tcpip.ErrAborted}},
+		{"ErrorOnFirstFragMTUSmallerThanHdr", 500, 1000, []int{500}, []*tcpip.Error{tcpip.ErrAborted}},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			hdr, payload := makeHdrAndPayload(tc.hdrLen, header.IPv4MinimumSize, tc.viewSizes)
+			c := buildContext(t, tc.collectorErrs, tc.mtu)
+			writeErr := c.Route.WritePacket(&stack.GSO{}, stack.NetworkHeaderParams{
+				Protocol: tcp.ProtocolNumber,
+				TTL:      42,
+				TOS:      stack.DefaultTOS,
+			}, &stack.PacketBuffer{
+				Header: hdr,
+				Data:   payload,
+			})
+			errs, last := tc.collectorErrs, len(tc.collectorErrs)-1
+			for i := 0; i < last; i++ {
+				if got, want := errs[i], (*tcpip.Error)(nil); got != want {
+					t.Errorf("ft.packetCollectorErrors[%d] got %v, want %v", i, got, want)
+				}
+			}
+			// Every earlier entry is nil, so only the final error is compared.
+			if got, want := writeErr, errs[last]; got != want {
+				t.Errorf("err got %v, want %v", got, want)
+			}
+			if got, want := c.linkEP.Drain(), int(c.Route.Stats().IP.PacketsSent.Value())+1; writeErr != nil && got != want {
+				t.Errorf("after linkEP error len(result) got %d, want %d", got, want)
+			}
+		})
+	}
+}
+
+// TestInvalidFragments checks the malformed packet and fragment counters.
+func TestInvalidFragments(t *testing.T) {
+	// The first two cases have both IHL and TotalLength set to 0.
+	testCases := []struct {
+		name                   string
+		packets                [][]byte
+		wantMalformedIPPackets uint64
+		wantMalformedFragments uint64
+	}{
+		{
+			"ihl_totallen_zero_valid_frag_offset",
+			[][]byte{
+				{0x40, 0x30, 0x00, 0x00, 0x6c, 0x74, 0x7d, 0x30, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			},
+			1,
+			0,
+		},
+		{
+			"ihl_totallen_zero_invalid_frag_offset",
+			[][]byte{
+				{0x40, 0x30, 0x00, 0x00, 0x6c, 0x74, 0x20, 0x00, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			},
+			1,
+			0,
+		},
+		{
+			// Total Length of 37 (20 bytes IP header + 17 bytes of payload).
+			// Frag Offset of 0x1ffe = 8190*8 = 65520, which puts the end of
+			// the fragment past 65535.
+			"ihl_totallen_valid_invalid_frag_offset_1",
+			[][]byte{
+				{0x45, 0x30, 0x00, 0x25, 0x6c, 0x74, 0x1f, 0xfe, 0x30, 0x30, 0x30, 0x30, 0x39, 0x32, 0x39, 0x33, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			},
+			1,
+			1,
+		},
+		// The following 3 tests were found by running a fuzzer and were
+		// triggering a panic in the IPv4 reassembler code.
+		{
+			"ihl_less_than_ipv4_minimum_size_1",
+			[][]byte{
+				{0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0x0, 0xf3, 0x30, 0x1, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+				{0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x1, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			},
+			2,
+			0,
+		},
+		{
+			"ihl_less_than_ipv4_minimum_size_2",
+			[][]byte{
+				{0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0xb3, 0x12, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+				{0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			},
+			2,
+			0,
+		},
+		{
+			"ihl_less_than_ipv4_minimum_size_3",
+			[][]byte{
+				{0x42, 0x30, 0x0, 0x30, 0x30, 0x40, 0xb3, 0x30, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+				{0x42, 0x30, 0x0, 0x8, 0x30, 0x40, 0x20, 0x0, 0x30, 0x6, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			},
+			2,
+			0,
+		},
+		{
+			"fragment_with_short_total_len_extra_payload",
+			[][]byte{
+				{0x46, 0x30, 0x00, 0x30, 0x30, 0x40, 0x0e, 0x12, 0x30, 0x06, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+				{0x46, 0x30, 0x00, 0x18, 0x30, 0x40, 0x20, 0x00, 0x30, 0x06, 0x30, 0x30, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
+			},
+			1,
+			1,
+		},
+		{
+			"multiple_fragments_with_more_fragments_set_to_false",
+			[][]byte{
+				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x10, 0x00, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x00, 0x01, 0x61, 0x06, 0x34, 0x69, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+				{0x45, 0x00, 0x00, 0x1c, 0x30, 0x40, 0x20, 0x00, 0x00, 0x06, 0x34, 0x1e, 0x73, 0x73, 0x69, 0x6e, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+			},
+			1,
+			1,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			const nicID tcpip.NICID = 42
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+			})
+
+			var linkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30})
+			var remoteLinkAddr = tcpip.LinkAddress([]byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x31})
+			ep := channel.New(10, 1500, linkAddr)
+			if err := s.CreateNIC(nicID, sniffer.New(ep)); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			for _, pkt := range tc.packets {
+				ep.InjectLinkAddr(header.IPv4ProtocolNumber, remoteLinkAddr, &stack.PacketBuffer{
+					Data: buffer.NewVectorisedView(len(pkt), []buffer.View{pkt}),
+				})
+			}
+
+			if got, want := s.Stats().IP.MalformedPacketsReceived.Value(), tc.wantMalformedIPPackets; got != want {
+				t.Errorf("incorrect Stats.IP.MalformedPacketsReceived, got: %d, want: %d", got, want)
+			}
+			if got, want := s.Stats().IP.MalformedFragmentsReceived.Value(), tc.wantMalformedFragments; got != want {
+				t.Errorf("incorrect Stats.IP.MalformedFragmentsReceived, got: %d, want: %d", got, want)
+			}
+		})
+	}
+}
+
+// TestReceiveFragments feeds fragments in through the incoming packet path to
+// test reassembly.
+func TestReceiveFragments(t *testing.T) {
+	const addr1 = "\xc0\xa8\x00\x01" // 192.168.0.1
+	const addr2 = "\xc0\xa8\x00\x02" // 192.168.0.2
+	const nicID = 1
+
+	// udpGen returns a checksummed UDP datagram; payload byte i is uint8(i)*multiplier.
+	udpGen := func(payloadLen int, multiplier uint8) buffer.View {
+		payload := buffer.NewView(payloadLen)
+		for i := 0; i < len(payload); i++ {
+			payload[i] = uint8(i) * multiplier
+		}
+
+		udpLength := header.UDPMinimumSize + len(payload)
+
+		hdr := buffer.NewPrependable(udpLength)
+		u := header.UDP(hdr.Prepend(udpLength))
+		u.Encode(&header.UDPFields{
+			SrcPort: 5555,
+			DstPort: 80,
+			Length:  uint16(udpLength),
+		})
+		copy(u.Payload(), payload)
+		sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+		sum = header.Checksum(payload, sum)
+		u.SetChecksum(^u.CalculateChecksum(sum))
+		return hdr.View()
+	}
+
+	// UDP header plus a 256-byte payload of 0..255.
+	ipv4Payload1 := udpGen(256, 1)
+	udpPayload1 := ipv4Payload1[header.UDPMinimumSize:]
+	// UDP header plus a 128-byte payload of 0..254 in increments of 2.
+	ipv4Payload2 := udpGen(128, 2)
+	udpPayload2 := ipv4Payload2[header.UDPMinimumSize:]
+
+	type fragmentData struct {
+		id             uint16
+		flags          uint8
+		fragmentOffset uint16
+		payload        buffer.View
+	}
+
+	tests := []struct {
+		name             string
+		fragments        []fragmentData
+		expectedPayloads [][]byte
+	}{
+		{
+			name: "No fragmentation",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1,
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "More fragments without payload",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1,
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Non-zero fragment offset without payload",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 8,
+					payload:        ipv4Payload1,
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Second fragment has MoreFlags set",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with different IDs",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             2,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two interleaved fragmented packets",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+				{
+					id:             2,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload2[:64],
+				},
+				{
+					id:             1,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload1[64:],
+				},
+				{
+					id:             2,
+					flags:          0,
+					fragmentOffset: 64,
+					payload:        ipv4Payload2[64:],
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1, udpPayload2},
+		},
+		{
+			name: "Fragment without followup",
+			fragments: []fragmentData{
+				{
+					id:             1,
+					flags:          header.IPv4FlagMoreFragments,
+					fragmentOffset: 0,
+					payload:        ipv4Payload1[:64],
+				},
+			},
+			expectedPayloads: nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// Set up a stack, a NIC holding addr2 and a UDP endpoint.
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, tcpip.LinkAddress("\xf0\x00"))
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, header.IPv4ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv4ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, header.IPv4ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, header.IPv4ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			// Wrap each fragment in an IPv4 header and inject it inbound.
+			for _, frag := range test.fragments {
+				hdr := buffer.NewPrependable(header.IPv4MinimumSize)
+
+				// Serialize IPv4 fixed header.
+				ip := header.IPv4(hdr.Prepend(header.IPv4MinimumSize))
+				ip.Encode(&header.IPv4Fields{
+					IHL:            header.IPv4MinimumSize,
+					TotalLength:    header.IPv4MinimumSize + uint16(len(frag.payload)),
+					ID:             frag.id,
+					Flags:          frag.flags,
+					FragmentOffset: frag.fragmentOffset,
+					TTL:            64,
+					Protocol:       uint8(header.UDPProtocolNumber),
+					SrcAddr:        addr1,
+					DstAddr:        addr2,
+				})
+
+				vv := hdr.View().ToVectorisedView()
+				vv.AppendView(frag.payload)
+
+				e.InjectInbound(header.IPv4ProtocolNumber, &stack.PacketBuffer{
+					Data: vv,
+				})
+			}
+
+			if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want {
+				t.Errorf("got UDP Rx Packets = %d, want = %d", got, want)
+			}
+
+			for i, expectedPayload := range test.expectedPayloads {
+				gotPayload, _, err := ep.Read(nil)
+				if err != nil {
+					t.Fatalf("(i=%d) Read(nil): %s", i, err)
+				}
+				if diff := cmp.Diff(buffer.View(expectedPayload), gotPayload); diff != "" {
+					t.Errorf("(i=%d) got UDP payload mismatch (-want +got):\n%s", i, diff)
+				}
+			}
+
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("(last) got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ipv6/BUILD b/pkg/tcpip/network/ipv6/BUILD
new file mode 100644
index 000000000..3f71fc520
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/BUILD
@@ -0,0 +1,44 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(  # Library target built from the package's non-test sources.
+    name = "ipv6",
+    srcs = [
+        "icmp.go",
+        "ipv6.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/network/fragmentation",
+        "//pkg/tcpip/network/hash",
+        "//pkg/tcpip/stack",
+    ],
+)
+
+go_test(  # Test target for the *_test.go files listed in srcs.
+    name = "ipv6_test",
+    size = "small",
+    srcs = [
+        "icmp_test.go",
+        "ipv6_test.go",
+        "ndp_test.go",
+    ],
+    library = ":ipv6",  # Refers to the go_library target declared above.
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/tcpip/network/ipv6/icmp.go b/pkg/tcpip/network/ipv6/icmp.go
new file mode 100644
index 000000000..2ff7eedf4
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/icmp.go
@@ -0,0 +1,549 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv6
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// handleControl processes the payload of an ICMP message that embeds the
+// headers of the packet which caused it to be sent. Those headers are parsed
+// to work out the transport protocol and remote address, so the dispatcher
+// can notify the right transport endpoint.
+func (e *endpoint) handleControl(typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	raw, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return
+	}
+	ip := header.IPv6(raw)
+
+	// IsValid() is deliberately not used: ICMP only has to carry up to 1280
+	// bytes of the original packet, so the copy is likely truncated and
+	// IsValid would return false for it.
+	//
+	// The original source address must match this endpoint's local address;
+	// otherwise the packet is dropped.
+	if src := ip.SourceAddress(); src != e.id.LocalAddress {
+		return
+	}
+
+	// Step over the fixed IPv6 header. A fragment header, if present, is
+	// examined next.
+	pkt.Data.TrimFront(header.IPv6MinimumSize)
+	transProto := ip.TransportProtocol()
+	if transProto == header.IPv6FragmentHeader {
+		fragBytes, ok := pkt.Data.PullUp(header.IPv6FragmentHeaderSize)
+		if !ok {
+			return
+		}
+		frag := header.IPv6Fragment(fragBytes)
+		if !(frag.IsValid() && frag.FragmentOffset() == 0) {
+			// Fragments that are not at offset 0 lack the transport
+			// headers, so they cannot be handled.
+			return
+		}
+
+		// Step over the fragment header and take the actual protocol
+		// number from it.
+		pkt.Data.TrimFront(header.IPv6FragmentHeaderSize)
+		transProto = frag.TransportProtocol()
+	}
+
+	// Hand the control packet to the transport endpoint.
+	e.dispatcher.DeliverTransportControlPacket(e.id.LocalAddress, ip.DestinationAddress(), ProtocolNumber, transProto, typ, extra, pkt)
+}
+
+func (e *endpoint) handleICMP(r *stack.Route, pkt *stack.PacketBuffer, hasFragmentHeader bool) {
+	stats := r.Stats().ICMP
+	sent := stats.V6PacketsSent
+	received := stats.V6PacketsReceived
+	// TODO(gvisor.dev/issue/170): ICMP packets don't have their
+	// TransportHeader fields set. See icmp/protocol.go:protocol.Parse for a
+	// full explanation.
+	v, ok := pkt.Data.PullUp(header.ICMPv6HeaderSize)
+	if !ok {
+		received.Invalid.Increment()
+		return
+	}
+	h := header.ICMPv6(v)
+	iph := header.IPv6(pkt.NetworkHeader)
+
+	// Validate ICMPv6 checksum before processing the packet.
+	//
+	// This copy is used as extra payload during the checksum calculation.
+	payload := pkt.Data.Clone(nil)
+	payload.TrimFront(len(h))
+	if got, want := h.Checksum(), header.ICMPv6Checksum(h, iph.SourceAddress(), iph.DestinationAddress(), payload); got != want {
+		received.Invalid.Increment()
+		return
+	}
+
+	isNDPValid := func() bool {
+		// As per RFC 4861 sections 4.1 - 4.5, 6.1.1, 6.1.2, 7.1.1, 7.1.2 and
+		// 8.1, nodes MUST silently drop NDP packets where the Hop Limit field
+		// in the IPv6 header is not set to 255, or the ICMPv6 Code field is not
+		// set to 0.
+		//
+		// As per RFC 6980 section 5, nodes MUST silently drop NDP messages if the
+		// packet includes a fragmentation header.
+		return !hasFragmentHeader && iph.HopLimit() == header.NDPHopLimit && h.Code() == 0
+	}
+
+	// TODO(b/112892170): Meaningfully handle all ICMP types.
+	switch h.Type() {
+	case header.ICMPv6PacketTooBig:
+		received.PacketTooBig.Increment()
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6PacketTooBigMinimumSize)
+		if !ok {
+			received.Invalid.Increment()
+			return
+		}
+		pkt.Data.TrimFront(header.ICMPv6PacketTooBigMinimumSize)
+		mtu := header.ICMPv6(hdr).MTU()
+		e.handleControl(stack.ControlPacketTooBig, calculateMTU(mtu), pkt)
+
+	case header.ICMPv6DstUnreachable:
+		received.DstUnreachable.Increment()
+		hdr, ok := pkt.Data.PullUp(header.ICMPv6DstUnreachableMinimumSize)
+		if !ok {
+			received.Invalid.Increment()
+			return
+		}
+		pkt.Data.TrimFront(header.ICMPv6DstUnreachableMinimumSize)
+		switch header.ICMPv6(hdr).Code() {
+		case header.ICMPv6PortUnreachable:
+			e.handleControl(stack.ControlPortUnreachable, 0, pkt)
+		}
+
+	case header.ICMPv6NeighborSolicit:
+		received.NeighborSolicit.Increment()
+		if pkt.Data.Size() < header.ICMPv6NeighborSolicitMinimumSize || !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
+
+		// The remainder of payload must be only the neighbor solicitation, so
+		// payload.ToView() always returns the solicitation. Per RFC 6980 section 5,
+		// NDP messages cannot be fragmented. Also note that in the common case NDP
+		// datagrams are very small and ToView() will not incur allocations.
+		ns := header.NDPNeighborSolicit(payload.ToView())
+		it, err := ns.Options().Iter(true)
+		if err != nil {
+			// If we have a malformed NDP NS option, drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
+		targetAddr := ns.TargetAddress()
+		s := r.Stack()
+		if isTentative, err := s.IsAddrTentative(e.nicID, targetAddr); err != nil {
+			// We will only get an error if the NIC is unrecognized, which should not
+			// happen. For now, drop this packet.
+			//
+			// TODO(b/141002840): Handle this better?
+			return
+		} else if isTentative {
+			// If the target address is tentative and the source of the packet is a
+			// unicast (specified) address, then the source of the packet is
+			// attempting to perform address resolution on the target. In this case,
+			// the solicitation is silently ignored, as per RFC 4862 section 5.4.3.
+			//
+			// If the target address is tentative and the source of the packet is the
+			// unspecified address (::), then we know another node is also performing
+			// DAD for the same address (since the target address is tentative for us,
+			// we know we are also performing DAD on it). In this case we let the
+			// stack know so it can handle such a scenario and do nothing further with
+			// the NS.
+			if r.RemoteAddress == header.IPv6Any {
+				s.DupTentativeAddrDetected(e.nicID, targetAddr)
+			}
+
+			// Do not handle neighbor solicitations targeted to an address that is
+			// tentative on the NIC any further.
+			return
+		}
+
+		// At this point we know that the target address is not tentative on the NIC
+		// so the packet is processed as defined in RFC 4861, as per RFC 4862
+		// section 5.4.3.
+
+		// Is the NS targetting us?
+		if e.linkAddrCache.CheckLocalAddress(e.nicID, ProtocolNumber, targetAddr) == 0 {
+			return
+		}
+
+		// If the NS message contains the Source Link-Layer Address option, update
+		// the link address cache with the value of the option.
+		//
+		// TODO(b/148429853): Properly process the NS message and do Neighbor
+		// Unreachability Detection.
+		var sourceLinkAddr tcpip.LinkAddress
+		for {
+			opt, done, err := it.Next()
+			if err != nil {
+				// This should never happen as Iter(true) above did not return an error.
+				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
+			}
+			if done {
+				break
+			}
+
+			switch opt := opt.(type) {
+			case header.NDPSourceLinkLayerAddressOption:
+				// No RFCs define what to do when an NS message has multiple Source
+				// Link-Layer Address options. Since no interface can have multiple
+				// link-layer addresses, we consider such messages invalid.
+				if len(sourceLinkAddr) != 0 {
+					received.Invalid.Increment()
+					return
+				}
+
+				sourceLinkAddr = opt.EthernetAddress()
+			}
+		}
+
+		unspecifiedSource := r.RemoteAddress == header.IPv6Any
+
+		// As per RFC 4861 section 4.3, the Source Link-Layer Address Option MUST
+		// NOT be included when the source IP address is the unspecified address.
+		// Otherwise, on link layers that have addresses this option MUST be
+		// included in multicast solicitations and SHOULD be included in unicast
+		// solicitations.
+		if len(sourceLinkAddr) == 0 {
+			if header.IsV6MulticastAddress(r.LocalAddress) && !unspecifiedSource {
+				received.Invalid.Increment()
+				return
+			}
+		} else if unspecifiedSource {
+			received.Invalid.Increment()
+			return
+		} else {
+			e.linkAddrCache.AddLinkAddress(e.nicID, r.RemoteAddress, sourceLinkAddr)
+		}
+
+		// ICMPv6 Neighbor Solicit messages are always sent to
+		// specially crafted IPv6 multicast addresses. As a result, the
+		// route we end up with here has as its LocalAddress such a
+		// multicast address. It would be nonsense to claim that our
+		// source address is a multicast address, so we manually set
+		// the source address to the target address requested in the
+		// solicit message. Since that requires mutating the route, we
+		// must first clone it.
+		r := r.Clone()
+		defer r.Release()
+		r.LocalAddress = targetAddr
+
+		// As per RFC 4861 section 7.2.4, if the the source of the solicitation is
+		// the unspecified address, the node MUST set the Solicited flag to zero and
+		// multicast the advertisement to the all-nodes address.
+		solicited := true
+		if unspecifiedSource {
+			solicited = false
+			r.RemoteAddress = header.IPv6AllNodesMulticastAddress
+		}
+
+		// If the NS has a source link-layer option, use the link address it
+		// specifies as the remote link address for the response instead of the
+		// source link address of the packet.
+		//
+		// TODO(#2401): As per RFC 4861 section 7.2.4 we should consult our link
+		// address cache for the right destination link address instead of manually
+		// patching the route with the remote link address if one is specified in a
+		// Source Link-Layer Address option.
+		if len(sourceLinkAddr) != 0 {
+			r.RemoteLinkAddress = sourceLinkAddr
+		}
+
+		optsSerializer := header.NDPOptionsSerializer{
+			header.NDPTargetLinkLayerAddressOption(r.LocalLinkAddress),
+		}
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborAdvertMinimumSize + int(optsSerializer.Length()))
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborAdvertSize))
+		packet.SetType(header.ICMPv6NeighborAdvert)
+		na := header.NDPNeighborAdvert(packet.NDPPayload())
+		na.SetSolicitedFlag(solicited)
+		na.SetOverrideFlag(true)
+		na.SetTargetAddress(targetAddr)
+		opts := na.Options()
+		opts.Serialize(optsSerializer)
+		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+		// RFC 4861 Neighbor Discovery for IP version 6 (IPv6)
+		//
+		// 7.1.2. Validation of Neighbor Advertisements
+		//
+		// The IP Hop Limit field has a value of 255, i.e., the packet
+		// could not possibly have been forwarded by a router.
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: header.NDPHopLimit, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header: hdr,
+		}); err != nil {
+			sent.Dropped.Increment()
+			return
+		}
+		sent.NeighborAdvert.Increment()
+
+	case header.ICMPv6NeighborAdvert:
+		received.NeighborAdvert.Increment()
+		if pkt.Data.Size() < header.ICMPv6NeighborAdvertSize || !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
+
+		// The remainder of payload must be only the neighbor advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		na := header.NDPNeighborAdvert(payload.ToView())
+		it, err := na.Options().Iter(true)
+		if err != nil {
+			// If we have a malformed NDP NA option, drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
+		targetAddr := na.TargetAddress()
+		stack := r.Stack()
+
+		if isTentative, err := stack.IsAddrTentative(e.nicID, targetAddr); err != nil {
+			// We will only get an error if the NIC is unrecognized, which should not
+			// happen. For now short-circuit this packet.
+			//
+			// TODO(b/141002840): Handle this better?
+			return
+		} else if isTentative {
+			// We just got an NA from a node that owns an address we are performing
+			// DAD on, implying the address is not unique. In this case we let the
+			// stack know so it can handle such a scenario and do nothing furthur with
+			// the NDP NA.
+			stack.DupTentativeAddrDetected(e.nicID, targetAddr)
+			return
+		}
+
+		// At this point we know that the target address is not tentative on the
+		// NIC. However, the target address may still be assigned to the NIC but not
+		// tentative (it could be permanent). Such a scenario is beyond the scope of
+		// RFC 4862. As such, we simply ignore such a scenario for now and proceed
+		// as normal.
+		//
+		// TODO(b/143147598): Handle the scenario described above. Also inform the
+		// netstack integration that a duplicate address was detected outside of
+		// DAD.
+
+		// If the NA message has the target link layer option, update the link
+		// address cache with the link address for the target of the message.
+		//
+		// TODO(b/148429853): Properly process the NA message and do Neighbor
+		// Unreachability Detection.
+		var targetLinkAddr tcpip.LinkAddress
+		for {
+			opt, done, err := it.Next()
+			if err != nil {
+				// This should never happen as Iter(true) above did not return an error.
+				panic(fmt.Sprintf("unexpected error when iterating over NDP options: %s", err))
+			}
+			if done {
+				break
+			}
+
+			switch opt := opt.(type) {
+			case header.NDPTargetLinkLayerAddressOption:
+				// No RFCs define what to do when an NA message has multiple Target
+				// Link-Layer Address options. Since no interface can have multiple
+				// link-layer addresses, we consider such messages invalid.
+				if len(targetLinkAddr) != 0 {
+					received.Invalid.Increment()
+					return
+				}
+
+				targetLinkAddr = opt.EthernetAddress()
+			}
+		}
+
+		if len(targetLinkAddr) != 0 {
+			e.linkAddrCache.AddLinkAddress(e.nicID, targetAddr, targetLinkAddr)
+		}
+
+	case header.ICMPv6EchoRequest:
+		received.EchoRequest.Increment()
+		icmpHdr, ok := pkt.Data.PullUp(header.ICMPv6EchoMinimumSize)
+		if !ok {
+			received.Invalid.Increment()
+			return
+		}
+		pkt.Data.TrimFront(header.ICMPv6EchoMinimumSize)
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6EchoMinimumSize)
+		packet := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
+		copy(packet, icmpHdr)
+		packet.SetType(header.ICMPv6EchoReply)
+		packet.SetChecksum(header.ICMPv6Checksum(packet, r.LocalAddress, r.RemoteAddress, pkt.Data))
+		if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   pkt.Data,
+		}); err != nil {
+			sent.Dropped.Increment()
+			return
+		}
+		sent.EchoReply.Increment()
+
+	case header.ICMPv6EchoReply:
+		received.EchoReply.Increment()
+		if pkt.Data.Size() < header.ICMPv6EchoMinimumSize {
+			received.Invalid.Increment()
+			return
+		}
+		e.dispatcher.DeliverTransportPacket(r, header.ICMPv6ProtocolNumber, pkt)
+
+	case header.ICMPv6TimeExceeded:
+		received.TimeExceeded.Increment()
+
+	case header.ICMPv6ParamProblem:
+		received.ParamProblem.Increment()
+
+	case header.ICMPv6RouterSolicit:
+		received.RouterSolicit.Increment()
+		if !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
+
+	case header.ICMPv6RouterAdvert:
+		received.RouterAdvert.Increment()
+
+		// Is the NDP payload of sufficient size to hold a Router
+		// Advertisement?
+		if pkt.Data.Size()-header.ICMPv6HeaderSize < header.NDPRAMinimumSize || !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
+
+		routerAddr := iph.SourceAddress()
+
+		//
+		// Validate the RA as per RFC 4861 section 6.1.2.
+		//
+
+		// Is the IP Source Address a link-local address?
+		if !header.IsV6LinkLocalAddress(routerAddr) {
+			// ...No, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
+		// The remainder of payload must be only the router advertisement, so
+		// payload.ToView() always returns the advertisement. Per RFC 6980 section
+		// 5, NDP messages cannot be fragmented. Also note that in the common case
+		// NDP datagrams are very small and ToView() will not incur allocations.
+		ra := header.NDPRouterAdvert(payload.ToView())
+		opts := ra.Options()
+
+		// Are options valid as per the wire format?
+		if _, err := opts.Iter(true); err != nil {
+			// ...No, silently drop the packet.
+			received.Invalid.Increment()
+			return
+		}
+
+		//
+		// At this point, we have a valid Router Advertisement, as far
+		// as RFC 4861 section 6.1.2 is concerned.
+		//
+
+		// Tell the NIC to handle the RA.
+		stack := r.Stack()
+		rxNICID := r.NICID()
+		stack.HandleNDPRA(rxNICID, routerAddr, ra)
+
+	case header.ICMPv6RedirectMsg:
+		received.RedirectMsg.Increment()
+		if !isNDPValid() {
+			received.Invalid.Increment()
+			return
+		}
+
+	default:
+		received.Invalid.Increment()
+	}
+}
+
+const (
+	ndpSolicitedFlag = 1 << 6 // presumably the NA "Solicited" flag bit; unused in the code shown here
+	ndpOverrideFlag  = 1 << 5 // presumably the NA "Override" flag bit; unused in the code shown here
+
+	ndpOptSrcLinkAddr = 1 // option type byte that LinkAddressRequest writes into its solicitation
+	ndpOptDstLinkAddr = 2 // presumably the target link-layer address option type; TODO confirm
+
+	icmpV6FlagOffset   = 4  // presumably the offset of the flags byte; TODO confirm
+	icmpV6OptOffset    = 24 // offset of the first option's type byte in the ICMPv6 message
+	icmpV6LengthOffset = 25 // offset of the first option's length byte in the ICMPv6 message
+)
+
+var broadcastMAC = tcpip.LinkAddress([]byte{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}) // six 0xff bytes
+
+var _ stack.LinkAddressResolver = (*protocol)(nil) // compile-time check that *protocol satisfies the interface
+
+// LinkAddressProtocol implements stack.LinkAddressResolver; it always reports the IPv6 protocol number.
+func (*protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return header.IPv6ProtocolNumber
+}
+
+// LinkAddressRequest implements stack.LinkAddressResolver by writing an NDP Neighbor Solicitation for addr to linkEP.
+func (*protocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP stack.LinkEndpoint) *tcpip.Error {
+	solicitedNode := header.SolicitedNodeAddr(addr)
+
+	// TODO(b/148672031): Use stack.FindRoute instead of manually creating the
+	// route here. Note, we would need the nicID to do this properly so the right
+	// NIC (associated to linkEP) is used to send the NDP NS message.
+	route := &stack.Route{
+		LocalAddress:      localAddr,
+		RemoteAddress:     solicitedNode,
+		RemoteLinkAddress: header.EthernetAddressFromMulticastIPv6Address(solicitedNode),
+	}
+
+	// The message body is the target address, ending where the option starts,
+	// followed by one option carrying the link address of linkEP.
+	out := buffer.NewPrependable(int(linkEP.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
+	ns := header.ICMPv6(out.Prepend(header.ICMPv6NeighborAdvertSize))
+	ns.SetType(header.ICMPv6NeighborSolicit)
+	copy(ns[icmpV6OptOffset-len(addr):], addr)
+	ns[icmpV6OptOffset], ns[icmpV6LengthOffset] = ndpOptSrcLinkAddr, 1
+	copy(ns[icmpV6LengthOffset+1:], linkEP.LinkAddress())
+	ns.SetChecksum(header.ICMPv6Checksum(ns, route.LocalAddress, route.RemoteAddress, buffer.VectorisedView{}))
+
+	payloadLen := uint16(out.UsedLength())
+	ipHdr := header.IPv6(out.Prepend(header.IPv6MinimumSize))
+	ipHdr.Encode(&header.IPv6Fields{
+		PayloadLength: payloadLen,
+		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+		HopLimit:      header.NDPHopLimit,
+		SrcAddr:       route.LocalAddress,
+		DstAddr:       route.RemoteAddress,
+	})
+
+	// TODO(stijlist): count this in ICMP stats.
+	return linkEP.WritePacket(route, nil /* gso */, ProtocolNumber, &stack.PacketBuffer{Header: out})
+}
+
+// ResolveStaticAddress implements stack.LinkAddressResolver; only multicast addresses resolve without a request.
+func (*protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if !header.IsV6MulticastAddress(addr) {
+		return tcpip.LinkAddress([]byte(nil)), false
+	}
+	return header.EthernetAddressFromMulticastIPv6Address(addr), true
+}
diff --git a/pkg/tcpip/network/ipv6/icmp_test.go b/pkg/tcpip/network/ipv6/icmp_test.go
new file mode 100644
index 000000000..52a01b44e
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/icmp_test.go
@@ -0,0 +1,953 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv6
+
+import (
+	"context"
+	"reflect"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	linkAddr0 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06") // link address that lladdr0 is derived from
+	linkAddr1 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0e") // link address that lladdr1 is derived from
+	linkAddr2 = tcpip.LinkAddress("\x0a\x0b\x0c\x0d\x0e\x0f") // not referenced anywhere in this file
+)
+
+var (
+	lladdr0 = header.LinkLocalAddr(linkAddr0) // address assigned to the stack under test (s / s0)
+	lladdr1 = header.LinkLocalAddr(linkAddr1) // peer address; assigned to s1 in the two-stack tests
+)
+
+type stubLinkEndpoint struct { // link endpoint test double; only the methods below are overridden
+	stack.LinkEndpoint
+}
+
+func (*stubLinkEndpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return 0 // no capability bits set
+}
+
+func (*stubLinkEndpoint) MaxHeaderLength() uint16 {
+	return 0 // no link header space requested
+}
+
+func (*stubLinkEndpoint) LinkAddress() tcpip.LinkAddress {
+	return "" // the stub has no link address
+}
+
+func (*stubLinkEndpoint) WritePacket(*stack.Route, *stack.GSO, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) *tcpip.Error {
+	return nil // the packet is discarded and success is reported
+}
+
+func (*stubLinkEndpoint) Attach(stack.NetworkDispatcher) {} // no-op
+
+type stubDispatcher struct { // transport dispatcher test double; delivered packets are ignored
+	stack.TransportDispatcher
+}
+
+func (*stubDispatcher) DeliverTransportPacket(*stack.Route, tcpip.TransportProtocolNumber, *stack.PacketBuffer) {
+}
+
+type stubLinkAddressCache struct { // link address cache test double; it stores nothing
+	stack.LinkAddressCache
+}
+
+func (*stubLinkAddressCache) CheckLocalAddress(tcpip.NICID, tcpip.NetworkProtocolNumber, tcpip.Address) tcpip.NICID {
+	return 0 // always answers with NIC ID 0
+}
+
+func (*stubLinkAddressCache) AddLinkAddress(tcpip.NICID, tcpip.Address, tcpip.LinkAddress) {
+}
+
+func TestICMPCounts(t *testing.T) { // expects every ICMPv6 received-packet counter to end at exactly 1
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+	})
+	{
+		if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
+			t.Fatalf("CreateNIC(_) = %s", err)
+		}
+		if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+			t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+		}
+	}
+	{
+		subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable(
+			[]tcpip.Route{{
+				Destination: subnet,
+				NIC:         1,
+			}},
+		)
+	}
+
+	netProto := s.NetworkProtocolInstance(ProtocolNumber)
+	if netProto == nil {
+		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+	}
+	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{lladdr1, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
+	if err != nil {
+		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
+	}
+
+	r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+	}
+	defer r.Release()
+
+	var tllData [header.NDPLinkLayerAddressSize]byte // serialized option appended to the Neighbor Advertisement below
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
+	types := []struct {
+		typ       header.ICMPv6Type
+		size      int
+		extraData []byte
+	}{
+		{
+			typ:  header.ICMPv6DstUnreachable,
+			size: header.ICMPv6DstUnreachableMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6PacketTooBig,
+			size: header.ICMPv6PacketTooBigMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6TimeExceeded,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6ParamProblem,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6EchoRequest,
+			size: header.ICMPv6EchoMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6EchoReply,
+			size: header.ICMPv6EchoMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+		},
+		{
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+		},
+		{
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+		},
+		{
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+		},
+		{
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+		},
+	}
+
+	handleIPv6Payload := func(icmp header.ICMPv6) { // hands icmp to the endpoint behind a freshly encoded IPv6 header
+		ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
+		ip.Encode(&header.IPv6Fields{
+			PayloadLength: uint16(len(icmp)),
+			NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+			HopLimit:      header.NDPHopLimit,
+			SrcAddr:       r.LocalAddress,
+			DstAddr:       r.RemoteAddress,
+		})
+		ep.HandlePacket(&r, &stack.PacketBuffer{
+			NetworkHeader: buffer.View(ip),
+			Data:          buffer.View(icmp).ToVectorisedView(),
+		})
+	}
+
+	for _, typ := range types { // one correctly checksummed message per ICMPv6 type
+		icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+		copy(icmp[typ.size:], typ.extraData)
+		icmp.SetType(typ.typ)
+		icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+		handleIPv6Payload(icmp)
+	}
+
+	// Inject an ICMPv6 message with no fields set so that
+	// Stats().ICMP.V6PacketsReceived.Invalid is incremented.
+	handleIPv6Payload(header.ICMPv6(buffer.NewView(header.IPv6MinimumSize)))
+
+	icmpv6Stats := s.Stats().ICMP.V6PacketsReceived
+	visitStats(reflect.ValueOf(&icmpv6Stats).Elem(), func(name string, s *tcpip.StatCounter) {
+		if got, want := s.Value(), uint64(1); got != want {
+			t.Errorf("got %s = %d, want = %d", name, got, want)
+		}
+	})
+	if t.Failed() {
+		t.Logf("stats:\n%+v", s.Stats())
+	}
+}
+
+// visitStats calls f with the field name and value of each *tcpip.StatCounter field reachable from struct v.
+func visitStats(v reflect.Value, f func(string, *tcpip.StatCounter)) {
+	for i, typ := 0, v.Type(); i < v.NumField(); i++ {
+		field := v.Field(i)
+		if counter, isCounter := field.Interface().(*tcpip.StatCounter); !isCounter {
+			visitStats(field, f) // not a counter: descend into the nested value
+		} else {
+			f(typ.Field(i).Name, counter)
+		}
+	}
+}
+
+type testContext struct {
+	s0 *stack.Stack // stack that is given lladdr0
+	s1 *stack.Stack // stack that is given lladdr1
+
+	linkEP0 *channel.Endpoint // channel endpoint behind NIC 1 of s0
+	linkEP1 *channel.Endpoint // channel endpoint behind NIC 1 of s1
+}
+
+type endpointWithResolutionCapability struct { // wrapper that adds CapabilityResolutionRequired to the wrapped endpoint
+	stack.LinkEndpoint
+}
+
+func (e endpointWithResolutionCapability) Capabilities() stack.LinkEndpointCapabilities {
+	return e.LinkEndpoint.Capabilities() | stack.CapabilityResolutionRequired
+}
+
+func newTestContext(t *testing.T) *testContext { // builds two IPv6+ICMPv6 stacks, each with one NIC and a route to the other
+	c := &testContext{
+		s0: stack.New(stack.Options{
+			NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+			TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+		}),
+		s1: stack.New(stack.Options{
+			NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+			TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+		}),
+	}
+
+	const defaultMTU = 65536
+	c.linkEP0 = channel.New(256, defaultMTU, linkAddr0)
+
+	wrappedEP0 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP0})
+	if testing.Verbose() { // only s0's endpoint is wrapped in a sniffer, and only under -v
+		wrappedEP0 = sniffer.New(wrappedEP0)
+	}
+	if err := c.s0.CreateNIC(1, wrappedEP0); err != nil {
+		t.Fatalf("CreateNIC s0: %v", err)
+	}
+	if err := c.s0.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+		t.Fatalf("AddAddress lladdr0: %v", err)
+	}
+
+	c.linkEP1 = channel.New(256, defaultMTU, linkAddr1)
+	wrappedEP1 := stack.LinkEndpoint(endpointWithResolutionCapability{LinkEndpoint: c.linkEP1})
+	if err := c.s1.CreateNIC(1, wrappedEP1); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+	if err := c.s1.AddAddress(1, ProtocolNumber, lladdr1); err != nil {
+		t.Fatalf("AddAddress lladdr1: %v", err)
+	}
+
+	subnet0, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1)))) // all-ones mask: lladdr1 only
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.s0.SetRouteTable(
+		[]tcpip.Route{{
+			Destination: subnet0,
+			NIC:         1,
+		}},
+	)
+	subnet1, err := tcpip.NewSubnet(lladdr0, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr0)))) // all-ones mask: lladdr0 only
+	if err != nil {
+		t.Fatal(err)
+	}
+	c.s1.SetRouteTable(
+		[]tcpip.Route{{
+			Destination: subnet1,
+			NIC:         1,
+		}},
+	)
+
+	return c
+}
+
+func (c *testContext) cleanup() { // closes both channel endpoints
+	c.linkEP0.Close()
+	c.linkEP1.Close()
+}
+
+type routeArgs struct {
+	src, dst       *channel.Endpoint // packet is read from src and injected into dst
+	typ            header.ICMPv6Type // ICMPv6 type the forwarded packet must have
+	remoteLinkAddr tcpip.LinkAddress // expected route link address; not checked when empty
+}
+
+func routeICMPv6Packet(t *testing.T, args routeArgs, fn func(*testing.T, header.ICMPv6)) { // forwards one packet src->dst, then validates it
+	t.Helper()
+
+	pi, _ := args.src.ReadContext(context.Background()) // NOTE(review): no deadline and the second result is dropped; looks like this can block forever - confirm
+
+	{
+		views := []buffer.View{pi.Pkt.Header.View(), pi.Pkt.Data.ToView()}
+		size := pi.Pkt.Header.UsedLength() + pi.Pkt.Data.Size()
+		vv := buffer.NewVectorisedView(size, views)
+		args.dst.InjectLinkAddr(pi.Proto, args.dst.LinkAddress(), &stack.PacketBuffer{
+			Data: vv,
+		})
+	}
+
+	if pi.Proto != ProtocolNumber { // checked only after the packet has already been injected into dst
+		t.Errorf("unexpected protocol number %d", pi.Proto)
+		return
+	}
+
+	if len(args.remoteLinkAddr) != 0 && args.remoteLinkAddr != pi.Route.RemoteLinkAddress {
+		t.Errorf("got remote link address = %s, want = %s", pi.Route.RemoteLinkAddress, args.remoteLinkAddr)
+	}
+
+	ipv6 := header.IPv6(pi.Pkt.Header.View())
+	transProto := tcpip.TransportProtocolNumber(ipv6.NextHeader())
+	if transProto != header.ICMPv6ProtocolNumber {
+		t.Errorf("unexpected transport protocol number %d", transProto)
+		return
+	}
+	icmpv6 := header.ICMPv6(ipv6.Payload())
+	if got, want := icmpv6.Type(), args.typ; got != want {
+		t.Errorf("got ICMPv6 type = %d, want = %d", got, want)
+		return
+	}
+	if fn != nil { // optional extra checks supplied by the caller
+		fn(t, icmpv6)
+	}
+}
+
+func TestLinkResolution(t *testing.T) { // an echo request from s0 must first trigger an NS/NA exchange, then an echo exchange
+	c := newTestContext(t)
+	defer c.cleanup()
+
+	r, err := c.s0.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+	}
+	defer r.Release()
+
+	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.IPv6MinimumSize + header.ICMPv6EchoMinimumSize)
+	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6EchoMinimumSize))
+	pkt.SetType(header.ICMPv6EchoRequest)
+	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+	payload := tcpip.SlicePayload(hdr.View())
+
+	// We can't send our payload directly over the route because that
+	// doesn't provoke NDP discovery, so it is written through an ICMPv6 endpoint.
+	var wq waiter.Queue
+	ep, err := c.s0.NewEndpoint(header.ICMPv6ProtocolNumber, ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
+	}
+
+	for { // retry the write until it no longer returns a resolution channel
+		_, resCh, err := ep.Write(payload, tcpip.WriteOptions{To: &tcpip.FullAddress{NIC: 1, Addr: lladdr1}})
+		if resCh != nil {
+			if err != tcpip.ErrNoLinkAddress {
+				t.Fatalf("ep.Write(_) = _, <non-nil>, %s, want = _, <non-nil>, tcpip.ErrNoLinkAddress", err)
+			}
+			for _, args := range []routeArgs{
+				{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6NeighborSolicit, remoteLinkAddr: header.EthernetAddressFromMulticastIPv6Address(header.SolicitedNodeAddr(lladdr1))},
+				{src: c.linkEP1, dst: c.linkEP0, typ: header.ICMPv6NeighborAdvert},
+			} {
+				routeICMPv6Packet(t, args, func(t *testing.T, icmpv6 header.ICMPv6) {
+					if got, want := tcpip.Address(icmpv6[8:][:16]), lladdr1; got != want { // bytes 8..23 are compared as the target
+						t.Errorf("%d: got target = %s, want = %s", icmpv6.Type(), got, want)
+					}
+				})
+			}
+			<-resCh // presumably signals that link resolution finished; TODO confirm
+			continue
+		}
+		if err != nil {
+			t.Fatalf("ep.Write(_) = _, _, %s", err)
+		}
+		break
+	}
+
+	for _, args := range []routeArgs{
+		{src: c.linkEP0, dst: c.linkEP1, typ: header.ICMPv6EchoRequest},
+		{src: c.linkEP1, dst: c.linkEP0, typ: header.ICMPv6EchoReply},
+	} {
+		routeICMPv6Packet(t, args, nil)
+	}
+}
+
+func TestICMPChecksumValidationSimple(t *testing.T) { // per type: no checksum counts as Invalid, a correct one counts under the type
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
+	types := []struct {
+		name        string
+		typ         header.ICMPv6Type
+		size        int
+		extraData   []byte
+		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+	}{
+		{
+			name: "DstUnreachable",
+			typ:  header.ICMPv6DstUnreachable,
+			size: header.ICMPv6DstUnreachableMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.DstUnreachable
+			},
+		},
+		{
+			name: "PacketTooBig",
+			typ:  header.ICMPv6PacketTooBig,
+			size: header.ICMPv6PacketTooBigMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.PacketTooBig
+			},
+		},
+		{
+			name: "TimeExceeded",
+			typ:  header.ICMPv6TimeExceeded,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.TimeExceeded
+			},
+		},
+		{
+			name: "ParamProblem",
+			typ:  header.ICMPv6ParamProblem,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.ParamProblem
+			},
+		},
+		{
+			name: "EchoRequest",
+			typ:  header.ICMPv6EchoRequest,
+			size: header.ICMPv6EchoMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoRequest
+			},
+		},
+		{
+			name: "EchoReply",
+			typ:  header.ICMPv6EchoReply,
+			size: header.ICMPv6EchoMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoReply
+			},
+		},
+		{
+			name: "RouterSolicit",
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterSolicit
+			},
+		},
+		{
+			name: "RouterAdvert",
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterAdvert
+			},
+		},
+		{
+			name: "NeighborSolicit",
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborSolicit
+			},
+		},
+		{
+			name:      "NeighborAdvert",
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborAdvert
+			},
+		},
+		{
+			name: "RedirectMsg",
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RedirectMsg
+			},
+		},
+	}
+
+	for _, typ := range types {
+		t.Run(typ.name, func(t *testing.T) { // each subtest builds its own stack, so counters start from zero
+			e := channel.New(10, 1280, linkAddr0)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         1,
+					}},
+				)
+			}
+
+			handleIPv6Payload := func(checksum bool) { // injects one message of typ.typ from lladdr1 to lladdr0
+				icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+				copy(icmp[typ.size:], typ.extraData)
+				icmp.SetType(typ.typ)
+				if checksum { // when false, no checksum is written into the message
+					icmp.SetChecksum(header.ICMPv6Checksum(icmp, lladdr1, lladdr0, buffer.View{}.ToVectorisedView()))
+				}
+				ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(len(icmp)),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       lladdr1,
+					DstAddr:       lladdr0,
+				})
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+					Data: buffer.NewVectorisedView(len(ip)+len(icmp), []buffer.View{buffer.View(ip), buffer.View(icmp)}),
+				})
+			}
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			typStat := typ.statCounter(stats)
+
+			// Both counters must start at 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// Without setting checksum, the incoming packet should
+			// be counted as invalid.
+			handleIPv6Payload(false)
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+			// Rx count of type typ.typ should not have increased.
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// When checksum is set, the packet should be counted under its type.
+			handleIPv6Payload(true)
+			if got := typStat.Value(); got != 1 {
+				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+			}
+			// Invalid count should not have increased again.
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+		})
+	}
+}
+
+func TestICMPChecksumValidationWithPayload(t *testing.T) { // checksum checks with the payload in the same view as the headers
+	const simpleBodySize = 64
+	simpleBody := func(view buffer.View) { // fills the first simpleBodySize bytes with 0, 1, 2, ...
+		for i := 0; i < simpleBodySize; i++ {
+			view[i] = uint8(i)
+		}
+	}
+
+	const errorICMPBodySize = header.IPv6MinimumSize + simpleBodySize
+	errorICMPBody := func(view buffer.View) { // writes an IPv6 header followed by simpleBody
+		ip := header.IPv6(view)
+		ip.Encode(&header.IPv6Fields{
+			PayloadLength: simpleBodySize,
+			NextHeader:    10,
+			HopLimit:      20,
+			SrcAddr:       lladdr0,
+			DstAddr:       lladdr1,
+		})
+		simpleBody(view[header.IPv6MinimumSize:])
+	}
+
+	types := []struct {
+		name        string
+		typ         header.ICMPv6Type
+		size        int
+		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+		payloadSize int
+		payload     func(buffer.View)
+	}{
+		{
+			"DstUnreachable",
+			header.ICMPv6DstUnreachable,
+			header.ICMPv6DstUnreachableMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.DstUnreachable
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"PacketTooBig",
+			header.ICMPv6PacketTooBig,
+			header.ICMPv6PacketTooBigMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.PacketTooBig
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"TimeExceeded",
+			header.ICMPv6TimeExceeded,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.TimeExceeded
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"ParamProblem",
+			header.ICMPv6ParamProblem,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.ParamProblem
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"EchoRequest",
+			header.ICMPv6EchoRequest,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoRequest
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+		{
+			"EchoReply",
+			header.ICMPv6EchoReply,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoReply
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+	}
+
+	for _, typ := range types {
+		t.Run(typ.name, func(t *testing.T) { // each subtest builds its own stack, so counters start from zero
+			e := channel.New(10, 1280, linkAddr0)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         1,
+					}},
+				)
+			}
+
+			handleIPv6Payload := func(typ header.ICMPv6Type, size, payloadSize int, payloadFn func(buffer.View), checksum bool) {
+				icmpSize := size + payloadSize // ICMPv6 header and payload share one prepended region
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+				pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+				pkt.SetType(typ)
+				payloadFn(pkt.Payload())
+
+				if checksum { // when false, no checksum is written into the message
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+				}
+
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(icmpSize),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       lladdr1,
+					DstAddr:       lladdr0,
+				})
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+					Data: hdr.View().ToVectorisedView(),
+				})
+			}
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			typStat := typ.statCounter(stats)
+
+			// Both counters must start at 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// Without setting checksum, the incoming packet should
+			// be counted as invalid.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, false)
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+			// Rx count of type typ.typ should not have increased.
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// When checksum is set, the packet should be counted under its type.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
+			if got := typStat.Value(); got != 1 {
+				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+			}
+			// Invalid count should not have increased again.
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+		})
+	}
+}
+
+func TestICMPChecksumValidationWithPayloadMultipleViews(t *testing.T) { // checksum checks with the payload in its own view
+	const simpleBodySize = 64
+	simpleBody := func(view buffer.View) { // fills the first simpleBodySize bytes with 0, 1, 2, ...
+		for i := 0; i < simpleBodySize; i++ {
+			view[i] = uint8(i)
+		}
+	}
+
+	const errorICMPBodySize = header.IPv6MinimumSize + simpleBodySize
+	errorICMPBody := func(view buffer.View) { // writes an IPv6 header followed by simpleBody
+		ip := header.IPv6(view)
+		ip.Encode(&header.IPv6Fields{
+			PayloadLength: simpleBodySize,
+			NextHeader:    10,
+			HopLimit:      20,
+			SrcAddr:       lladdr0,
+			DstAddr:       lladdr1,
+		})
+		simpleBody(view[header.IPv6MinimumSize:])
+	}
+
+	types := []struct {
+		name        string
+		typ         header.ICMPv6Type
+		size        int
+		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+		payloadSize int
+		payload     func(buffer.View)
+	}{
+		{
+			"DstUnreachable",
+			header.ICMPv6DstUnreachable,
+			header.ICMPv6DstUnreachableMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.DstUnreachable
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"PacketTooBig",
+			header.ICMPv6PacketTooBig,
+			header.ICMPv6PacketTooBigMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.PacketTooBig
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"TimeExceeded",
+			header.ICMPv6TimeExceeded,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.TimeExceeded
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"ParamProblem",
+			header.ICMPv6ParamProblem,
+			header.ICMPv6MinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.ParamProblem
+			},
+			errorICMPBodySize,
+			errorICMPBody,
+		},
+		{
+			"EchoRequest",
+			header.ICMPv6EchoRequest,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoRequest
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+		{
+			"EchoReply",
+			header.ICMPv6EchoReply,
+			header.ICMPv6EchoMinimumSize,
+			func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.EchoReply
+			},
+			simpleBodySize,
+			simpleBody,
+		},
+	}
+
+	for _, typ := range types {
+		t.Run(typ.name, func(t *testing.T) { // each subtest builds its own stack, so counters start from zero
+			e := channel.New(10, 1280, linkAddr0)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, lladdr0, err)
+			}
+			{
+				subnet, err := tcpip.NewSubnet(lladdr1, tcpip.AddressMask(strings.Repeat("\xff", len(lladdr1))))
+				if err != nil {
+					t.Fatal(err)
+				}
+				s.SetRouteTable(
+					[]tcpip.Route{{
+						Destination: subnet,
+						NIC:         1,
+					}},
+				)
+			}
+
+			handleIPv6Payload := func(typ header.ICMPv6Type, size, payloadSize int, payloadFn func(buffer.View), checksum bool) {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + size) // headers only; the payload gets its own view
+				pkt := header.ICMPv6(hdr.Prepend(size))
+				pkt.SetType(typ)
+
+				payload := buffer.NewView(payloadSize)
+				payloadFn(payload)
+
+				if checksum { // when false, no checksum is written into the message
+					pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, payload.ToVectorisedView()))
+				}
+
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(size + payloadSize),
+					NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+					HopLimit:      header.NDPHopLimit,
+					SrcAddr:       lladdr1,
+					DstAddr:       lladdr0,
+				})
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+					Data: buffer.NewVectorisedView(header.IPv6MinimumSize+size+payloadSize, []buffer.View{hdr.View(), payload}),
+				})
+			}
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			typStat := typ.statCounter(stats)
+
+			// Both counters must start at 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// Without setting checksum, the incoming packet should
+			// be counted as invalid.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, false)
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+			// Rx count of type typ.typ should not have increased.
+			if got := typStat.Value(); got != 0 {
+				t.Fatalf("got %s = %d, want = 0", typ.name, got)
+			}
+
+			// When checksum is set, the packet should be counted under its type.
+			handleIPv6Payload(typ.typ, typ.size, typ.payloadSize, typ.payload, true)
+			if got := typStat.Value(); got != 1 {
+				t.Fatalf("got %s = %d, want = 1", typ.name, got)
+			}
+			// Invalid count should not have increased again.
+			if got := invalid.Value(); got != 1 {
+				t.Fatalf("got invalid = %d, want = 1", got)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ipv6/ipv6.go b/pkg/tcpip/network/ipv6/ipv6.go
new file mode 100644
index 000000000..95fbcf2d1
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/ipv6.go
@@ -0,0 +1,599 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ipv6 contains the implementation of the ipv6 network protocol. To use
+// it in the networking stack, this package must be added to the project, and
+// activated on the stack by passing ipv6.NewProtocol() as one of the network
+// protocols when calling stack.New(). Then endpoints can be created by passing
+// ipv6.ProtocolNumber as the network protocol number when calling
+// Stack.NewEndpoint().
+package ipv6
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/network/fragmentation"
+	"gvisor.dev/gvisor/pkg/tcpip/network/hash"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+	// ProtocolNumber is the ipv6 protocol number.
+	ProtocolNumber = header.IPv6ProtocolNumber
+
+	// maxPayloadSize is the maximum size that can be encoded in the 16-bit
+	// PayloadLength field of the ipv6 header.
+	maxPayloadSize = 0xffff
+
+	// DefaultTTL is the default hop limit for IPv6 Packets egressed by
+	// Netstack.
+	DefaultTTL = 64
+)
+
+type endpoint struct {
+	nicID         tcpip.NICID                  // ID of the NIC this endpoint belongs to.
+	id            stack.NetworkEndpointID      // Endpoint ID; carries the local address.
+	prefixLen     int                          // Subnet prefix length in bits.
+	linkEP        stack.LinkEndpoint           // Link endpoint that outgoing packets are written to.
+	linkAddrCache stack.LinkAddressCache       // Link address cache supplied to NewEndpoint.
+	dispatcher    stack.TransportDispatcher    // Receives transport packets from HandlePacket.
+	fragmentation *fragmentation.Fragmentation // Reassembles fragmented packets in HandlePacket.
+	protocol      *protocol                    // Owning protocol; source of the default TTL.
+}
+
+// DefaultTTL returns the default hop limit currently configured on the protocol.
+func (e *endpoint) DefaultTTL() uint8 {
+	return e.protocol.DefaultTTL()
+}
+
+// MTU implements stack.NetworkEndpoint.MTU. It returns the link-layer MTU minus
+// the fixed IPv6 header size, capped at maxPayloadSize (see calculateMTU).
+func (e *endpoint) MTU() uint32 {
+	return calculateMTU(e.linkEP.MTU())
+}
+
+// NICID returns the ID of the NIC this endpoint belongs to, as given to NewEndpoint.
+func (e *endpoint) NICID() tcpip.NICID {
+	return e.nicID
+}
+
+// ID returns a pointer to the ipv6 endpoint ID, which holds its local address.
+func (e *endpoint) ID() *stack.NetworkEndpointID {
+	return &e.id
+}
+
+// PrefixLen returns the ipv6 endpoint subnet prefix length in bits, as set at creation.
+func (e *endpoint) PrefixLen() int {
+	return e.prefixLen
+}
+
+// Capabilities implements stack.NetworkEndpoint.Capabilities by reporting linkEP's.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
+	return e.linkEP.Capabilities()
+}
+
+// MaxHeaderLength returns the link endpoint's maximum header length plus the
+// fixed IPv6 header size; extension headers are not included.
+func (e *endpoint) MaxHeaderLength() uint16 {
+	return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize
+}
+
+// GSOMaxSize returns linkEP's maximum GSO packet size, or 0 if linkEP is not a stack.GSOEndpoint.
+func (e *endpoint) GSOMaxSize() uint32 {
+	if gso, ok := e.linkEP.(stack.GSOEndpoint); ok {
+		return gso.GSOMaxSize()
+	}
+	return 0
+}
+
+func (e *endpoint) addIPHeader(r *stack.Route, hdr *buffer.Prependable, payloadSize int, params stack.NetworkHeaderParams) header.IPv6 {
+	length := uint16(hdr.UsedLength() + payloadSize) // NOTE(review): sums above 0xffff are silently truncated here.
+	ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	ip.Encode(&header.IPv6Fields{
+		PayloadLength: length,
+		NextHeader:    uint8(params.Protocol),
+		HopLimit:      params.TTL,
+		TrafficClass:  params.TOS,
+		SrcAddr:       r.LocalAddress,
+		DstAddr:       r.RemoteAddress,
+	})
+	return ip // The freshly prepended and encoded fixed IPv6 header.
+}
+
+// WritePacket writes a packet to the given destination address and protocol.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
+	ip := e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params)
+	pkt.NetworkHeader = buffer.View(ip)
+
+	if r.Loop&stack.PacketLoop != 0 {
+		// HandlePacket reads the IPv6 header from NetworkHeader, so the looped
+		// copy (header followed by payload in Data) must go through Parse first.
+		views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
+		views[0] = pkt.Header.View()
+		views = append(views, pkt.Data.Views()...)
+		loopedPkt := stack.PacketBuffer{Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views)}
+
+		loopedR := r.MakeLoopedRoute()
+		if _, _, ok := e.protocol.Parse(&loopedPkt); ok {
+			e.HandlePacket(&loopedR, &loopedPkt)
+		}
+		loopedR.Release()
+	}
+	if r.Loop&stack.PacketOut == 0 {
+		return nil
+	}
+
+	r.Stats().IP.PacketsSent.Increment()
+	return e.linkEP.WritePacket(r, gso, ProtocolNumber, pkt)
+}
+
+// WritePackets implements stack.NetworkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+	if r.Loop&stack.PacketLoop != 0 {
+		panic("not implemented")
+	}
+	if r.Loop&stack.PacketOut == 0 {
+		return pkts.Len(), nil
+	}
+
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		// Every packet in the list gets its own IPv6 header.
+		pkt.NetworkHeader = buffer.View(e.addIPHeader(r, &pkt.Header, pkt.Data.Size(), params))
+	}
+
+	sent, err := e.linkEP.WritePackets(r, gso, pkts, ProtocolNumber)
+	r.Stats().IP.PacketsSent.IncrementBy(uint64(sent))
+	return sent, err
+}
+
+// WriteHeaderIncludedPacket implements stack.NetworkEndpoint. It is not yet
+// supported by IPv6 and always returns tcpip.ErrNotSupported.
+func (*endpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error {
+	// TODO(b/146666412): Support IPv6 header-included packets.
+	return tcpip.ErrNotSupported
+}
+
+// HandlePacket is called by the link layer when new ipv6 packets arrive for
+// this endpoint.
+func (e *endpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) {
+	h := header.IPv6(pkt.NetworkHeader)
+	if !h.IsValid(pkt.Data.Size() + len(pkt.NetworkHeader) + len(pkt.TransportHeader)) {
+		r.Stats().IP.MalformedPacketsReceived.Increment()
+		return
+	}
+
+	// vv consists of:
+	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
+	// - The transport header, if present.
+	// - Any other payload data.
+	vv := pkt.NetworkHeader[header.IPv6MinimumSize:].ToVectorisedView()
+	vv.AppendView(pkt.TransportHeader)
+	vv.Append(pkt.Data)
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(h.NextHeader()), vv)
+	hasFragmentHeader := false
+
+	for firstHeader := true; ; firstHeader = false {
+		extHdr, done, err := it.Next()
+		if err != nil {
+			r.Stats().IP.MalformedPacketsReceived.Increment()
+			return
+		}
+		if done {
+			break
+		}
+
+		switch extHdr := extHdr.(type) {
+		case header.IPv6HopByHopOptionsExtHdr:
+			// As per RFC 8200 section 4.1, the Hop By Hop extension header is
+			// restricted to appear immediately after an IPv6 fixed header.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1
+			// (unrecognized next header) error in response to an extension header's
+			// Next Header field with the Hop By Hop extension header identifier.
+			if !firstHeader {
+				return
+			}
+
+			optsIt := extHdr.Iter()
+
+			for {
+				opt, done, err := optsIt.Next()
+				if err != nil {
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					return
+				}
+				if done {
+					break
+				}
+
+				// We currently do not support any IPv6 Hop By Hop extension header
+				// options.
+				switch opt.UnknownAction() {
+				case header.IPv6OptionUnknownActionSkip:
+				case header.IPv6OptionUnknownActionDiscard:
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				default:
+					panic(fmt.Sprintf("unrecognized action for an unrecognized Hop By Hop extension header option = %d", opt))
+				}
+			}
+
+		case header.IPv6RoutingExtHdr:
+			// As per RFC 8200 section 4.4, if a node encounters a routing header with
+			// an unrecognized routing type value, with a non-zero Segments Left
+			// value, the node must discard the packet and send an ICMP Parameter
+			// Problem, Code 0. If the Segments Left is 0, the node must ignore the
+			// Routing extension header and process the next header in the packet.
+			//
+			// Note, the stack does not yet handle any type of routing extension
+			// header, so we just make sure Segments Left is zero before processing
+			// the next extension header.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 0 for
+			// unrecognized routing types with a non-zero Segments Left value.
+			if extHdr.SegmentsLeft() != 0 {
+				return
+			}
+
+		case header.IPv6FragmentExtHdr:
+			hasFragmentHeader = true
+
+			if extHdr.IsAtomic() {
+				// This fragment extension header indicates that this packet is an
+				// atomic fragment. An atomic fragment is a fragment that contains
+				// all the data required to reassemble a full packet. As per RFC 6946,
+				// atomic fragments must not interfere with "normal" fragmented traffic
+				// so we skip processing the fragment instead of feeding it through the
+				// reassembly process below.
+				continue
+			}
+
+			// Don't consume the iterator if we have the first fragment because we
+			// will use it to validate that the first fragment holds the upper layer
+			// header.
+			rawPayload := it.AsRawHeader(extHdr.FragmentOffset() != 0 /* consume */)
+
+			if extHdr.FragmentOffset() == 0 {
+				// Check that the iterator ends with a raw payload as the first fragment
+				// should include all headers up to and including any upper layer
+				// headers, as per RFC 8200 section 4.5; only upper layer data
+				// (non-headers) should follow the fragment extension header.
+				var lastHdr header.IPv6PayloadHeader
+
+				for {
+					hdr, done, err := it.Next()
+					if err != nil {
+						r.Stats().IP.MalformedPacketsReceived.Increment()
+						r.Stats().IP.MalformedFragmentsReceived.Increment()
+						return
+					}
+					if done {
+						break
+					}
+
+					lastHdr = hdr
+				}
+
+				// If the last header is a raw header, then the last portion of the IPv6
+				// payload is not a known IPv6 extension header. Note, this does not
+				// mean that the last portion is an upper layer header or not an
+				// extension header because:
+				//  1) we do not yet support all extension headers
+				//  2) we do not validate the upper layer header before reassembling.
+				//
+				// This check makes sure that the initial fragment does not end with a
+				// known IPv6 extension header; the last header seen must be a raw
+				// payload.
+				//
+				// TODO(#2196): Support IPv6 Authentication and Encapsulated
+				// Security Payload extension headers.
+				// TODO(#2333): Validate that the upper layer header is valid.
+				switch lastHdr.(type) {
+				case header.IPv6RawPayloadHeader:
+				default:
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					r.Stats().IP.MalformedFragmentsReceived.Increment()
+					return
+				}
+			}
+
+			fragmentPayloadLen := rawPayload.Buf.Size()
+			if fragmentPayloadLen == 0 {
+				// Drop the packet as it's marked as a fragment but has no payload.
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			// The packet is a fragment, let's try to reassemble it.
+			start := extHdr.FragmentOffset() * header.IPv6FragmentExtHdrFragmentOffsetBytesPerUnit
+			last := start + uint16(fragmentPayloadLen) - 1
+
+			// Drop the packet if the fragmentOffset is incorrect. i.e the
+			// combination of fragmentOffset and the fragment payload length
+			// causes a wrap around resulting in last being less than the offset.
+			if last < start {
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			var ready bool
+			// Note that pkt doesn't have its transport header set after reassembly,
+			// and won't until DeliverNetworkPacket sets it.
+			pkt.Data, ready, err = e.fragmentation.Process(hash.IPv6FragmentHash(h, extHdr.ID()), start, last, extHdr.More(), rawPayload.Buf)
+			if err != nil {
+				r.Stats().IP.MalformedPacketsReceived.Increment()
+				r.Stats().IP.MalformedFragmentsReceived.Increment()
+				return
+			}
+
+			if ready {
+				// We create a new iterator with the reassembled packet because we could
+				// have more extension headers in the reassembled payload, as per RFC
+				// 8200 section 4.5.
+				it = header.MakeIPv6PayloadIterator(rawPayload.Identifier, pkt.Data)
+			}
+
+		case header.IPv6DestinationOptionsExtHdr:
+			optsIt := extHdr.Iter()
+
+			for {
+				opt, done, err := optsIt.Next()
+				if err != nil {
+					r.Stats().IP.MalformedPacketsReceived.Increment()
+					return
+				}
+				if done {
+					break
+				}
+
+				// We currently do not support any IPv6 Destination extension header
+				// options.
+				switch opt.UnknownAction() {
+				case header.IPv6OptionUnknownActionSkip:
+				case header.IPv6OptionUnknownActionDiscard:
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMP:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				case header.IPv6OptionUnknownActionDiscardSendICMPNoMulticastDest:
+					// TODO(b/152019344): Send an ICMPv6 Parameter Problem Code 2 for
+					// unrecognized IPv6 extension header options.
+					return
+				default:
+					panic(fmt.Sprintf("unrecognized action for an unrecognized Destination extension header option = %d", opt))
+				}
+			}
+
+		case header.IPv6RawPayloadHeader:
+			// If the last header in the payload isn't a known IPv6 extension header,
+			// handle it as if it is transport layer data.
+
+			// For unfragmented packets, extHdr still contains the transport header.
+			// Get rid of it.
+			//
+			// For reassembled fragments, pkt.TransportHeader is unset, so this is a
+			// no-op and pkt.Data begins with the transport header.
+			extHdr.Buf.TrimFront(len(pkt.TransportHeader))
+			pkt.Data = extHdr.Buf
+
+			if p := tcpip.TransportProtocolNumber(extHdr.Identifier); p == header.ICMPv6ProtocolNumber {
+				e.handleICMP(r, pkt, hasFragmentHeader)
+			} else {
+				r.Stats().IP.PacketsDelivered.Increment()
+				// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
+				// in response to unrecognized next header values.
+				e.dispatcher.DeliverTransportPacket(r, p, pkt)
+			}
+
+		default:
+			// If we receive a packet for an extension header we do not yet handle,
+			// drop the packet for now.
+			//
+			// TODO(b/152019344): Send an ICMPv6 Parameter Problem, Code 1 error
+			// in response to unrecognized next header values.
+			r.Stats().UnknownProtocolRcvdPackets.Increment()
+			return
+		}
+	}
+}
+
+// Close is currently a no-op for IPv6 endpoints.
+func (*endpoint) Close() {}
+
+// NetworkProtocolNumber implements stack.NetworkEndpoint.NetworkProtocolNumber.
+func (e *endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return e.protocol.Number() // Always ProtocolNumber; see protocol.Number.
+}
+
+type protocol struct {
+	// defaultTTL is the current default TTL for the protocol. Only the
+	// uint8 portion of it is meaningful and it must be accessed
+	// atomically (see SetDefaultTTL and DefaultTTL).
+	defaultTTL uint32
+}
+
+// Number returns the ipv6 protocol number (ProtocolNumber).
+func (p *protocol) Number() tcpip.NetworkProtocolNumber {
+	return ProtocolNumber
+}
+
+// MinimumPacketSize returns the minimum valid ipv6 packet size: the fixed header size.
+func (p *protocol) MinimumPacketSize() int {
+	return header.IPv6MinimumSize
+}
+
+// DefaultPrefixLen returns the IPv6 default prefix length: the address size in bits.
+func (p *protocol) DefaultPrefixLen() int {
+	return header.IPv6AddressSize * 8
+}
+
+// ParseAddresses implements NetworkProtocol.ParseAddresses; v is read as an IPv6 header.
+func (*protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	h := header.IPv6(v)
+	return h.SourceAddress(), h.DestinationAddress()
+}
+
+// NewEndpoint creates a new ipv6 endpoint; st is unused and the error is always nil.
+func (p *protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, linkEP stack.LinkEndpoint, st *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) {
+	return &endpoint{
+		nicID:         nicID,
+		id:            stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen:     addrWithPrefix.PrefixLen,
+		linkEP:        linkEP,
+		linkAddrCache: linkAddrCache,
+		dispatcher:    dispatcher,
+		fragmentation: fragmentation.NewFragmentation(fragmentation.HighFragThreshold, fragmentation.LowFragThreshold, fragmentation.DefaultReassembleTimeout),
+		protocol:      p,
+	}, nil
+}
+
+// SetOption implements NetworkProtocol.SetOption.
+func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case tcpip.DefaultTTLOption:
+		p.SetDefaultTTL(uint8(v))
+		return nil
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// Option implements NetworkProtocol.Option.
+func (p *protocol) Option(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case *tcpip.DefaultTTLOption:
+		*v = tcpip.DefaultTTLOption(p.DefaultTTL())
+		return nil
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// SetDefaultTTL atomically updates the default TTL reported by this protocol.
+func (p *protocol) SetDefaultTTL(ttl uint8) {
+	atomic.StoreUint32(&p.defaultTTL, uint32(ttl))
+}
+
+// DefaultTTL atomically reads the default TTL currently set on this protocol.
+func (p *protocol) DefaultTTL() uint8 {
+	return uint8(atomic.LoadUint32(&p.defaultTTL))
+}
+
+// Close implements stack.NetworkProtocol.Close. It is a no-op.
+func (*protocol) Close() {}
+
+// Wait implements stack.NetworkProtocol.Wait. It is a no-op.
+func (*protocol) Wait() {}
+
+// Parse implements stack.NetworkProtocol.Parse.
+func (*protocol) Parse(pkt *stack.PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool) {
+	hdr, ok := pkt.Data.PullUp(header.IPv6MinimumSize)
+	if !ok {
+		return 0, false, false
+	}
+	ipHdr := header.IPv6(hdr)
+
+	// dataClone consists of:
+	// - Any IPv6 header bytes after the first 40 (i.e. extensions).
+	// - The transport header, if present.
+	// - Any other payload data.
+	views := [8]buffer.View{}
+	dataClone := pkt.Data.Clone(views[:])
+	dataClone.TrimFront(header.IPv6MinimumSize)
+	it := header.MakeIPv6PayloadIterator(header.IPv6ExtensionHeaderIdentifier(ipHdr.NextHeader()), dataClone)
+
+	// Iterate over the IPv6 extensions to find their length.
+	//
+	// Parsing occurs again in HandlePacket because we don't track the
+	// extensions in PacketBuffer. Unfortunately, that means HandlePacket
+	// has to do the parsing work again.
+	var nextHdr tcpip.TransportProtocolNumber
+	foundNext := true
+	extensionsSize := 0
+traverseExtensions:
+	for extHdr, done, err := it.Next(); ; extHdr, done, err = it.Next() {
+		if err != nil {
+			break // extensionsSize stays 0; HandlePacket iterates again and reports the error.
+		}
+		// If we exhaust the extension list, the entire packet is the IPv6 header
+		// and (possibly) extensions.
+		if done {
+			extensionsSize = dataClone.Size()
+			foundNext = false
+			break
+		}
+
+		switch extHdr := extHdr.(type) {
+		case header.IPv6FragmentExtHdr:
+			// If this is an atomic fragment, we don't have to treat it specially.
+			if !extHdr.More() && extHdr.FragmentOffset() == 0 {
+				continue
+			}
+			// This is a non-atomic fragment and has to be re-assembled before we can
+			// examine the payload for a transport header.
+			foundNext = false
+
+		case header.IPv6RawPayloadHeader:
+			// We've found the payload after any extensions.
+			extensionsSize = dataClone.Size() - extHdr.Buf.Size()
+			nextHdr = tcpip.TransportProtocolNumber(extHdr.Identifier)
+			break traverseExtensions
+
+		default:
+			// Any other extension is a no-op, keep looping until we find the payload.
+		}
+	}
+
+	// Put the IPv6 header with extensions in pkt.NetworkHeader.
+	hdr, ok = pkt.Data.PullUp(header.IPv6MinimumSize + extensionsSize)
+	if !ok {
+		panic(fmt.Sprintf("pkt.Data should have at least %d bytes, but only has %d.", header.IPv6MinimumSize+extensionsSize, pkt.Data.Size()))
+	}
+	ipHdr = header.IPv6(hdr)
+
+	pkt.NetworkHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	pkt.Data.CapLength(int(ipHdr.PayloadLength()))
+
+	return nextHdr, foundNext, true
+}
+
+// calculateMTU derives the network-layer payload MTU from the link-layer
+// payload mtu: the IPv6 header size is subtracted, capped at maxPayloadSize.
+func calculateMTU(mtu uint32) uint32 {
+	payloadMTU := mtu - header.IPv6MinimumSize
+	if payloadMTU > maxPayloadSize {
+		return maxPayloadSize
+	}
+	return payloadMTU
+}
+
+// NewProtocol returns an IPv6 network protocol whose default TTL starts at DefaultTTL.
+func NewProtocol() stack.NetworkProtocol {
+	return &protocol{defaultTTL: DefaultTTL}
+}
diff --git a/pkg/tcpip/network/ipv6/ipv6_test.go b/pkg/tcpip/network/ipv6/ipv6_test.go
new file mode 100644
index 000000000..213ff64f2
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/ipv6_test.go
@@ -0,0 +1,1265 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv6
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	addr1 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	addr2 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	// The least significant 3 bytes are the same as addr2 so both addr2 and
+	// addr3 will have the same solicited-node address.
+	addr3 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x02"
+	addr4 = "\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x03"
+
+	// Tests use the extension header identifier values as uint8 instead of
+	// header.IPv6ExtensionHeaderIdentifier.
+	hopByHopExtHdrID    = uint8(header.IPv6HopByHopOptionsExtHdrIdentifier)
+	routingExtHdrID     = uint8(header.IPv6RoutingExtHdrIdentifier)
+	fragmentExtHdrID    = uint8(header.IPv6FragmentExtHdrIdentifier)
+	destinationExtHdrID = uint8(header.IPv6DestinationOptionsExtHdrIdentifier)
+	noNextHdrID         = uint8(header.IPv6NoNextHeaderIdentifier)
+)
+
+// testReceiveICMP injects an ICMPv6 Neighbor Advertisement from src to dst and
+// checks that the Neighbor Advertisement received count then equals want.
+func testReceiveICMP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64) {
+	t.Helper()
+
+	// Build the packet back to front: ICMPv6 message first, then IPv6 header.
+	buf := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborAdvertSize)
+	icmpHdr := header.ICMPv6(buf.Prepend(header.ICMPv6NeighborAdvertSize))
+	icmpHdr.SetType(header.ICMPv6NeighborAdvert)
+	icmpHdr.SetChecksum(header.ICMPv6Checksum(icmpHdr, src, dst, buffer.VectorisedView{}))
+	icmpLen := buf.UsedLength()
+	ipHdr := header.IPv6(buf.Prepend(header.IPv6MinimumSize))
+	ipHdr.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(icmpLen),
+		NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+		HopLimit:      255,
+		SrcAddr:       src,
+		DstAddr:       dst,
+	})
+
+	e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.View().ToVectorisedView(),
+	})
+
+	received := s.Stats().ICMP.V6PacketsReceived.NeighborAdvert
+
+	if got := received.Value(); got != want {
+		t.Fatalf("got NeighborAdvert = %d, want = %d", got, want)
+	}
+}
+
+// testReceiveUDP binds a UDP endpoint to dst:80, injects a UDP packet from src
+// to dst and checks that the UDP received count then equals want.
+func testReceiveUDP(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64) {
+	t.Helper()
+
+	queue := waiter.Queue{}
+	entry, notifyCh := waiter.NewChannelEntry(nil)
+	queue.EventRegister(&entry, waiter.EventIn)
+	defer queue.EventUnregister(&entry)
+	defer close(notifyCh)
+
+	ep, err := s.NewEndpoint(udp.ProtocolNumber, ProtocolNumber, &queue)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+	defer ep.Close()
+
+	if err := ep.Bind(tcpip.FullAddress{Addr: dst, Port: 80}); err != nil {
+		t.Fatalf("ep.Bind(...) failed: %v", err)
+	}
+
+	// Build the packet back to front: UDP header first, then IPv6 header.
+	buf := buffer.NewPrependable(header.IPv6MinimumSize + header.UDPMinimumSize)
+	udpHdr := header.UDP(buf.Prepend(header.UDPMinimumSize))
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: 5555,
+		DstPort: 80,
+		Length:  header.UDPMinimumSize,
+	})
+
+	// Start from the pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, src, dst, header.UDPMinimumSize)
+
+	// Fold in the (empty) payload, then the UDP header itself.
+	xsum = header.Checksum(header.UDP([]byte{}), xsum)
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(xsum))
+
+	udpLen := buf.UsedLength()
+	ipHdr := header.IPv6(buf.Prepend(header.IPv6MinimumSize))
+	ipHdr.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(udpLen),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      255,
+		SrcAddr:       src,
+		DstAddr:       dst,
+	})
+
+	e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.View().ToVectorisedView(),
+	})
+
+	received := s.Stats().UDP.PacketsReceived
+
+	if got := received.Value(); got != want {
+		t.Fatalf("got UDPPacketsReceived = %d, want = %d", got, want)
+	}
+}
+
+// TestReceiveOnAllNodesMulticastAddr checks that ICMP and UDP packets sent to
+// the IPv6 link-local all-nodes multicast address are received on a new NIC.
+func TestReceiveOnAllNodesMulticastAddr(t *testing.T) {
+	cases := []struct {
+		name            string
+		protocolFactory stack.TransportProtocol
+		rxf             func(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64)
+	}{
+		{"ICMP", icmp.NewProtocol6(), testReceiveICMP},
+		{"UDP", udp.NewProtocol(), testReceiveUDP},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			stk := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{tc.protocolFactory},
+			})
+			linkEP := channel.New(10, 1280, linkAddr1)
+			if err := stk.CreateNIC(1, linkEP); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			// No address has been added, yet a packet for the all-nodes
+			// multicast address is expected to be counted as received.
+			tc.rxf(t, stk, linkEP, addr1, header.IPv6AllNodesMulticastAddress, 1)
+		})
+	}
+}
+
+// TestReceiveOnSolicitedNodeAddr tests that IPv6 endpoints receive ICMP and UDP
+// packets destined to the IPv6 solicited-node address of an assigned IPv6
+// address.
+func TestReceiveOnSolicitedNodeAddr(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name            string
+		protocolFactory stack.TransportProtocol
+		rxf             func(t *testing.T, s *stack.Stack, e *channel.Endpoint, src, dst tcpip.Address, want uint64)
+	}{
+		{"ICMP", icmp.NewProtocol6(), testReceiveICMP},
+		{"UDP", udp.NewProtocol(), testReceiveUDP},
+	}
+
+	snmc := header.SolicitedNodeAddr(addr2)
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{test.protocolFactory},
+			})
+			e := channel.New(1, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			s.SetRouteTable([]tcpip.Route{
+				tcpip.Route{
+					Destination: header.IPv6EmptySubnet,
+					NIC:         nicID,
+				},
+			})
+
+			// Should not receive a packet destined to the solicited node address of
+			// addr2/addr3 yet as we haven't added those addresses.
+			test.rxf(t, s, e, addr1, snmc, 0)
+
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+
+			// Should receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have added addr2.
+			test.rxf(t, s, e, addr1, snmc, 1)
+
+			if err := s.AddAddress(nicID, ProtocolNumber, addr3); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr3, err)
+			}
+
+			// Should still receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have added addr3.
+			test.rxf(t, s, e, addr1, snmc, 2)
+
+			if err := s.RemoveAddress(nicID, addr2); err != nil {
+				t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr2, err)
+			}
+
+			// Should still receive a packet destined to the solicited node address of
+			// addr2/addr3 now that we have removed addr2, as addr3 is still assigned.
+			test.rxf(t, s, e, addr1, snmc, 3)
+
+			// Make sure addr3's endpoint does not get removed from the NIC by
+			// incrementing its reference count with a route.
+			r, err := s.FindRoute(nicID, addr3, addr4, ProtocolNumber, false)
+			if err != nil {
+				t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr3, addr4, ProtocolNumber, err)
+			}
+			defer r.Release()
+
+			if err := s.RemoveAddress(nicID, addr3); err != nil {
+				t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr3, err)
+			}
+
+			// Should no longer receive a packet destined to the solicited node
+			// address of addr2/addr3 as both of them got removed, even though a
+			// route using addr3 exists.
+			test.rxf(t, s, e, addr1, snmc, 3)
+		})
+	}
+}
+
+// TestAddIpv6Address tests that an added IPv6 address becomes the NIC's main address.
+func TestAddIpv6Address(t *testing.T) {
+	tests := []struct {
+		name string
+		addr tcpip.Address
+	}{
+		// This test is in response to b/140943433.
+		{
+			"Nil",
+			tcpip.Address([]byte(nil)),
+		},
+		{
+			"ValidUnicast",
+			addr1,
+		},
+		{
+			"ValidLinkLocalUnicast",
+			lladdr0,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			if err := s.AddAddress(1, ProtocolNumber, test.addr); err != nil {
+				t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, test.addr, err)
+			}
+
+			addr, err := s.GetMainNICAddress(1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+			}
+			if addr.Address != test.addr {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", addr.Address, test.addr)
+			}
+		})
+	}
+}
+
+func TestReceiveIPv6ExtHdrs(t *testing.T) {
+	const nicID = 1
+	// Each case injects one UDP datagram preceded by the case's extension headers and checks whether UDP receives it.
+	tests := []struct {
+		name         string
+		extHdr       func(nextHdr uint8) ([]byte, uint8) // Returns the extension header bytes and the IPv6 fixed header's Next Header value.
+		shouldAccept bool
+	}{
+		{
+			name:         "None",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, nextHdr },
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop with unknown option skippable action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Skippable unknown.
+					62, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop with unknown option discard action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard unknown.
+					127, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop with unknown option discard and send icmp action unless multicast dest",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name:         "routing with zero segments left",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 2, 3, 4, 5}, routingExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "routing with non-zero segments left",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 1, 2, 3, 4, 5}, routingExtHdrID },
+			shouldAccept: false,
+		},
+		{
+			name:         "atomic fragment with zero ID",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 0, 0, 0, 0}, fragmentExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "atomic fragment with non-zero ID",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 0, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			shouldAccept: true,
+		},
+		{
+			name:         "fragment",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{nextHdr, 0, 1, 0, 1, 2, 3, 4}, fragmentExtHdrID },
+			shouldAccept: false,
+		},
+		{
+			name:         "No next header",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{}, noNextHdrID },
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option skippable action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Skippable unknown.
+					62, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "destination with unknown option discard action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard unknown.
+					127, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option discard and send icmp action",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP if option is unknown.
+					191, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "destination with unknown option discard and send icmp action unless multicast dest",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					nextHdr, 1,
+
+					// Skippable unknown.
+					63, 4, 1, 2, 3, 4,
+
+					// Discard & send ICMP unless packet is for multicast destination if
+					// option is unknown.
+					255, 6, 1, 2, 3, 4, 5, 6,
+				}, destinationExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "routing - atomic fragment",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					nextHdr, 0, 0, 0, 1, 2, 3, 4,
+				}, routingExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "atomic fragment - routing",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Fragment extension header.
+					routingExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Routing extension header.
+					nextHdr, 0, 1, 0, 2, 3, 4, 5,
+				}, fragmentExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hop by hop (with skippable unknown) - routing",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					nextHdr, 0, 1, 0, 2, 3, 4, 5,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "routing - hop by hop (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Routing extension header.
+					hopByHopExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Hop By Hop extension header with skippable unknown option.
+					nextHdr, 0, 62, 4, 1, 2, 3, 4,
+				}, routingExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name:         "routing with zero segments left - No next header",
+			extHdr:       func(nextHdr uint8) ([]byte, uint8) { return []byte{noNextHdrID, 0, 1, 0, 2, 3, 4, 5}, routingExtHdrID },
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with skippable unknown option.
+					nextHdr, 0, 63, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: true,
+		},
+		{
+			name: "hopbyhop (with discard unknown) - routing - atomic fragment - destination (with skippable unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with discard action for unknown option.
+					routingExtHdrID, 0, 65, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with skippable unknown option.
+					nextHdr, 0, 63, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+		{
+			name: "hopbyhop (with skippable unknown) - routing - atomic fragment - destination (with discard unknown)",
+			extHdr: func(nextHdr uint8) ([]byte, uint8) {
+				return []byte{
+					// Hop By Hop extension header with skippable unknown option.
+					routingExtHdrID, 0, 62, 4, 1, 2, 3, 4,
+
+					// Routing extension header.
+					fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5,
+
+					// Fragment extension header.
+					destinationExtHdrID, 0, 0, 0, 1, 2, 3, 4,
+
+					// Destination extension header with discard action for unknown
+					// option.
+					nextHdr, 0, 65, 4, 1, 2, 3, 4,
+				}, hopByHopExtHdrID
+			},
+			shouldAccept: false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			udpPayload := []byte{1, 2, 3, 4, 5, 6, 7, 8}
+			udpLength := header.UDPMinimumSize + len(udpPayload)
+			extHdrBytes, ipv6NextHdr := test.extHdr(uint8(header.UDPProtocolNumber))
+			extHdrLen := len(extHdrBytes)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + extHdrLen + udpLength)
+
+			// Serialize UDP message.
+			u := header.UDP(hdr.Prepend(udpLength))
+			u.Encode(&header.UDPFields{
+				SrcPort: 5555,
+				DstPort: 80,
+				Length:  uint16(udpLength),
+			})
+			copy(u.Payload(), udpPayload)
+			sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+			sum = header.Checksum(udpPayload, sum)
+			u.SetChecksum(^u.CalculateChecksum(sum))
+
+			// Copy extension header bytes between the UDP message and the IPv6
+			// fixed header.
+			copy(hdr.Prepend(extHdrLen), extHdrBytes)
+
+			// Serialize IPv6 fixed header.
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    ipv6NextHdr,
+				HopLimit:      255,
+				SrcAddr:       addr1,
+				DstAddr:       addr2,
+			})
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			stats := s.Stats().UDP.PacketsReceived
+
+			if !test.shouldAccept {
+				if got := stats.Value(); got != 0 {
+					t.Errorf("got UDP Rx Packets = %d, want = 0", got)
+				}
+
+				return
+			}
+
+			// Expect a UDP packet.
+			if got := stats.Value(); got != 1 {
+				t.Errorf("got UDP Rx Packets = %d, want = 1", got)
+			}
+			gotPayload, _, err := ep.Read(nil)
+			if err != nil {
+				t.Fatalf("Read(nil): %s", err)
+			}
+			if diff := cmp.Diff(buffer.View(udpPayload), gotPayload); diff != "" {
+				t.Errorf("got UDP payload mismatch (-want +got):\n%s", diff)
+			}
+
+			// Should not have any more UDP packets.
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
+
+// fragmentData holds what follows the IPv6 fixed header of one injected packet, plus that header's Next Header value.
+type fragmentData struct {
+	nextHdr uint8                 // Next Header value written into the IPv6 fixed header.
+	data    buffer.VectorisedView // Bytes appended after the fixed header; their size becomes the payload length.
+}
+
+func TestReceiveIPv6Fragments(t *testing.T) {
+	const nicID = 1
+	const udpPayload1Length = 256
+	const udpPayload2Length = 128
+	const fragmentExtHdrLen = 8
+	// Note, not all routing extension headers will be 8 bytes but this test
+	// uses 8 byte routing extension headers for most sub tests.
+	const routingExtHdrLen = 8
+	// udpGen fills payload with the pattern uint8(i)*multiplier and returns it wrapped in a checksummed UDP datagram.
+	udpGen := func(payload []byte, multiplier uint8) buffer.View {
+		payloadLen := len(payload)
+		for i := 0; i < payloadLen; i++ {
+			payload[i] = uint8(i) * multiplier
+		}
+
+		udpLength := header.UDPMinimumSize + payloadLen
+
+		hdr := buffer.NewPrependable(udpLength)
+		u := header.UDP(hdr.Prepend(udpLength))
+		u.Encode(&header.UDPFields{
+			SrcPort: 5555,
+			DstPort: 80,
+			Length:  uint16(udpLength),
+		})
+		copy(u.Payload(), payload)
+		sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, addr1, addr2, uint16(udpLength))
+		sum = header.Checksum(payload, sum)
+		u.SetChecksum(^u.CalculateChecksum(sum))
+		return hdr.View()
+	}
+
+	var udpPayload1Buf [udpPayload1Length]byte
+	udpPayload1 := udpPayload1Buf[:]
+	ipv6Payload1 := udpGen(udpPayload1, 1)
+
+	var udpPayload2Buf [udpPayload2Length]byte
+	udpPayload2 := udpPayload2Buf[:]
+	ipv6Payload2 := udpGen(udpPayload2, 2)
+
+	tests := []struct {
+		// Every case injects fragments in order and then expects to read exactly expectedPayloads, in order.
+		name             string
+		fragments        []fragmentData
+		expectedPayloads [][]byte
+	}{
+		{
+			name: "No fragmentation",
+			fragments: []fragmentData{
+				{
+					nextHdr: uint8(header.UDPProtocolNumber),
+					data:    ipv6Payload1.ToVectorisedView(),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Atomic fragment",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 0}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with different IDs",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 2}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with per-fragment routing header with zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 0, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with per-fragment routing header with non-zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 1, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: routingExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{fragmentExtHdrID, 0, 1, 1, 2, 3, 4, 5}),
+
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header.
+							//
+							// Segments left = 0.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 0, 2, 3, 4, 5}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1},
+		},
+		{
+			name: "Two fragments with routing header with non-zero segments left",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						routingExtHdrLen+fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header.
+							//
+							// Segments left = 1.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 1, 1, 2, 3, 4, 5}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 9, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 72, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with zero segments left across fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is fragmentExtHdrLen+8 because the
+						// first 8 bytes of the 16 byte routing extension header are in
+						// this fragment.
+						fragmentExtHdrLen+8,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header (part 1)
+							//
+							// Segments left = 0.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 1, 1, 0, 2, 3, 4, 5}),
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is
+						// fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of
+						// the 16 byte routing extension header are in this fragment.
+						fragmentExtHdrLen+8+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 1, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 8, 0, 0, 0, 1}),
+
+							// Routing extension header (part 2)
+							buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		{
+			name: "Two fragments with routing header with non-zero segments left across fragments",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is fragmentExtHdrLen+8 because the
+						// first 8 bytes of the 16 byte routing extension header are in
+						// this fragment.
+						fragmentExtHdrLen+8,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 1, 0, 0, 0, 1}),
+
+							// Routing extension header (part 1)
+							//
+							// Segments left = 1.
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 1, 1, 1, 2, 3, 4, 5}),
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						// The length of this payload is
+						// fragmentExtHdrLen+8+len(ipv6Payload1) because the last 8 bytes of
+						// the 16 byte routing extension header are in this fragment.
+						fragmentExtHdrLen+8+len(ipv6Payload1),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 1, More = false, ID = 1
+							buffer.View([]byte{routingExtHdrID, 0, 0, 8, 0, 0, 0, 1}),
+
+							// Routing extension header (part 2)
+							buffer.View([]byte{6, 7, 8, 9, 10, 11, 12, 13}),
+
+							ipv6Payload1,
+						},
+					),
+				},
+			},
+			expectedPayloads: nil,
+		},
+		// As per RFC 6946, IPv6 atomic fragments MUST NOT interfere with "normal"
+		// fragmented traffic.
+		{
+			name: "Two fragments with atomic",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				// This fragment has the same ID as the other fragments but is an atomic
+				// fragment. It should not interfere with the other fragments.
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload2),
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 0, 0, 0, 0, 1}),
+
+							ipv6Payload2,
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload2, udpPayload1},
+		},
+		{
+			name: "Two interleaved fragmented packets",
+			fragments: []fragmentData{
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 1}),
+
+							ipv6Payload1[:64],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+32,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 0, More = true, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 1, 0, 0, 0, 2}),
+
+							ipv6Payload2[:32],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload1)-64,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 8, More = false, ID = 1
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 64, 0, 0, 0, 1}),
+
+							ipv6Payload1[64:],
+						},
+					),
+				},
+				{
+					nextHdr: fragmentExtHdrID,
+					data: buffer.NewVectorisedView(
+						fragmentExtHdrLen+len(ipv6Payload2)-32,
+						[]buffer.View{
+							// Fragment extension header.
+							//
+							// Fragment offset = 4, More = false, ID = 2
+							buffer.View([]byte{uint8(header.UDPProtocolNumber), 0, 0, 32, 0, 0, 0, 2}),
+
+							ipv6Payload2[32:],
+						},
+					),
+				},
+			},
+			expectedPayloads: [][]byte{udpPayload1, udpPayload2},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, addr2, err)
+			}
+
+			wq := waiter.Queue{}
+			we, ch := waiter.NewChannelEntry(nil)
+			wq.EventRegister(&we, waiter.EventIn)
+			defer wq.EventUnregister(&we)
+			defer close(ch)
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatalf("NewEndpoint(%d, %d, _): %s", udp.ProtocolNumber, ProtocolNumber, err)
+			}
+			defer ep.Close()
+
+			bindAddr := tcpip.FullAddress{Addr: addr2, Port: 80}
+			if err := ep.Bind(bindAddr); err != nil {
+				t.Fatalf("Bind(%+v): %s", bindAddr, err)
+			}
+
+			for _, f := range test.fragments {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize)
+
+				// Serialize IPv6 fixed header.
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(f.data.Size()),
+					NextHeader:    f.nextHdr,
+					HopLimit:      255,
+					SrcAddr:       addr1,
+					DstAddr:       addr2,
+				})
+
+				vv := hdr.View().ToVectorisedView()
+				vv.Append(f.data)
+
+				e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+					Data: vv,
+				})
+			}
+
+			if got, want := s.Stats().UDP.PacketsReceived.Value(), uint64(len(test.expectedPayloads)); got != want {
+				t.Errorf("got UDP Rx Packets = %d, want = %d", got, want)
+			}
+
+			for i, p := range test.expectedPayloads {
+				gotPayload, _, err := ep.Read(nil)
+				if err != nil {
+					t.Fatalf("(i=%d) Read(nil): %s", i, err)
+				}
+				if diff := cmp.Diff(buffer.View(p), gotPayload); diff != "" {
+					t.Errorf("(i=%d) got UDP payload mismatch (-want +got):\n%s", i, diff)
+				}
+			}
+
+			if gotPayload, _, err := ep.Read(nil); err != tcpip.ErrWouldBlock {
+				t.Fatalf("(last) got Read(nil) = (%x, _, %v), want = (_, _, %s)", gotPayload, err, tcpip.ErrWouldBlock)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/network/ipv6/ndp_test.go b/pkg/tcpip/network/ipv6/ndp_test.go
new file mode 100644
index 000000000..64239ce9a
--- /dev/null
+++ b/pkg/tcpip/network/ipv6/ndp_test.go
@@ -0,0 +1,907 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ipv6
+
+import (
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+)
+
+// setupStackAndEndpoint creates a stack with a single NIC (ID 1) that is
+// assigned the link-local address llladdr and whose only route is to rlladdr,
+// and returns the stack together with a new IPv6 endpoint created for rlladdr.
+func setupStackAndEndpoint(t *testing.T, llladdr, rlladdr tcpip.Address) (*stack.Stack, stack.NetworkEndpoint) {
+	t.Helper()
+
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{icmp.NewProtocol6()},
+	})
+
+	if err := s.CreateNIC(1, &stubLinkEndpoint{}); err != nil {
+		t.Fatalf("CreateNIC(_) = %s", err)
+	}
+	if err := s.AddAddress(1, ProtocolNumber, llladdr); err != nil {
+		t.Fatalf("AddAddress(_, %d, %s) = %s", ProtocolNumber, llladdr, err)
+	}
+
+	{ // The all-ones mask makes the routed subnet contain rlladdr only.
+		subnet, err := tcpip.NewSubnet(rlladdr, tcpip.AddressMask(strings.Repeat("\xff", len(rlladdr))))
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable(
+			[]tcpip.Route{{
+				Destination: subnet,
+				NIC:         1,
+			}},
+		)
+	}
+
+	netProto := s.NetworkProtocolInstance(ProtocolNumber)
+	if netProto == nil {
+		t.Fatalf("cannot find protocol instance for network protocol %d", ProtocolNumber)
+	}
+
+	ep, err := netProto.NewEndpoint(0, tcpip.AddressWithPrefix{rlladdr, netProto.DefaultPrefixLen()}, &stubLinkAddressCache{}, &stubDispatcher{}, nil, s)
+	if err != nil {
+		t.Fatalf("NewEndpoint(_) = _, %s, want = _, nil", err)
+	}
+
+	return s, ep
+}
+
+// TestNeighorSolicitationWithSourceLinkLayerOption tests that a received NDP
+// NS message adds the sender to the link address cache only when its options
+// are valid; otherwise the ICMPv6 invalid-packet counter must be incremented.
+func TestNeighorSolicitationWithSourceLinkLayerOption(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{1, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
+		{
+			name:    "Too Small",
+			optsBuf: []byte{1, 1, 2, 3, 4, 5, 6}, // One byte shorter than the "Valid" option.
+		},
+		{
+			name:    "Invalid Length",
+			optsBuf: []byte{1, 2, 2, 3, 4, 5, 6, 7}, // Second byte is 2 where "Valid" has 1.
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+			pkt.SetType(header.ICMPv6NeighborSolicit)
+			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr0)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+			if linkAddr != test.expectedLinkAddr {
+				t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr)
+			}
+
+			if test.expectedLinkAddr != "" {
+				if err != nil {
+					t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
+				}
+				if c != nil {
+					t.Errorf("got unexpected channel")
+				}
+
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+				}
+				if c == nil {
+					t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+				}
+
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+			}
+		})
+	}
+}
+
+func TestNeighorSolicitationResponse(t *testing.T) {
+	const nicID = 1
+	nicAddr := lladdr0
+	remoteAddr := lladdr1
+	nicAddrSNMC := header.SolicitedNodeAddr(nicAddr)
+	nicLinkAddr := linkAddr0
+	remoteLinkAddr0 := linkAddr1
+	remoteLinkAddr1 := linkAddr2
+
+	tests := []struct { // Each case describes an NS to inject and, unless nsInvalid, the NA expected back.
+		name          string                      // Subtest name.
+		nsOpts        header.NDPOptionsSerializer // NDP options serialized into the injected NS.
+		nsSrcLinkAddr tcpip.LinkAddress           // Link address the NS is injected as coming from.
+		nsSrc         tcpip.Address               // IPv6 source address of the NS.
+		nsDst         tcpip.Address               // IPv6 destination address of the NS.
+		nsInvalid     bool                        // Whether the NS must be counted as invalid and go unanswered.
+		naDstLinkAddr tcpip.LinkAddress           // Expected remote link address of the NA's route.
+		naSolicited   bool                        // Expected Solicited flag of the NA.
+		naSrc         tcpip.Address               // Expected IPv6 source address of the NA.
+		naDst         tcpip.Address               // Expected IPv6 destination address of the NA.
+	}{
+		{
+			name:          "Unspecified source to multicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   false,
+			naSrc:         nicAddr,
+			naDst:         header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			name: "Unspecified source with source ll option to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+		{
+			name:          "Unspecified source to unicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   false,
+			naSrc:         nicAddr,
+			naDst:         header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			name: "Unspecified source with source ll option to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         header.IPv6Any,
+			nsDst:         nicAddr,
+			nsInvalid:     true,
+		},
+
+		{
+			name: "Specified source with 1 source ll to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll different from route to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr1,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name:          "Specified source to multicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+		{
+			name: "Specified source with 2 source ll to multicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddrSNMC,
+			nsInvalid:     true,
+		},
+
+		{
+			name:          "Specified source to unicast destination",
+			nsOpts:        nil,
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr0,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 1 source ll different from route to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     false,
+			naDstLinkAddr: remoteLinkAddr1,
+			naSolicited:   true,
+			naSrc:         nicAddr,
+			naDst:         remoteAddr,
+		},
+		{
+			name: "Specified source with 2 source ll to unicast destination",
+			nsOpts: header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr0[:]),
+				header.NDPSourceLinkLayerAddressOption(remoteLinkAddr1[:]),
+			},
+			nsSrcLinkAddr: remoteLinkAddr0,
+			nsSrc:         remoteAddr,
+			nsDst:         nicAddr,
+			nsInvalid:     true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(1, 1280, nicLinkAddr)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, nicAddr); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, nicAddr, err)
+			}
+
+			ndpNSSize := header.ICMPv6NeighborSolicitMinimumSize + test.nsOpts.Length()
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNSSize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNSSize))
+			pkt.SetType(header.ICMPv6NeighborSolicit)
+			ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+			ns.SetTargetAddress(nicAddr)
+			opts := ns.Options()
+			opts.Serialize(test.nsOpts)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.nsSrc, test.nsDst, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       test.nsSrc,
+				DstAddr:       test.nsDst,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectLinkAddr(ProtocolNumber, test.nsSrcLinkAddr, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			if test.nsInvalid {
+				if got := invalid.Value(); got != 1 {
+					t.Fatalf("got invalid = %d, want = 1", got)
+				}
+
+				if p, got := e.Read(); got {
+					t.Fatalf("unexpected response to an invalid NS = %+v", p.Pkt)
+				}
+
+				// If we expected the NS to be invalid, we have nothing else to check.
+				return
+			}
+
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			p, got := e.Read()
+			if !got {
+				t.Fatal("expected an NDP NA response")
+			}
+
+			if p.Route.RemoteLinkAddress != test.naDstLinkAddr {
+				t.Errorf("got p.Route.RemoteLinkAddress = %s, want = %s", p.Route.RemoteLinkAddress, test.naDstLinkAddr)
+			}
+
+			checker.IPv6(t, p.Pkt.Header.View(),
+				checker.SrcAddr(test.naSrc),
+				checker.DstAddr(test.naDst),
+				checker.TTL(header.NDPHopLimit),
+				checker.NDPNA(
+					checker.NDPNASolicitedFlag(test.naSolicited),
+					checker.NDPNATargetAddress(nicAddr),
+					checker.NDPNAOptions([]header.NDPOption{
+						header.NDPTargetLinkLayerAddressOption(nicLinkAddr[:]),
+					}),
+				))
+		})
+	}
+}
+
+// TestNeighorAdvertisementWithTargetLinkLayerOption tests that a received NDP
+// NA message adds its target to the link address cache only when its options
+// are valid; otherwise the ICMPv6 invalid-packet counter must be incremented.
+func TestNeighorAdvertisementWithTargetLinkLayerOption(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name             string
+		optsBuf          []byte
+		expectedLinkAddr tcpip.LinkAddress
+	}{
+		{
+			name:             "Valid",
+			optsBuf:          []byte{2, 1, 2, 3, 4, 5, 6, 7},
+			expectedLinkAddr: "\x02\x03\x04\x05\x06\x07",
+		},
+		{
+			name:    "Too Small",
+			optsBuf: []byte{2, 1, 2, 3, 4, 5, 6}, // One byte shorter than the "Valid" option.
+		},
+		{
+			name:    "Invalid Length",
+			optsBuf: []byte{2, 2, 2, 3, 4, 5, 6, 7}, // Second byte is 2 where "Valid" has 1.
+		},
+		{
+			name: "Multiple", // Two 8-byte options of the same type in one NA.
+			optsBuf: []byte{
+				2, 1, 2, 3, 4, 5, 6, 7,
+				2, 1, 2, 3, 4, 5, 6, 8,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+			e := channel.New(0, 1280, linkAddr0)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			if err := s.AddAddress(nicID, ProtocolNumber, lladdr0); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, ProtocolNumber, lladdr0, err)
+			}
+
+			ndpNASize := header.ICMPv6NeighborAdvertMinimumSize + len(test.optsBuf)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + ndpNASize)
+			pkt := header.ICMPv6(hdr.Prepend(ndpNASize))
+			pkt.SetType(header.ICMPv6NeighborAdvert)
+			ns := header.NDPNeighborAdvert(pkt.NDPPayload())
+			ns.SetTargetAddress(lladdr1)
+			opts := ns.Options()
+			copy(opts, test.optsBuf)
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, lladdr1, lladdr0, buffer.VectorisedView{}))
+			payloadLength := hdr.UsedLength()
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(header.ICMPv6ProtocolNumber),
+				HopLimit:      255,
+				SrcAddr:       lladdr1,
+				DstAddr:       lladdr0,
+			})
+
+			invalid := s.Stats().ICMP.V6PacketsReceived.Invalid
+
+			// Invalid count should initially be 0.
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+
+			e.InjectInbound(ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			linkAddr, c, err := s.GetLinkAddress(nicID, lladdr1, lladdr0, ProtocolNumber, nil)
+			if linkAddr != test.expectedLinkAddr {
+				t.Errorf("got link address = %s, want = %s", linkAddr, test.expectedLinkAddr)
+			}
+
+			if test.expectedLinkAddr != "" {
+				if err != nil {
+					t.Errorf("s.GetLinkAddress(%d, %s, %s, %d, nil): %s", nicID, lladdr1, lladdr0, ProtocolNumber, err)
+				}
+				if c != nil {
+					t.Errorf("got unexpected channel")
+				}
+
+				// Invalid count should not have increased.
+				if got := invalid.Value(); got != 0 {
+					t.Errorf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				if err != tcpip.ErrWouldBlock {
+					t.Errorf("got s.GetLinkAddress(%d, %s, %s, %d, nil) = (_, _, %v), want = (_, _, %s)", nicID, lladdr1, lladdr0, ProtocolNumber, err, tcpip.ErrWouldBlock)
+				}
+				if c == nil {
+					t.Errorf("expected channel from call to s.GetLinkAddress(%d, %s, %s, %d, nil)", nicID, lladdr1, lladdr0, ProtocolNumber)
+				}
+
+				// Invalid count should have increased.
+				if got := invalid.Value(); got != 1 {
+					t.Errorf("got invalid = %d, want = 1", got)
+				}
+			}
+		})
+	}
+}
+
+// TestNDPValidation tests that each NDP message type is counted when received
+// and counted as invalid when fragmented or sent with a bad hop limit or code.
+func TestNDPValidation(t *testing.T) {
+	setup := func(t *testing.T) (*stack.Stack, stack.NetworkEndpoint, stack.Route) {
+		t.Helper()
+		// Create a stack assigned lladdr0 and an endpoint to lladdr1.
+		s, ep := setupStackAndEndpoint(t, lladdr0, lladdr1)
+		r, err := s.FindRoute(1, lladdr0, lladdr1, ProtocolNumber, false /* multicastLoop */)
+		if err != nil {
+			t.Fatalf("FindRoute(_) = _, %s, want = _, nil", err)
+		}
+		return s, ep, r
+	}
+
+	// handleIPv6Payload takes the calling (sub)test's t explicitly: FailNow must
+	// run on the goroutine of the test it fails, not on a parent test's t.
+	handleIPv6Payload := func(t *testing.T, payload buffer.View, hopLimit uint8, atomicFragment bool, ep stack.NetworkEndpoint, r *stack.Route) {
+		t.Helper()
+		nextHdr := uint8(header.ICMPv6ProtocolNumber)
+		var extensions buffer.View
+		if atomicFragment {
+			extensions = buffer.NewView(header.IPv6FragmentExtHdrLength)
+			extensions[0] = nextHdr
+			nextHdr = uint8(header.IPv6FragmentExtHdrIdentifier)
+		}
+		ip := header.IPv6(buffer.NewView(header.IPv6MinimumSize + len(extensions)))
+		ip.Encode(&header.IPv6Fields{
+			PayloadLength: uint16(len(payload) + len(extensions)),
+			NextHeader:    nextHdr,
+			HopLimit:      hopLimit,
+			SrcAddr:       r.LocalAddress,
+			DstAddr:       r.RemoteAddress,
+		})
+		if n := copy(ip[header.IPv6MinimumSize:], extensions); n != len(extensions) {
+			t.Fatalf("expected to write %d bytes of extensions, but wrote %d", len(extensions), n)
+		}
+		ep.HandlePacket(r, &stack.PacketBuffer{
+			NetworkHeader: buffer.View(ip),
+			Data:          payload.ToVectorisedView(),
+		})
+	}
+
+	var tllData [header.NDPLinkLayerAddressSize]byte
+	header.NDPOptions(tllData[:]).Serialize(header.NDPOptionsSerializer{
+		header.NDPTargetLinkLayerAddressOption(linkAddr1),
+	})
+
+	types := []struct {
+		name        string
+		typ         header.ICMPv6Type
+		size        int
+		extraData   []byte
+		statCounter func(tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+	}{
+		{
+			name: "RouterSolicit",
+			typ:  header.ICMPv6RouterSolicit,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterSolicit
+			},
+		},
+		{
+			name: "RouterAdvert",
+			typ:  header.ICMPv6RouterAdvert,
+			size: header.ICMPv6HeaderSize + header.NDPRAMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RouterAdvert
+			},
+		},
+		{
+			name: "NeighborSolicit",
+			typ:  header.ICMPv6NeighborSolicit,
+			size: header.ICMPv6NeighborSolicitMinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborSolicit
+			},
+		},
+		{
+			name:      "NeighborAdvert",
+			typ:       header.ICMPv6NeighborAdvert,
+			size:      header.ICMPv6NeighborAdvertMinimumSize,
+			extraData: tllData[:],
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.NeighborAdvert
+			},
+		},
+		{
+			name: "RedirectMsg",
+			typ:  header.ICMPv6RedirectMsg,
+			size: header.ICMPv6MinimumSize,
+			statCounter: func(stats tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return stats.RedirectMsg
+			},
+		},
+	}
+
+	subTests := []struct {
+		name           string
+		atomicFragment bool
+		hopLimit       uint8
+		code           uint8
+		valid          bool
+	}{
+		{
+			name:           "Valid",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit,
+			code:           0,
+			valid:          true,
+		},
+		{
+			name:           "Fragmented",
+			atomicFragment: true,
+			hopLimit:       header.NDPHopLimit,
+			code:           0,
+			valid:          false,
+		},
+		{
+			name:           "Invalid hop limit",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit - 1,
+			code:           0,
+			valid:          false,
+		},
+		{
+			name:           "Invalid ICMPv6 code",
+			atomicFragment: false,
+			hopLimit:       header.NDPHopLimit,
+			code:           1,
+			valid:          false,
+		},
+	}
+
+	for _, typ := range types {
+		t.Run(typ.name, func(t *testing.T) {
+			for _, test := range subTests {
+				t.Run(test.name, func(t *testing.T) {
+					s, ep, r := setup(t)
+					defer r.Release()
+
+					stats := s.Stats().ICMP.V6PacketsReceived
+					invalid := stats.Invalid
+					typStat := typ.statCounter(stats)
+
+					icmp := header.ICMPv6(buffer.NewView(typ.size + len(typ.extraData)))
+					copy(icmp[typ.size:], typ.extraData)
+					icmp.SetType(typ.typ)
+					icmp.SetCode(test.code)
+					icmp.SetChecksum(header.ICMPv6Checksum(icmp[:typ.size], r.LocalAddress, r.RemoteAddress, buffer.View(typ.extraData).ToVectorisedView()))
+
+					// Rx count of the NDP message should initially be 0.
+					if got := typStat.Value(); got != 0 {
+						t.Errorf("got %s = %d, want = 0", typ.name, got)
+					}
+
+					// Invalid count should initially be 0.
+					if got := invalid.Value(); got != 0 {
+						t.Errorf("got invalid = %d, want = 0", got)
+					}
+
+					if t.Failed() {
+						t.FailNow()
+					}
+
+					handleIPv6Payload(t, buffer.View(icmp), test.hopLimit, test.atomicFragment, ep, &r)
+
+					// Rx count of the NDP packet should have increased.
+					if got := typStat.Value(); got != 1 {
+						t.Errorf("got %s = %d, want = 1", typ.name, got)
+					}
+
+					want := uint64(0)
+					if !test.valid {
+						// Invalid count should have increased.
+						want = 1
+					}
+					if got := invalid.Value(); got != want {
+						t.Errorf("got invalid = %d, want = %d", got, want)
+					}
+				})
+			}
+		})
+	}
+}
+
+// TestRouterAdvertValidation tests that a received NDP Router Advertisement is
+// always counted as received and is additionally counted as invalid when its
+// source address, hop limit, ICMPv6 code or NDP payload fails validation.
+func TestRouterAdvertValidation(t *testing.T) {
+	tests := []struct {
+		name            string        // Subtest name.
+		src             tcpip.Address // IPv6 source address of the injected RA.
+		hopLimit        uint8         // IPv6 hop limit of the injected RA.
+		code            uint8         // ICMPv6 code of the injected RA.
+		ndpPayload      []byte        // Bytes copied in after the ICMPv6 header.
+		expectedSuccess bool          // True if the invalid counter must stay at 0.
+	}{
+		{
+			"OK",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			true,
+		},
+		{
+			"NonLinkLocalSourceAddr",
+			addr1,
+			255,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			false,
+		},
+		{
+			"HopLimitNot255",
+			lladdr0,
+			254,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			false,
+		},
+		{
+			"NonZeroCode",
+			lladdr0,
+			255,
+			1,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+			},
+			false,
+		},
+		{
+			"NDPPayloadTooSmall",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0,
+			},
+			false,
+		},
+		{
+			"OKWithOptions",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				// RA payload
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+
+				// Option #1 (TargetLinkLayerAddress)
+				2, 1, 0, 0, 0, 0, 0, 0,
+
+				// Option #2 (unrecognized)
+				255, 1, 0, 0, 0, 0, 0, 0,
+
+				// Option #3 (PrefixInformation)
+				3, 4, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			true,
+		},
+		{
+			"OptionWithZeroLength",
+			lladdr0,
+			255,
+			0,
+			[]byte{
+				// RA payload
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+				0, 0, 0, 0,
+
+				// Option #1 (TargetLinkLayerAddress)
+				// Invalid as it has 0 length.
+				2, 0, 0, 0, 0, 0, 0, 0,
+
+				// Option #2 (unrecognized)
+				255, 1, 0, 0, 0, 0, 0, 0,
+
+				// Option #3 (PrefixInformation)
+				3, 4, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0, 0,
+			},
+			false,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(10, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{NewProtocol()},
+			})
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(_) = %s", err)
+			}
+
+			icmpSize := header.ICMPv6HeaderSize + len(test.ndpPayload)
+			hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+			pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+			pkt.SetType(header.ICMPv6RouterAdvert)
+			pkt.SetCode(test.code)
+			copy(pkt.NDPPayload(), test.ndpPayload)
+			payloadLength := hdr.UsedLength()
+			pkt.SetChecksum(header.ICMPv6Checksum(pkt, test.src, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+			ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+			ip.Encode(&header.IPv6Fields{
+				PayloadLength: uint16(payloadLength),
+				NextHeader:    uint8(icmp.ProtocolNumber6),
+				HopLimit:      test.hopLimit,
+				SrcAddr:       test.src,
+				DstAddr:       header.IPv6AllNodesMulticastAddress,
+			})
+
+			stats := s.Stats().ICMP.V6PacketsReceived
+			invalid := stats.Invalid
+			rxRA := stats.RouterAdvert
+
+			if got := invalid.Value(); got != 0 {
+				t.Fatalf("got invalid = %d, want = 0", got)
+			}
+			if got := rxRA.Value(); got != 0 {
+				t.Fatalf("got rxRA = %d, want = 0", got)
+			}
+
+			e.InjectInbound(header.IPv6ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			if got := rxRA.Value(); got != 1 {
+				t.Fatalf("got rxRA = %d, want = 1", got)
+			}
+
+			if test.expectedSuccess {
+				if got := invalid.Value(); got != 0 {
+					t.Fatalf("got invalid = %d, want = 0", got)
+				}
+			} else {
+				if got := invalid.Value(); got != 1 {
+					t.Fatalf("got invalid = %d, want = 1", got)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/ports/BUILD b/pkg/tcpip/ports/BUILD
new file mode 100644
index 000000000..2bad05a2e
--- /dev/null
+++ b/pkg/tcpip/ports/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "ports",
+    srcs = ["ports.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sync",
+        "//pkg/tcpip",
+    ],
+)
+
+go_test(
+    name = "ports_test",
+    srcs = ["ports_test.go"],
+    library = ":ports",
+    deps = [
+        "//pkg/tcpip",
+    ],
+)
diff --git a/pkg/tcpip/ports/ports.go b/pkg/tcpip/ports/ports.go
new file mode 100644
index 000000000..f6d592eb5
--- /dev/null
+++ b/pkg/tcpip/ports/ports.go
@@ -0,0 +1,554 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package ports provides PortManager that manages allocating, reserving and releasing ports.
+package ports
+
+import (
+	"math"
+	"math/rand"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// FirstEphemeral is the first ephemeral port.
+	FirstEphemeral = 16000
+
+	// numEphemeralPorts is the number of ephemeral ports available to
+	// Netstack.
+	numEphemeralPorts = math.MaxUint16 - FirstEphemeral + 1
+
+	anyIPAddress tcpip.Address = "" // the "any" (wildcard) address
+)
+
+type portDescriptor struct { // key of PortManager.allocatedPorts
+	network   tcpip.NetworkProtocolNumber   // network protocol of the reservation
+	transport tcpip.TransportProtocolNumber // transport protocol of the reservation
+	port      uint16                        // reserved port number
+}
+
+// Flags represents the type of port reservation.
+//
+// +stateify savable
+type Flags struct {
+	// MostRecent represents UDP SO_REUSEADDR.
+	MostRecent bool
+
+	// LoadBalanced indicates SO_REUSEPORT.
+	//
+	// LoadBalanced takes precedence over MostRecent (see Effective).
+	LoadBalanced bool
+
+	// TupleOnly represents TCP SO_REUSEADDR.
+	TupleOnly bool
+}
+
+// Bits returns the bitset form of f, with one bit set per enabled field.
+func (f Flags) Bits() BitFlags {
+	bits := BitFlags(0)
+	if f.TupleOnly {
+		bits |= TupleOnlyFlag
+	}
+	if f.LoadBalanced {
+		bits |= LoadBalancedFlag
+	}
+	if f.MostRecent {
+		bits |= MostRecentFlag
+	}
+	return bits
+}
+
+// Effective returns f with MostRecent cleared if LoadBalanced is also set.
+func (f Flags) Effective() Flags {
+	if f.MostRecent && f.LoadBalanced {
+		// f is a copy of the receiver, so the caller's Flags is untouched.
+		f.MostRecent = false
+	}
+	return f
+}
+
+// PortManager manages allocating, reserving and releasing ports.
+type PortManager struct {
+	mu             sync.RWMutex                     // held around every allocatedPorts access
+	allocatedPorts map[portDescriptor]bindAddresses // reservations per (network, transport, port)
+
+	// hint is used to pick ephemeral ports in a stable order for a given
+	// port offset.
+	//
+	// hint must be accessed using the portHint/incPortHint helpers.
+	// TODO(gvisor.dev/issue/940): S/R this field.
+	hint uint32
+}
+
+// BitFlags is a bitset representation of Flags.
+type BitFlags uint32
+
+const (
+	// MostRecentFlag represents Flags.MostRecent (bit 0, value 1).
+	MostRecentFlag BitFlags = 1 << iota
+
+	// LoadBalancedFlag represents Flags.LoadBalanced (bit 1, value 2).
+	LoadBalancedFlag
+
+	// TupleOnlyFlag represents Flags.TupleOnly (bit 2, value 4).
+	TupleOnlyFlag
+
+	// nextFlag is the value that the next added flag will have.
+	//
+	// It is used to calculate FlagMask below. It is also the number of
+	// valid flag states, and so the length of FlagCounter.refs.
+	nextFlag
+
+	// FlagMask is a bit mask with every defined BitFlags bit set.
+	FlagMask = nextFlag - 1
+
+	// MultiBindFlagMask contains the flags that allow binding the same
+	// tuple multiple times.
+	MultiBindFlagMask = MostRecentFlag | LoadBalancedFlag
+)
+
+// ToFlags expands the bitset into the equivalent Flags struct.
+func (f BitFlags) ToFlags() Flags {
+	var fl Flags
+	fl.MostRecent = f&MostRecentFlag != 0
+	fl.LoadBalanced = f&LoadBalancedFlag != 0
+	fl.TupleOnly = f&TupleOnlyFlag != 0
+	return fl
+}
+
+// FlagCounter counts how many references each flag combination has.
+type FlagCounter struct {
+	// refs stores the count for each possible flag combination (0 through
+	// FlagMask), indexed by the combination's BitFlags value.
+	refs [nextFlag]int
+}
+
+// AddRef increases the reference count for exactly the flag combination flags.
+func (c *FlagCounter) AddRef(flags BitFlags) {
+	c.refs[flags]++ // flags indexes refs, so it must not exceed FlagMask
+}
+
+// DropRef decreases the reference count for exactly the flag combination flags.
+func (c *FlagCounter) DropRef(flags BitFlags) {
+	c.refs[flags]-- // not clamped: dropping a combination with no refs goes negative
+}
+
+// TotalRefs returns the sum of the reference counts of all flag
+// combinations.
+func (c FlagCounter) TotalRefs() int {
+	sum := 0
+	for i := range c.refs {
+		sum += c.refs[i]
+	}
+	return sum
+}
+
+// FlagRefs returns the number of references that have all specified flags set.
+func (c FlagCounter) FlagRefs(flags BitFlags) int {
+	n := 0
+	for combo := BitFlags(0); combo < nextFlag; combo++ {
+		if combo&flags == flags {
+			n += c.refs[combo]
+		}
+	}
+	return n
+}
+
+// AllRefsHave reports whether every existing reference has all specified flags.
+func (c FlagCounter) AllRefsHave(flags BitFlags) bool {
+	for combo, count := range c.refs {
+		if count > 0 && BitFlags(combo)&flags != flags {
+			return false
+		}
+	}
+	return true
+}
+
+// IntersectionRefs returns the flags shared by all references (FlagMask if none).
+func (c FlagCounter) IntersectionRefs() BitFlags {
+	common := FlagMask
+	for combo := BitFlags(0); combo < nextFlag; combo++ {
+		if c.refs[combo] > 0 {
+			common &= combo
+		}
+	}
+	return common
+}
+
+type destination struct { // key of portNode
+	addr tcpip.Address // destination address; anyIPAddress is a wildcard
+	port uint16        // destination port
+}
+
+// makeDestination builds the destination key for a from its Addr and Port.
+func makeDestination(a tcpip.FullAddress) destination {
+	var dst destination
+	dst.addr, dst.port = a.Addr, a.Port
+	return dst
+}
+
+// portNode maps destinations to their FlagCounters. It is never empty: when it
+// has no elements, it is removed from the map that references it.
+type portNode map[destination]FlagCounter
+
+// intersectionRefs calculates the intersection of flag bit values which affect
+// the specified destination.
+//
+// If no destinations are present, all flag values are returned as there are no
+// entries to limit possible flag values of a new entry.
+//
+// In addition to the intersection, the number of destination entries (not
+// individual references) that contributed to it is returned.
+func (p portNode) intersectionRefs(dst destination) (BitFlags, int) {
+	intersection := FlagMask
+	var count int
+
+	for d, f := range p {
+		if d == dst { // exact match: all of its flags constrain the result
+			intersection &= f.IntersectionRefs()
+			count++
+			continue
+		}
+		// Wildcard destinations affect all destinations for TupleOnly.
+		if d.addr == anyIPAddress || dst.addr == anyIPAddress {
+			// Only AND in the TupleOnlyFlag bit; every other bit is forced to 1.
+			intersection &= ((^TupleOnlyFlag) | f.IntersectionRefs())
+			count++
+		}
+	}
+
+	return intersection, count
+}
+
+// deviceNode maps NIC IDs to their portNodes. It is never empty: when it has no
+// elements, it is removed from the map that references it.
+type deviceNode map[tcpip.NICID]portNode // key 0 is the unspecified device
+
+// isAvailable checks whether a reservation with the given flags can be made for
+// dst on bindToDevice. If not binding to a device (bindToDevice == 0), every
+// device's reservations are checked. If binding to a specific device, only the
+// unspecified device and the provided device are checked.
+//
+// The binding is refused unless at least one of its flags is common to all
+// existing reservations that affect dst. This matches Linux's behavior.
+func (d deviceNode) isAvailable(flags Flags, bindToDevice tcpip.NICID, dst destination) bool {
+	flagBits := flags.Bits()
+	if bindToDevice == 0 {
+		intersection := FlagMask
+		for _, p := range d {
+			i, c := p.intersectionRefs(dst)
+			if c == 0 { // nothing on this device affects dst
+				continue
+			}
+			intersection &= i
+			if intersection&flagBits == 0 {
+				// Can't bind: none of the requested flags is shared by
+				// every conflicting reservation seen so far.
+				return false
+			}
+		}
+		return true
+	}
+
+	intersection := FlagMask
+
+	if p, ok := d[0]; ok { // reservations on the unspecified device
+		var c int
+		intersection, c = p.intersectionRefs(dst)
+		if c > 0 && intersection&flagBits == 0 {
+			return false
+		}
+	}
+
+	if p, ok := d[bindToDevice]; ok { // reservations on the requested device
+		i, c := p.intersectionRefs(dst)
+		intersection &= i
+		if c > 0 && intersection&flagBits == 0 {
+			return false
+		}
+	}
+
+	return true
+}
+
+// bindAddresses maps each bound IP address to the deviceNode of its reservations.
+type bindAddresses map[tcpip.Address]deviceNode
+
+// isAvailable checks whether an IP address is available to bind to.
+//
+// Binding to the "any" address conflicts with reservations on every address,
+// so all of them are checked. Binding to a specific address can only conflict
+// with reservations on the "any" address or on that same address, so only
+// those two are checked.
+func (b bindAddresses) isAvailable(addr tcpip.Address, flags Flags, bindToDevice tcpip.NICID, dst destination) bool {
+	if addr != anyIPAddress {
+		// Specific address: consult the "any" entry, then addr's own.
+		for _, a := range [...]tcpip.Address{anyIPAddress, addr} {
+			d, ok := b[a]
+			if !ok {
+				// Nothing is bound to this address.
+				continue
+			}
+			if !d.isAvailable(flags, bindToDevice, dst) {
+				return false
+			}
+		}
+		return true
+	}
+
+	// "Any" address: every bound address must allow the new reservation.
+	for _, d := range b {
+		if d.isAvailable(flags, bindToDevice, dst) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
+// NewPortManager returns a PortManager with no ports reserved.
+func NewPortManager() *PortManager {
+	return &PortManager{allocatedPorts: map[portDescriptor]bindAddresses{}}
+}
+
+// PickEphemeralPort walks over all possible ephemeral ports, starting from a
+// randomly chosen one, and lets the caller decide through testPort whether a
+// given port is suitable for its needs. The walk stops when a port is found
+// or an error occurs.
+func (s *PortManager) PickEphemeralPort(testPort func(p uint16) (bool, *tcpip.Error)) (port uint16, err *tcpip.Error) {
+	start := rand.Int31n(numEphemeralPorts)
+	return s.pickEphemeralPort(uint32(start), numEphemeralPorts, testPort)
+}
+
+// portHint returns the current value of s.hint, loaded atomically.
+func (s *PortManager) portHint() uint32 {
+	return atomic.LoadUint32(&s.hint)
+}
+
+// incPortHint atomically increments s.hint by 1, wrapping around on overflow.
+func (s *PortManager) incPortHint() {
+	atomic.AddUint32(&s.hint, 1)
+}
+
+// PickEphemeralPortStable works like PickEphemeralPort, except that the search
+// starts at s.portHint plus the specified offset instead of at a random port.
+// The port hint is advanced by one only when a port was successfully picked.
+func (s *PortManager) PickEphemeralPortStable(offset uint32, testPort func(p uint16) (bool, *tcpip.Error)) (port uint16, err *tcpip.Error) {
+	port, err = s.pickEphemeralPort(s.portHint()+offset, numEphemeralPorts, testPort)
+	if err != nil {
+		return port, err
+	}
+	// Move the hint so the next pick for the same offset starts one port on.
+	s.incPortHint()
+	return port, nil
+}
+
+// pickEphemeralPort tries count ports in increasing order, starting offset
+// ports past FirstEphemeral and wrapping around modulo count, and returns the
+// first one accepted by testPort. It stops at the first error from testPort,
+// and returns tcpip.ErrNoPortAvailable when no port is accepted.
+func (s *PortManager) pickEphemeralPort(offset, count uint32, testPort func(p uint16) (bool, *tcpip.Error)) (port uint16, err *tcpip.Error) {
+	for i := uint32(0); i < count; i++ {
+		// Sum in 64 bits: offset+i can wrap a uint32, which would make
+		// the walk visit one port twice and skip another.
+		port = uint16(FirstEphemeral + (uint64(offset)+uint64(i))%uint64(count))
+		accepted, testErr := testPort(port)
+		if testErr != nil {
+			return 0, testErr
+		}
+		if accepted {
+			return port, nil
+		}
+	}
+	return 0, tcpip.ErrNoPortAvailable
+}
+
+// IsPortAvailable tests if the given port is available on all given protocols.
+func (s *PortManager) IsPortAvailable(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) bool {
+	s.mu.RLock() // Read-only query, so a shared lock is enough.
+	defer s.mu.RUnlock()
+	return s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice, makeDestination(dest))
+}
+
+// isPortAvailableLocked reports whether port can be bound on every protocol in
+// networks. The caller must hold s.mu.
+func (s *PortManager) isPortAvailableLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dst destination) bool {
+	for _, network := range networks {
+		addrs, reserved := s.allocatedPorts[portDescriptor{network, transport, port}]
+		if reserved && !addrs.isAvailable(addr, flags, bindToDevice, dst) {
+			return false
+		}
+	}
+	return true
+}
+
+// ReservePort marks a port/IP combination as reserved so that it cannot be
+// reserved by another endpoint. A non-zero port is reserved as given, failing
+// with tcpip.ErrPortInUse if it is taken. If port is zero, an unreserved
+// ephemeral port is searched for, reserved, and returned in reservedPort.
+func (s *PortManager) ReservePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) (reservedPort uint16, err *tcpip.Error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	dst := makeDestination(dest)
+	reserve := func(p uint16) bool {
+		return s.reserveSpecificPort(networks, transport, addr, p, flags, bindToDevice, dst)
+	}
+
+	if port == 0 {
+		// No port was requested: take the first ephemeral port that works.
+		return s.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
+			return reserve(p), nil
+		})
+	}
+	if !reserve(port) {
+		return 0, tcpip.ErrPortInUse
+	}
+	return port, nil
+}
+
+// reserveSpecificPort reserves port on every protocol in networks, or reserves
+// nothing and returns false if it is unavailable on any of them. The caller
+// must hold s.mu for writing.
+func (s *PortManager) reserveSpecificPort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dst destination) bool {
+	if !s.isPortAvailableLocked(networks, transport, addr, port, flags, bindToDevice, dst) {
+		return false
+	}
+	bits := flags.Bits()
+	for _, network := range networks {
+		// Create whichever levels of the reservation tree are missing.
+		desc := portDescriptor{network, transport, port}
+		addrs, ok := s.allocatedPorts[desc]
+		if !ok {
+			addrs = make(bindAddresses)
+			s.allocatedPorts[desc] = addrs
+		}
+		devs, ok := addrs[addr]
+		if !ok {
+			devs = make(deviceNode)
+			addrs[addr] = devs
+		}
+		dsts := devs[bindToDevice]
+		if dsts == nil {
+			dsts = make(portNode)
+			devs[bindToDevice] = dsts
+		}
+		// FlagCounter is a value type: update a copy, then store it back.
+		refs := dsts[dst]
+		refs.AddRef(bits)
+		dsts[dst] = refs
+	}
+	return true
+}
+
+// ReserveTuple adds a port reservation for the tuple on all given protocols; it returns false, reserving nothing, on conflict.
+func (s *PortManager) ReserveTuple(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) bool {
+	flagBits := flags.Bits()
+	dst := makeDestination(dest)
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// It is easier to undo the entire reservation, so if we find that the
+	// tuple can't be fully added, finish and undo the whole thing.
+	undo := false
+
+	// Reserve port on all network protocols.
+	for _, network := range networks {
+		desc := portDescriptor{network, transport, port}
+		m, ok := s.allocatedPorts[desc]
+		if !ok {
+			m = make(bindAddresses)
+			s.allocatedPorts[desc] = m
+		}
+		d, ok := m[addr]
+		if !ok {
+			d = make(deviceNode)
+			m[addr] = d
+		}
+		p := d[bindToDevice]
+		if p == nil {
+			p = make(portNode)
+		}
+
+		n := p[dst]
+		if n.TotalRefs() != 0 && n.IntersectionRefs()&flagBits == 0 {
+			// Tuple already exists and shares none of the requested flags.
+			undo = true
+		}
+		n.AddRef(flagBits)
+		p[dst] = n
+		d[bindToDevice] = p
+	}
+
+	if undo {
+		// releasePortLocked decrements the counts (rather than setting
+		// them to zero), so it will undo the incorrect incrementing
+		// above.
+		s.releasePortLocked(networks, transport, addr, port, flagBits, bindToDevice, dst)
+		return false
+	}
+
+	return true
+}
+
+// ReleasePort releases the reservation on a port/IP combination so that it can
+// be reserved by other endpoints.
+func (s *PortManager) ReleasePort(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags Flags, bindToDevice tcpip.NICID, dest tcpip.FullAddress) {
+	bits, dst := flags.Bits(), makeDestination(dest)
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.releasePortLocked(networks, transport, addr, port, bits, bindToDevice, dst)
+}
+
+// releasePortLocked drops one reference with the given flags from the
+// reservation on every protocol in networks, and prunes each level of the
+// reservation tree that becomes empty. Destinations with no reservation are
+// ignored. The caller must hold s.mu for writing.
+func (s *PortManager) releasePortLocked(networks []tcpip.NetworkProtocolNumber, transport tcpip.TransportProtocolNumber, addr tcpip.Address, port uint16, flags BitFlags, bindToDevice tcpip.NICID, dst destination) {
+	for _, network := range networks {
+		// Indexing a nil map yields the zero value, so a reservation
+		// missing at any level just shows up as !ok on the last lookup.
+		desc := portDescriptor{network, transport, port}
+		addrs := s.allocatedPorts[desc]
+		devs := addrs[addr]
+		dsts := devs[bindToDevice]
+		refs, ok := dsts[dst]
+		if !ok {
+			continue
+		}
+
+		refs.DropRef(flags)
+		if refs.TotalRefs() > 0 {
+			dsts[dst] = refs
+			continue
+		}
+
+		// That was the last reference: remove the entry, then every
+		// enclosing map that it leaves empty.
+		delete(dsts, dst)
+		if len(dsts) == 0 {
+			delete(devs, bindToDevice)
+		}
+		if len(devs) == 0 {
+			delete(addrs, addr)
+		}
+		if len(addrs) == 0 {
+			delete(s.allocatedPorts, desc)
+		}
+	}
+}
diff --git a/pkg/tcpip/ports/ports_test.go b/pkg/tcpip/ports/ports_test.go
new file mode 100644
index 000000000..58db5868c
--- /dev/null
+++ b/pkg/tcpip/ports/ports_test.go
@@ -0,0 +1,450 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package ports
+
+import (
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	fakeTransNumber   tcpip.TransportProtocolNumber = 1
+	fakeNetworkNumber tcpip.NetworkProtocolNumber   = 2
+
+	fakeIPAddress  = tcpip.Address("\x08\x08\x08\x08") // bytes 8.8.8.8
+	fakeIPAddress1 = tcpip.Address("\x08\x08\x08\x09") // bytes 8.8.8.9
+)
+
+type portReserveTestAction struct {
+	port    uint16            // port to reserve or release; 0 asks for an ephemeral port
+	ip      tcpip.Address     // address to bind
+	want    *tcpip.Error      // expected ReservePort error; not checked for releases
+	flags   Flags             // reuse flags of the reservation
+	release bool              // if true, call ReleasePort instead of ReservePort
+	device  tcpip.NICID       // bindToDevice argument
+	dest    tcpip.FullAddress // destination of the reservation
+}
+
+// TestPortReservation replays ReservePort/ReleasePort sequences against a fresh PortManager per case.
+func TestPortReservation(t *testing.T) {
+	for _, test := range []struct {
+		tname   string
+		actions []portReserveTestAction
+	}{
+		{
+			tname: "bind to ip",
+			actions: []portReserveTestAction{
+				{port: 80, ip: fakeIPAddress, want: nil},
+				{port: 80, ip: fakeIPAddress1, want: nil},
+				/* N.B. Order of tests matters! */
+				{port: 80, ip: anyIPAddress, want: tcpip.ErrPortInUse},
+				{port: 80, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
+			},
+		}, {
+			tname: "bind to inaddr any",
+			actions: []portReserveTestAction{
+				{port: 22, ip: anyIPAddress, want: nil},
+				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
+				/* release fakeIPAddress, but anyIPAddress is still inuse */
+				{port: 22, ip: fakeIPAddress, release: true},
+				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
+				{port: 22, ip: fakeIPAddress, want: tcpip.ErrPortInUse, flags: Flags{LoadBalanced: true}},
+				/* Release port 22 from any IP address, then try to reserve fake IP address on 22 */
+				{port: 22, ip: anyIPAddress, want: nil, release: true},
+				{port: 22, ip: fakeIPAddress, want: nil},
+			},
+		}, {
+			tname: "bind to zero port",
+			actions: []portReserveTestAction{
+				{port: 00, ip: fakeIPAddress, want: nil},
+				{port: 00, ip: fakeIPAddress, want: nil},
+				{port: 00, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind to ip with reuseport",
+			actions: []portReserveTestAction{
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 25, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+
+				{port: 25, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 25, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+
+				{port: 25, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind to inaddr any with reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, release: true, want: nil},
+
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: tcpip.ErrPortInUse},
+
+				{port: 24, ip: anyIPAddress, flags: Flags{LoadBalanced: true}, release: true},
+				{port: 24, ip: anyIPAddress, flags: Flags{}, want: nil},
+			},
+		}, {
+			tname: "bind twice with device fails",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 3, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 3, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind to device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 1, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 2, want: nil},
+			},
+		}, {
+			tname: "bind to device and then without device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind without device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, want: nil},
+				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "binding with reuseport and device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 999, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "mixing reuseport and not reuseport by binding to device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 999, want: nil},
+			},
+		}, {
+			tname: "can't bind to 0 after mixing reuseport and not reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind and release",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 789, flags: Flags{LoadBalanced: true}, want: nil},
+
+				// Release the bind to device 0 and try again.
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil},
+			},
+		}, {
+			tname: "bind twice with reuseport once",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "release an unreserved device",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil},
+				// The below don't exist.
+				{port: 24, ip: fakeIPAddress, device: 345, flags: Flags{}, want: nil, release: true},
+				{port: 9999, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
+				// Release all.
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil, release: true},
+				{port: 24, ip: fakeIPAddress, device: 456, flags: Flags{}, want: nil, release: true},
+			},
+		}, {
+			tname: "bind with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 123, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, want: tcpip.ErrPortInUse},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind twice with reuseaddr once",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, device: 123, flags: Flags{}, want: nil},
+				{port: 24, ip: fakeIPAddress, device: 0, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr and reuseport twice, and then reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+			},
+		}, {
+			tname: "bind with reuseaddr, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind with reuseport, and then reuseaddr and reuseport",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true, LoadBalanced: true}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{MostRecent: true}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind tuple with reuseaddr, and then wildcard with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{}, want: nil},
+			},
+		}, {
+			tname: "bind tuple with reuseaddr, and then wildcard",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil},
+				{port: 24, ip: fakeIPAddress, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind wildcard with reuseaddr, and then tuple with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil},
+			},
+		}, {
+			tname: "bind wildcard with implicit dest, and then tuple with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind two tuples with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 25}, want: nil},
+			},
+		}, {
+			tname: "bind two tuples",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: nil},
+				{port: 24, ip: fakeIPAddress, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 25}, want: nil},
+			},
+		}, {
+			tname: "bind wildcard, and then tuple with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: fakeIPAddress, dest: tcpip.FullAddress{}, want: nil},
+				{port: 24, ip: fakeIPAddress, flags: Flags{TupleOnly: true}, dest: tcpip.FullAddress{Addr: fakeIPAddress, Port: 24}, want: tcpip.ErrPortInUse},
+			},
+		}, {
+			tname: "bind wildcard twice with reuseaddr",
+			actions: []portReserveTestAction{
+				{port: 24, ip: anyIPAddress, flags: Flags{TupleOnly: true}, want: nil},
+				{port: 24, ip: anyIPAddress, flags: Flags{TupleOnly: true}, want: nil},
+			},
+		},
+	} {
+		t.Run(test.tname, func(t *testing.T) {
+			pm := NewPortManager()
+			net := []tcpip.NetworkProtocolNumber{fakeNetworkNumber}
+
+			for _, action := range test.actions {
+				if action.release {
+					pm.ReleasePort(net, fakeTransNumber, action.ip, action.port, action.flags, action.device, action.dest)
+					continue
+				}
+				gotPort, err := pm.ReservePort(net, fakeTransNumber, action.ip, action.port, action.flags, action.device, action.dest)
+				if err != action.want {
+					t.Fatalf("ReservePort(.., .., %s, %d, %+v, %d, %v) = %v, want %v", action.ip, action.port, action.flags, action.device, action.dest, err, action.want)
+				}
+				if action.port == 0 && gotPort < FirstEphemeral {
+					t.Fatalf("ReservePort(.., .., .., 0, ..) = %d, want port number >= %d to be picked", gotPort, FirstEphemeral)
+				}
+			}
+		})
+	}
+}
+
+func TestPickEphemeralPort(t *testing.T) {
+	customErr := &tcpip.Error{} // Unique error value; results are compared by pointer identity below.
+	for _, test := range []struct {
+		name     string
+		f        func(port uint16) (bool, *tcpip.Error) // Port tester passed to PickEphemeralPort.
+		wantErr  *tcpip.Error
+		wantPort uint16
+	}{
+		{
+			name: "no-port-available",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				return false, nil
+			},
+			wantErr: tcpip.ErrNoPortAvailable,
+		},
+		{
+			name: "port-tester-error",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				return false, customErr
+			},
+			wantErr: customErr,
+		},
+		{
+			name: "only-port-16042-available",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				if port == FirstEphemeral+42 {
+					return true, nil
+				}
+				return false, nil
+			},
+			wantPort: FirstEphemeral + 42,
+		},
+		{
+			name: "only-port-under-16000-available",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				if port < FirstEphemeral {
+					return true, nil
+				}
+				return false, nil
+			},
+			wantErr: tcpip.ErrNoPortAvailable, // A tester that only accepts ports below FirstEphemeral must yield no port.
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			pm := NewPortManager()
+			if port, err := pm.PickEphemeralPort(test.f); port != test.wantPort || err != test.wantErr {
+				t.Errorf("PickEphemeralPort(..) = (port %d, err %v); want (port %d, err %v)", port, err, test.wantPort, test.wantErr)
+			}
+		})
+	}
+}
+
+func TestPickEphemeralPortStable(t *testing.T) {
+	customErr := &tcpip.Error{} // Unique error value; results are compared by pointer identity below.
+	for _, test := range []struct {
+		name     string
+		f        func(port uint16) (bool, *tcpip.Error) // Port tester passed to PickEphemeralPortStable.
+		wantErr  *tcpip.Error
+		wantPort uint16
+	}{
+		{
+			name: "no-port-available",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				return false, nil
+			},
+			wantErr: tcpip.ErrNoPortAvailable,
+		},
+		{
+			name: "port-tester-error",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				return false, customErr
+			},
+			wantErr: customErr,
+		},
+		{
+			name: "only-port-16042-available",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				if port == FirstEphemeral+42 {
+					return true, nil
+				}
+				return false, nil
+			},
+			wantPort: FirstEphemeral + 42,
+		},
+		{
+			name: "only-port-under-16000-available",
+			f: func(port uint16) (bool, *tcpip.Error) {
+				if port < FirstEphemeral {
+					return true, nil
+				}
+				return false, nil
+			},
+			wantErr: tcpip.ErrNoPortAvailable,
+		},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			pm := NewPortManager()
+			portOffset := uint32(rand.Int31n(int32(numEphemeralPorts))) // Random start: the expected result must not depend on the offset.
+			if port, err := pm.PickEphemeralPortStable(portOffset, test.f); port != test.wantPort || err != test.wantErr {
+				t.Errorf("PickEphemeralPortStable(..) = (port %d, err %v); want (port %d, err %v)", port, err, test.wantPort, test.wantErr)
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/sample/tun_tcp_connect/BUILD b/pkg/tcpip/sample/tun_tcp_connect/BUILD
new file mode 100644
index 000000000..cf0a5fefe
--- /dev/null
+++ b/pkg/tcpip/sample/tun_tcp_connect/BUILD
@@ -0,0 +1,22 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "tun_tcp_connect",
+    srcs = ["main.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/link/rawfile",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/link/tun",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/sample/tun_tcp_connect/main.go b/pkg/tcpip/sample/tun_tcp_connect/main.go
new file mode 100644
index 000000000..0ab089208
--- /dev/null
+++ b/pkg/tcpip/sample/tun_tcp_connect/main.go
@@ -0,0 +1,225 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// This sample creates a stack with TCP and IPv4 protocols on top of a TUN
+// device, and connects to a peer. Similar to "nc <address> <port>". While the
+// sample is running, attempts to connect to its IPv4 address will result in
+// a RST segment.
+//
+// As an example of how to run it, a TUN device can be created and enabled on
+// a linux host as follows (this only needs to be done once per boot):
+//
+// [sudo] ip tuntap add user <username> mode tun <device-name>
+// [sudo] ip link set <device-name> up
+// [sudo] ip addr add <ipv4-address>/<mask-length> dev <device-name>
+//
+// A concrete example:
+//
+// $ sudo ip tuntap add user wedsonaf mode tun tun0
+// $ sudo ip link set tun0 up
+// $ sudo ip addr add 192.168.1.1/24 dev tun0
+//
+// Then one can run tun_tcp_connect as such:
+//
+// $ ./tun/tun_tcp_connect tun0 192.168.1.2 0 192.168.1.1 1234
+//
+// This will attempt to connect to the linux host's stack. One can run nc in
+// listen mode to accept a connection from tun_tcp_connect and exchange data.
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"log"
+	"math/rand"
+	"net"
+	"os"
+	"strconv"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// writer copies standard input to ep until stdin is exhausted or a write
+// fails. On exit it shuts down ep's write side and then closes ch.
+func writer(ch chan struct{}, ep tcpip.Endpoint) {
+	defer func() {
+		ep.Shutdown(tcpip.ShutdownWrite)
+		close(ch)
+	}()
+
+	stdin := bufio.NewReader(os.Stdin)
+	for {
+		chunk := buffer.NewView(1024)
+		nRead, err := stdin.Read(chunk)
+		if err != nil {
+			return
+		}
+
+		chunk.CapLength(nRead)
+		for len(chunk) > 0 {
+			nWritten, _, err := ep.Write(tcpip.SlicePayload(chunk), tcpip.WriteOptions{})
+			if err != nil {
+				fmt.Println("Write failed:", err)
+				return
+			}
+
+			chunk.TrimFront(int(nWritten))
+		}
+	}
+}
+
+// main connects to <remote-ipv4-address>:<remote-port> through the given TUN
+// device, then copies stdin to the connection and the connection to stdout.
+func main() {
+	if len(os.Args) != 6 {
+		log.Fatal("Usage: ", os.Args[0], " <tun-device> <local-ipv4-address> <local-port> <remote-ipv4-address> <remote-port>")
+	}
+
+	tunName := os.Args[1]
+	addrName := os.Args[2]
+	portName := os.Args[3]
+	remoteAddrName := os.Args[4]
+	remotePortName := os.Args[5]
+
+	rand.Seed(time.Now().UnixNano())
+
+	addr := tcpip.Address(net.ParseIP(addrName).To4())
+	remote := tcpip.FullAddress{
+		NIC:  1,
+		Addr: tcpip.Address(net.ParseIP(remoteAddrName).To4()),
+	}
+	if len(addr) == 0 || len(remote.Addr) == 0 {
+		log.Fatalf("Bad IPv4 address: local %q, remote %q", addrName, remoteAddrName)
+	}
+
+	var localPort uint16
+	if v, err := strconv.ParseUint(portName, 10, 16); err != nil {
+		log.Fatalf("Unable to convert port %v: %v", portName, err)
+	} else {
+		localPort = uint16(v)
+	}
+
+	if v, err := strconv.ParseUint(remotePortName, 10, 16); err != nil {
+		log.Fatalf("Unable to convert port %v: %v", remotePortName, err)
+	} else {
+		remote.Port = uint16(v)
+	}
+
+	// Create the stack with ipv4 and tcp protocols, then add a tun-based
+	// NIC and ipv4 address.
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+	})
+
+	mtu, err := rawfile.GetMTU(tunName)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	fd, err := tun.Open(tunName)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	linkEP, err := fdbased.New(&fdbased.Options{FDs: []int{fd}, MTU: mtu})
+	if err != nil {
+		log.Fatal(err)
+	}
+	if err := s.CreateNIC(1, sniffer.New(linkEP)); err != nil {
+		log.Fatal(err)
+	}
+
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, addr); err != nil {
+		log.Fatal(err)
+	}
+
+	// Add default route.
+	s.SetRouteTable([]tcpip.Route{{Destination: header.IPv4EmptySubnet, NIC: 1}})
+
+	// Create TCP endpoint.
+	var wq waiter.Queue
+	ep, e := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+	if e != nil {
+		log.Fatal(e)
+	}
+
+	// Bind if a port is specified.
+	if localPort != 0 {
+		if err := ep.Bind(tcpip.FullAddress{Port: localPort}); err != nil {
+			log.Fatal("Bind failed: ", err)
+		}
+	}
+
+	// Issue connect request and wait for it to complete.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&waitEntry, waiter.EventOut)
+	terr := ep.Connect(remote)
+	if terr == tcpip.ErrConnectStarted {
+		fmt.Println("Connect is pending...")
+		<-notifyCh
+		terr = ep.GetSockOpt(tcpip.ErrorOption{})
+	}
+	wq.EventUnregister(&waitEntry)
+
+	if terr != nil {
+		log.Fatal("Unable to connect: ", terr)
+	}
+
+	fmt.Println("Connected")
+
+	// Start the writer in its own goroutine.
+	writerCompletedCh := make(chan struct{})
+	go writer(writerCompletedCh, ep) // S/R-SAFE: sample code.
+
+	// Read data and write to standard output until the peer closes the
+	// connection from its side.
+	wq.EventRegister(&waitEntry, waiter.EventIn)
+	for {
+		v, _, err := ep.Read(nil)
+		if err != nil {
+			if err == tcpip.ErrClosedForReceive {
+				break
+			}
+
+			if err == tcpip.ErrWouldBlock {
+				<-notifyCh
+				continue
+			}
+
+			log.Fatal("Read() failed: ", err)
+		}
+
+		os.Stdout.Write(v)
+	}
+	wq.EventUnregister(&waitEntry)
+
+	// The reader has completed. Now wait for the writer as well.
+	<-writerCompletedCh
+
+	ep.Close()
+}
diff --git a/pkg/tcpip/sample/tun_tcp_echo/BUILD b/pkg/tcpip/sample/tun_tcp_echo/BUILD
new file mode 100644
index 000000000..43264b76d
--- /dev/null
+++ b/pkg/tcpip/sample/tun_tcp_echo/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_binary")
+
+package(licenses = ["notice"])
+
+go_binary(
+    name = "tun_tcp_echo",
+    srcs = ["main.go"],
+    visibility = ["//:sandbox"],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/link/fdbased",
+        "//pkg/tcpip/link/rawfile",
+        "//pkg/tcpip/link/tun",
+        "//pkg/tcpip/network/arp",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/sample/tun_tcp_echo/main.go b/pkg/tcpip/sample/tun_tcp_echo/main.go
new file mode 100644
index 000000000..9e37cab18
--- /dev/null
+++ b/pkg/tcpip/sample/tun_tcp_echo/main.go
@@ -0,0 +1,203 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// This sample creates a stack with TCP and IPv4 protocols on top of a TUN
+// device, and listens on a port. Data received by the server in the accepted
+// connections is echoed back to the clients.
+package main
+
+import (
+	"flag"
+	"log"
+	"math/rand"
+	"net"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
+	"gvisor.dev/gvisor/pkg/tcpip/network/arp"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+var tap = flag.Bool("tap", false, "use tap instead of tun") // Selects a TAP device with Ethernet framing for the link endpoint.
+var mac = flag.String("mac", "aa:00:01:01:01:01", "mac address to use in tap device")
+
+func echo(wq *waiter.Queue, ep tcpip.Endpoint) {
+	defer ep.Close() // echo owns ep and closes it once reading stops.
+
+	// Create wait queue entry that notifies a channel when ep becomes readable.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+
+	wq.EventRegister(&waitEntry, waiter.EventIn)
+	defer wq.EventUnregister(&waitEntry)
+
+	for {
+		v, _, err := ep.Read(nil)
+		if err != nil {
+			if err == tcpip.ErrWouldBlock {
+				<-notifyCh
+				continue
+			}
+
+			return // Any other read error ends the session.
+		}
+
+		ep.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}) // Best effort: Write's results (short writes, errors) are ignored.
+	}
+}
+
+func main() {
+	flag.Parse()
+	if len(flag.Args()) != 3 {
+		log.Fatal("Usage: ", os.Args[0], " <tun-device> <local-address> <local-port>")
+	}
+
+	tunName := flag.Arg(0)
+	addrName := flag.Arg(1)
+	portName := flag.Arg(2)
+
+	rand.Seed(time.Now().UnixNano())
+
+	// Parse the mac address.
+	maddr, err := net.ParseMAC(*mac)
+	if err != nil {
+		log.Fatalf("Bad MAC address: %v", *mac)
+	}
+
+	// Parse the IP address. Support both ipv4 and ipv6.
+	parsedAddr := net.ParseIP(addrName)
+	if parsedAddr == nil {
+		log.Fatalf("Bad IP address: %v", addrName)
+	}
+
+	var addr tcpip.Address
+	var proto tcpip.NetworkProtocolNumber
+	if parsedAddr.To4() != nil {
+		addr = tcpip.Address(parsedAddr.To4())
+		proto = ipv4.ProtocolNumber
+	} else if parsedAddr.To16() != nil {
+		addr = tcpip.Address(parsedAddr.To16())
+		proto = ipv6.ProtocolNumber
+	} else {
+		log.Fatalf("Unknown IP type: %v", addrName)
+	}
+
+	localPort, err := strconv.ParseUint(portName, 10, 16) // bitSize 16 rejects ports outside 0-65535 instead of truncating them.
+	if err != nil {
+		log.Fatalf("Unable to convert port %v: %v", portName, err)
+	}
+
+	// Create the stack with ip and tcp protocols, then add a tun-based
+	// NIC and address.
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+	})
+
+	mtu, err := rawfile.GetMTU(tunName)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	var fd int
+	if *tap {
+		fd, err = tun.OpenTAP(tunName)
+	} else {
+		fd, err = tun.Open(tunName)
+	}
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	linkEP, err := fdbased.New(&fdbased.Options{
+		FDs:            []int{fd},
+		MTU:            mtu,
+		EthernetHeader: *tap,
+		Address:        tcpip.LinkAddress(maddr),
+	})
+	if err != nil {
+		log.Fatal(err)
+	}
+	if err := s.CreateNIC(1, linkEP); err != nil {
+		log.Fatal(err)
+	}
+
+	if err := s.AddAddress(1, proto, addr); err != nil {
+		log.Fatal(err)
+	}
+
+	if err := s.AddAddress(1, arp.ProtocolNumber, arp.ProtocolAddress); err != nil {
+		log.Fatal(err)
+	}
+
+	subnet, err := tcpip.NewSubnet(tcpip.Address(strings.Repeat("\x00", len(addr))), tcpip.AddressMask(strings.Repeat("\x00", len(addr))))
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Add default route.
+	s.SetRouteTable([]tcpip.Route{
+		{
+			Destination: subnet,
+			NIC:         1,
+		},
+	})
+
+	// Create TCP endpoint, bind it, then start listening.
+	var wq waiter.Queue
+	ep, e := s.NewEndpoint(tcp.ProtocolNumber, proto, &wq)
+	if e != nil {
+		log.Fatal(e)
+	}
+
+	defer ep.Close()
+
+	if err := ep.Bind(tcpip.FullAddress{Port: uint16(localPort)}); err != nil {
+		log.Fatal("Bind failed: ", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		log.Fatal("Listen failed: ", err)
+	}
+
+	// Wait for connections to appear.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&waitEntry, waiter.EventIn)
+	defer wq.EventUnregister(&waitEntry)
+
+	for {
+		n, wq, err := ep.Accept()
+		if err != nil {
+			if err == tcpip.ErrWouldBlock {
+				<-notifyCh
+				continue
+			}
+
+			log.Fatal("Accept() failed: ", err)
+		}
+
+		go echo(wq, n) // S/R-SAFE: sample code.
+	}
+}
diff --git a/pkg/tcpip/seqnum/BUILD b/pkg/tcpip/seqnum/BUILD
new file mode 100644
index 000000000..45f503845
--- /dev/null
+++ b/pkg/tcpip/seqnum/BUILD
@@ -0,0 +1,9 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "seqnum",
+    srcs = ["seqnum.go"],
+    visibility = ["//visibility:public"],
+)
diff --git a/pkg/tcpip/seqnum/seqnum.go b/pkg/tcpip/seqnum/seqnum.go
new file mode 100644
index 000000000..d3bea7de4
--- /dev/null
+++ b/pkg/tcpip/seqnum/seqnum.go
@@ -0,0 +1,62 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package seqnum defines the types and methods for TCP sequence numbers such
+// that they fit in 32-bit words and work properly when overflows occur.
+package seqnum
+
+// Value represents the value of a sequence number.
+type Value uint32
+
+// Size represents the size (length) of a sequence number window.
+type Size uint32
+
+// LessThan reports whether v is before w, i.e., v < w; the difference v-w is read as a signed 32-bit value so the test holds across wrap-around.
+func (v Value) LessThan(w Value) bool {
+	return int32(v-w) < 0
+}
+
+// LessThanEq reports whether v is equal to w or comes before it, i.e., v <= w.
+func (v Value) LessThanEq(w Value) bool {
+	if v != w {
+		return v.LessThan(w)
+	}
+	return true
+}
+
+// InRange checks if v is in the range [a,b), i.e., a <= v < b, comparing the unsigned offsets of v and b from a; an empty range (a == b) contains nothing.
+func (v Value) InRange(a, b Value) bool {
+	return v-a < b-a
+}
+
+// InWindow reports whether v lies within the window of 'size' sequence numbers
+// that begins at 'first'.
+func (v Value) InWindow(first Value, size Size) bool {
+	return v-first < Value(size)
+}
+
+// Add returns the sequence number following the [v, v+s) window; the uint32 sum wraps around on overflow.
+func (v Value) Add(s Size) Value {
+	return v + Value(s)
+}
+
+// Size returns the size of the window defined by [v, w), computed as the wrapping uint32 difference w-v.
+func (v Value) Size(w Value) Size {
+	return Size(w - v)
+}
+
+// UpdateForward advances v in place by s, so that it becomes v + s.
+func (v *Value) UpdateForward(s Size) {
+	*v = v.Add(s)
+}
diff --git a/pkg/tcpip/stack/BUILD b/pkg/tcpip/stack/BUILD
new file mode 100644
index 000000000..e65c731c2
--- /dev/null
+++ b/pkg/tcpip/stack/BUILD
@@ -0,0 +1,118 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "linkaddrentry_list",
+    out = "linkaddrentry_list.go",
+    package = "stack",
+    prefix = "linkAddrEntry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*linkAddrEntry",
+        "Linker": "*linkAddrEntry",
+    },
+)
+
+go_template_instance(
+    name = "packet_buffer_list",
+    out = "packet_buffer_list.go",
+    package = "stack",
+    prefix = "PacketBuffer",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*PacketBuffer",
+        "Linker": "*PacketBuffer",
+    },
+)
+
+go_library(
+    name = "stack",
+    srcs = [
+        "conntrack.go",
+        "dhcpv6configurationfromndpra_string.go",
+        "forwarder.go",
+        "icmp_rate_limit.go",
+        "iptables.go",
+        "iptables_targets.go",
+        "iptables_types.go",
+        "linkaddrcache.go",
+        "linkaddrentry_list.go",
+        "ndp.go",
+        "nic.go",
+        "packet_buffer.go",
+        "packet_buffer_list.go",
+        "rand.go",
+        "registration.go",
+        "route.go",
+        "stack.go",
+        "stack_global_state.go",
+        "stack_options.go",
+        "transport_demuxer.go",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/ilist",
+        "//pkg/log",
+        "//pkg/rand",
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/hash/jenkins",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/ports",
+        "//pkg/tcpip/seqnum",
+        "//pkg/tcpip/transport/tcpconntrack",
+        "//pkg/waiter",
+        "@org_golang_x_time//rate:go_default_library",
+    ],
+)
+
+go_test(
+    name = "stack_x_test",
+    size = "medium",
+    srcs = [
+        "ndp_test.go",
+        "stack_test.go",
+        "transport_demuxer_test.go",
+        "transport_test.go",
+    ],
+    shard_count = 20,
+    deps = [
+        ":stack",
+        "//pkg/rand",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/ports",
+        "//pkg/tcpip/transport/icmp",
+        "//pkg/tcpip/transport/udp",
+        "//pkg/waiter",
+        "@com_github_google_go-cmp//cmp:go_default_library",
+    ],
+)
+
+go_test(
+    name = "stack_test",
+    size = "small",
+    srcs = [
+        "forwarder_test.go",
+        "linkaddrcache_test.go",
+        "nic_test.go",
+    ],
+    library = ":stack",
+    deps = [
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+    ],
+)
diff --git a/pkg/tcpip/stack/conntrack.go b/pkg/tcpip/stack/conntrack.go
new file mode 100644
index 000000000..af9c325ca
--- /dev/null
+++ b/pkg/tcpip/stack/conntrack.go
@@ -0,0 +1,331 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
+)
+
+// Connection tracking is used to track and manipulate packets for NAT rules.
+// The connection is created for a packet if it does not exist. Every
+// connection contains two tuples (original and reply). The tuples are
+// manipulated if there is a matching NAT rule. The packet is modified by
+// looking at the tuples in the Prerouting and Output hooks.
+//
+// Currently, only TCP tracking is supported.
+
+// Direction of the tuple.
+type direction int
+
+const (
+	dirOriginal direction = iota
+	dirReply
+)
+
+// Manipulation type for the connection.
+type manipType int
+
+const (
+	manipDstPrerouting manipType = iota
+	manipDstOutput
+)
+
+// tuple holds a connection's identifying and manipulating data in one
+// direction. It is immutable.
+type tuple struct {
+	tupleID
+
+	// conn is the connection tracking entry this tuple belongs to.
+	conn *conn
+
+	// direction is the direction of the tuple.
+	direction direction
+}
+
+// tupleID uniquely identifies a connection in one direction. It currently
+// contains enough information to distinguish between any TCP or UDP
+// connection, and will need to be extended to support other protocols.
+type tupleID struct {
+	srcAddr    tcpip.Address
+	srcPort    uint16
+	dstAddr    tcpip.Address
+	dstPort    uint16
+	transProto tcpip.TransportProtocolNumber
+	netProto   tcpip.NetworkProtocolNumber
+}
+
+// reply creates the reply tupleID.
+func (ti tupleID) reply() tupleID {
+	return tupleID{
+		srcAddr:    ti.dstAddr,
+		srcPort:    ti.dstPort,
+		dstAddr:    ti.srcAddr,
+		dstPort:    ti.srcPort,
+		transProto: ti.transProto,
+		netProto:   ti.netProto,
+	}
+}
+
+// conn is a tracked connection.
+type conn struct {
+	// original is the tuple in original direction. It is immutable.
+	original tuple
+
+	// reply is the tuple in reply direction. It is immutable.
+	reply tuple
+
+	// manip is the kind of manipulation applied to this connection's packets. It is immutable.
+	manip manipType
+
+	// tcbHook is the hook at which tcb was initialized; packets seen at that
+	// hook are treated as outbound. handlePacket writes it while holding mu.
+	tcbHook Hook
+
+	// mu protects tcb and the updates of tcbHook made in handlePacket.
+	mu sync.Mutex
+
+	// tcb is the TCP control block. It is used to keep track of the state
+	// of the TCP connection and is protected by mu.
+	tcb tcpconntrack.TCB
+}
+
+// ConnTrack tracks all connections created for NAT rules. Most users are
+// expected to only call handlePacket and createConnFor.
+type ConnTrack struct {
+	// mu protects conns.
+	mu sync.RWMutex
+
+	// conns maintains a map of tuples needed for connection tracking for
+	// iptables NAT rules. It is protected by mu.
+	conns map[tupleID]tuple
+}
+
+// packetToTupleID converts pkt to a tuple ID. It returns ErrUnknownProtocol when
+// a header view of pkt is nil or the IPv4 header does not announce TCP.
+func packetToTupleID(pkt *PacketBuffer) (tupleID, *tcpip.Error) {
+	// TODO(gvisor.dev/issue/170): Need to support other
+	// protocols as well.
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	if netHeader == nil || netHeader.TransportProtocol() != header.TCPProtocolNumber {
+		return tupleID{}, tcpip.ErrUnknownProtocol
+	}
+	tcpHeader := header.TCP(pkt.TransportHeader)
+	if tcpHeader == nil {
+		return tupleID{}, tcpip.ErrUnknownProtocol
+	}
+
+	return tupleID{
+		srcAddr:    netHeader.SourceAddress(),
+		srcPort:    tcpHeader.SourcePort(),
+		dstAddr:    netHeader.DestinationAddress(),
+		dstPort:    tcpHeader.DestinationPort(),
+		transProto: netHeader.TransportProtocol(),
+		netProto:   header.IPv4ProtocolNumber,
+	}, nil
+}
+
+// newConn builds a conn whose original and reply tuples both point back at it.
+func newConn(orig, reply tupleID, manip manipType, hook Hook) *conn {
+	c := &conn{
+		manip:   manip,
+		tcbHook: hook,
+	}
+	c.original = tuple{conn: c, tupleID: orig}
+	c.reply = tuple{conn: c, tupleID: reply, direction: dirReply}
+	return c
+}
+
+// connFor returns the conn tracking pkt and the direction of the matching
+// tuple. It returns (nil, dirOriginal) when pkt has no valid tuple ID (see
+// packetToTupleID) or when no connection is tracked for it.
+// TODO(gvisor.dev/issue/170): Only TCP packets are supported. Need to support
+// other transport protocols.
+func (ct *ConnTrack) connFor(pkt *PacketBuffer) (*conn, direction) {
+	tid, err := packetToTupleID(pkt)
+	if err != nil {
+		return nil, dirOriginal
+	}
+
+	ct.mu.RLock() // Read-only lookup; a shared lock is sufficient.
+	defer ct.mu.RUnlock()
+
+	tuple, ok := ct.conns[tid]
+	if !ok {
+		return nil, dirOriginal
+	}
+	return tuple.conn, tuple.direction
+}
+
+// createConnFor creates a conn for pkt, records both of its tuples in ct and
+// returns it. It returns nil if hook is neither Prerouting nor Output or if no
+// tuple ID can be extracted from pkt.
+func (ct *ConnTrack) createConnFor(pkt *PacketBuffer, hook Hook, rt RedirectTarget) *conn {
+	var manip manipType
+	switch hook {
+	case Prerouting:
+		manip = manipDstPrerouting
+	case Output:
+		manip = manipDstOutput
+	default:
+		return nil
+	}
+
+	origTID, err := packetToTupleID(pkt)
+	if err != nil {
+		return nil
+	}
+
+	// The reply tuple carries the redirect target from the iptables rule as
+	// its source; handlePacket uses it to rewrite matching packets.
+	replyTID := origTID.reply()
+	replyTID.srcAddr = rt.MinIP
+	replyTID.srcPort = rt.MinPort
+	tracked := newConn(origTID, replyTID, manip, hook)
+
+	// Index the connection by both tuple IDs.
+	// TODO(gvisor.dev/issue/170): Need to support collisions using linked
+	// list.
+	ct.mu.Lock()
+	defer ct.mu.Unlock()
+	ct.conns[origTID] = tracked.original
+	ct.conns[replyTID] = tracked.reply
+	return tracked
+}
+
+// handlePacketPrerouting rewrites the port and address of packets in the Prerouting hook.
+// TODO(gvisor.dev/issue/170): Change address for Prerouting hook.
+func handlePacketPrerouting(pkt *PacketBuffer, conn *conn, dir direction) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	tcpHeader := header.TCP(pkt.TransportHeader)
+
+	// For prerouting redirection, packets going in the original direction
+	// have their destinations modified and replies have their sources
+	// modified.
+	switch dir {
+	case dirOriginal:
+		port := conn.reply.srcPort
+		tcpHeader.SetDestinationPort(port)
+		netHeader.SetDestinationAddress(conn.reply.srcAddr)
+	case dirReply:
+		port := conn.original.dstPort
+		tcpHeader.SetSourcePort(port)
+		netHeader.SetSourceAddress(conn.original.dstAddr)
+	}
+
+	netHeader.SetChecksum(0)
+	netHeader.SetChecksum(^netHeader.CalculateChecksum()) // NOTE(review): only the IPv4 header checksum is refreshed here, not the TCP checksum — confirm this is intended.
+}
+
+// handlePacketOutput rewrites the port and address of packets in the Output hook and updates their checksums.
+func handlePacketOutput(pkt *PacketBuffer, conn *conn, gso *GSO, r *Route, dir direction) {
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	tcpHeader := header.TCP(pkt.TransportHeader)
+
+	// For output redirection, packets going in the original direction
+	// have their destinations modified and replies have their sources
+	// modified. For prerouting redirection, we only reach this point
+	// when replying, so packet sources are modified.
+	if conn.manip == manipDstOutput && dir == dirOriginal {
+		port := conn.reply.srcPort
+		tcpHeader.SetDestinationPort(port)
+		netHeader.SetDestinationAddress(conn.reply.srcAddr)
+	} else {
+		port := conn.original.dstPort
+		tcpHeader.SetSourcePort(port)
+		netHeader.SetSourceAddress(conn.original.dstAddr)
+	}
+
+	// Calculate the TCP checksum and set it.
+	tcpHeader.SetChecksum(0)
+	hdr := &pkt.Header
+	length := uint16(pkt.Data.Size()+hdr.UsedLength()) - uint16(netHeader.HeaderLength())
+	xsum := r.PseudoHeaderChecksum(header.TCPProtocolNumber, length)
+	if gso != nil && gso.NeedsCsum {
+		tcpHeader.SetChecksum(xsum) // Only the pseudo-header sum is stored when GSO requests a checksum.
+	} else if r.Capabilities()&CapabilityTXChecksumOffload == 0 {
+		xsum = header.ChecksumVVWithOffset(pkt.Data, xsum, int(tcpHeader.DataOffset()), pkt.Data.Size())
+		tcpHeader.SetChecksum(^tcpHeader.CalculateChecksum(xsum))
+	}
+
+	netHeader.SetChecksum(0)
+	netHeader.SetChecksum(^netHeader.CalculateChecksum())
+}
+
+// handlePacket rewrites the port and address of pkt according to its tracked
+// connection, if one exists, and then advances that connection's TCP state.
+func (ct *ConnTrack) handlePacket(pkt *PacketBuffer, hook Hook, gso *GSO, r *Route) {
+	if pkt.NatDone {
+		return
+	}
+
+	if hook != Prerouting && hook != Output {
+		return
+	}
+
+	conn, dir := ct.connFor(pkt)
+	if conn == nil {
+		// Connection not found for the packet or the packet is invalid.
+		return
+	}
+
+	switch hook {
+	case Prerouting:
+		handlePacketPrerouting(pkt, conn, dir)
+	case Output:
+		handlePacketOutput(pkt, conn, gso, r, dir)
+	}
+	pkt.NatDone = true // Checked on entry so a packet is manipulated only once.
+
+	// Update the state of tcb.
+	// TODO(gvisor.dev/issue/170): Add support in tcpconntrack to handle
+	// other tcp states.
+	conn.mu.Lock()
+	defer conn.mu.Unlock()
+	var st tcpconntrack.Result
+	tcpHeader := header.TCP(pkt.TransportHeader)
+	if conn.tcb.IsEmpty() {
+		conn.tcb.Init(tcpHeader)
+		conn.tcbHook = hook // Later packets seen at this hook count as outbound.
+	} else {
+		switch hook {
+		case conn.tcbHook:
+			st = conn.tcb.UpdateStateOutbound(tcpHeader)
+		default:
+			st = conn.tcb.UpdateStateInbound(tcpHeader)
+		}
+	}
+
+	// Delete conn once the TCP connection has been closed or reset.
+	if st == tcpconntrack.ResultClosedByPeer || st == tcpconntrack.ResultClosedBySelf || st == tcpconntrack.ResultReset {
+		ct.deleteConn(conn)
+	}
+}
+
+// deleteConn removes both of conn's tuples from ct's connection map; a nil conn is a no-op.
+func (ct *ConnTrack) deleteConn(conn *conn) {
+	if conn == nil {
+		return
+	}
+
+	ct.mu.Lock()
+	defer ct.mu.Unlock()
+
+	delete(ct.conns, conn.original.tupleID)
+	delete(ct.conns, conn.reply.tupleID)
+}
diff --git a/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
new file mode 100644
index 000000000..d199ded6a
--- /dev/null
+++ b/pkg/tcpip/stack/dhcpv6configurationfromndpra_string.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Code generated by "stringer -type DHCPv6ConfigurationFromNDPRA"; DO NOT EDIT.
+
+package stack
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[DHCPv6NoConfiguration-1]
+	_ = x[DHCPv6ManagedAddress-2]
+	_ = x[DHCPv6OtherConfigurations-3]
+}
+
+const _DHCPv6ConfigurationFromNDPRA_name = "DHCPv6NoConfigurationDHCPv6ManagedAddressDHCPv6OtherConfigurations"
+
+var _DHCPv6ConfigurationFromNDPRA_index = [...]uint8{0, 21, 41, 66}
+
+func (i DHCPv6ConfigurationFromNDPRA) String() string {
+	i -= 1
+	if i < 0 || i >= DHCPv6ConfigurationFromNDPRA(len(_DHCPv6ConfigurationFromNDPRA_index)-1) {
+		return "DHCPv6ConfigurationFromNDPRA(" + strconv.FormatInt(int64(i+1), 10) + ")"
+	}
+	return _DHCPv6ConfigurationFromNDPRA_name[_DHCPv6ConfigurationFromNDPRA_index[i]:_DHCPv6ConfigurationFromNDPRA_index[i+1]]
+}
diff --git a/pkg/tcpip/stack/forwarder.go b/pkg/tcpip/stack/forwarder.go
new file mode 100644
index 000000000..3eff141e6
--- /dev/null
+++ b/pkg/tcpip/stack/forwarder.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	// maxPendingResolutions is the maximum number of pending link-address
+	// resolutions; maxPendingPacketsPerResolution caps each one's packet queue.
+	maxPendingResolutions          = 64
+	maxPendingPacketsPerResolution = 256
+)
+
+type pendingPacket struct {
+	nic   *NIC
+	route *Route
+	proto tcpip.NetworkProtocolNumber
+	pkt   *PacketBuffer
+}
+
+type forwardQueue struct {
+	sync.Mutex // Held by the methods below while touching packets or cancelChans.
+
+	// packets maps a resolution-done channel to the packets to send once it completes.
+	packets map[<-chan struct{}][]*pendingPacket
+
+	// cancelChans is a FIFO (oldest first) of channels; closing one cancels
+	// the goroutine waiting on the corresponding link-address resolution.
+	cancelChans []chan struct{}
+}
+
+func newForwardQueue() *forwardQueue {
+	return &forwardQueue{packets: make(map[<-chan struct{}][]*pendingPacket)}
+}
+
+func (f *forwardQueue) enqueue(ch <-chan struct{}, n *NIC, r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	shouldWait := false
+
+	f.Lock()
+	packets, ok := f.packets[ch]
+	if !ok {
+		shouldWait = true
+	}
+	for len(packets) == maxPendingPacketsPerResolution {
+		p := packets[0]
+		packets = packets[1:]
+		p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+		p.route.Release()
+	}
+	if l := len(packets); l >= maxPendingPacketsPerResolution {
+		panic(fmt.Sprintf("max pending packets for resolution reached; got %d packets, max = %d", l, maxPendingPacketsPerResolution))
+	}
+	f.packets[ch] = append(packets, &pendingPacket{
+		nic:   n,
+		route: r,
+		proto: protocol,
+		pkt:   pkt,
+	})
+	f.Unlock()
+
+	if !shouldWait {
+		return
+	}
+
+	// Wait for the link-address resolution to complete.
+	// Start a goroutine with a forwarding-cancel channel so that we can
+	// limit the maximum number of goroutines running concurrently.
+	cancel := f.newCancelChannel()
+	go func() {
+		cancelled := false
+		select {
+		case <-ch:
+		case <-cancel:
+			cancelled = true
+		}
+
+		f.Lock()
+		packets := f.packets[ch]
+		delete(f.packets, ch)
+		f.Unlock()
+
+		for _, p := range packets {
+			if cancelled {
+				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+			} else if _, err := p.route.Resolve(nil); err != nil {
+				p.nic.stack.stats.IP.OutgoingPacketErrors.Increment()
+			} else {
+				p.nic.forwardPacket(p.route, p.proto, p.pkt)
+			}
+			p.route.Release()
+		}
+	}()
+}
+
+// newCancelChannel returns a fresh channel that, once closed, cancels one
+// pending forwarding activity. If maxPendingResolutions channels are already
+// tracked, the oldest one is closed and dropped first.
+func (f *forwardQueue) newCancelChannel() chan struct{} {
+	f.Lock()
+	defer f.Unlock()
+
+	// NOTE(review): nothing in this file removes a channel when its resolution
+	// completes, so the channel closed here may belong to a finished waiter.
+	if len(f.cancelChans) == maxPendingResolutions {
+		close(f.cancelChans[0])
+		f.cancelChans = f.cancelChans[1:]
+	}
+	if tracked := len(f.cancelChans); tracked >= maxPendingResolutions {
+		panic(fmt.Sprintf("max pending resolutions reached; got %d active resolutions, max = %d", tracked, maxPendingResolutions))
+	}
+	fresh := make(chan struct{})
+	f.cancelChans = append(f.cancelChans, fresh)
+	return fresh
+}
diff --git a/pkg/tcpip/stack/forwarder_test.go b/pkg/tcpip/stack/forwarder_test.go
new file mode 100644
index 000000000..a6546cef0
--- /dev/null
+++ b/pkg/tcpip/stack/forwarder_test.go
@@ -0,0 +1,650 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"encoding/binary"
+	"math"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+const (
+	fwdTestNetNumber           tcpip.NetworkProtocolNumber = math.MaxUint32
+	fwdTestNetHeaderLen                                    = 12
+	fwdTestNetDefaultPrefixLen                             = 8
+
+	// fwdTestNetDefaultMTU is the MTU, in bytes, used throughout the tests,
+	// except where another value is explicitly used. It is chosen to match
+	// the MTU of loopback interfaces on linux systems.
+	fwdTestNetDefaultMTU = 65536
+
+	dstAddrOffset        = 0
+	srcAddrOffset        = 1
+	protocolNumberOffset = 2
+)
+
+// fwdTestNetworkEndpoint is a network-layer protocol endpoint.
+// Headers of this protocol are fwdTestNetHeaderLen bytes, but we currently only
+// use the first three: destination address, source address, and transport
+// protocol. They're all one byte fields to simplify parsing.
+type fwdTestNetworkEndpoint struct {
+	nicID      tcpip.NICID
+	id         NetworkEndpointID
+	prefixLen  int
+	proto      *fwdTestNetworkProtocol
+	dispatcher TransportDispatcher
+	ep         LinkEndpoint
+}
+
+func (f *fwdTestNetworkEndpoint) MTU() uint32 {
+	return f.ep.MTU() - uint32(f.MaxHeaderLength())
+}
+
+func (f *fwdTestNetworkEndpoint) NICID() tcpip.NICID {
+	return f.nicID
+}
+
+func (f *fwdTestNetworkEndpoint) PrefixLen() int {
+	return f.prefixLen
+}
+
+func (*fwdTestNetworkEndpoint) DefaultTTL() uint8 {
+	return 123
+}
+
+func (f *fwdTestNetworkEndpoint) ID() *NetworkEndpointID {
+	return &f.id
+}
+
+func (f *fwdTestNetworkEndpoint) HandlePacket(r *Route, pkt *PacketBuffer) {
+	// Dispatch the packet to the transport protocol.
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), pkt)
+}
+
+func (f *fwdTestNetworkEndpoint) MaxHeaderLength() uint16 {
+	return f.ep.MaxHeaderLength() + fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 {
+	return 0
+}
+
+func (f *fwdTestNetworkEndpoint) Capabilities() LinkEndpointCapabilities {
+	return f.ep.Capabilities()
+}
+
+func (f *fwdTestNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return f.proto.Number()
+}
+
+func (f *fwdTestNetworkEndpoint) WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
+	// Add the protocol's header to the packet and send it to the link
+	// endpoint.
+	b := pkt.Header.Prepend(fwdTestNetHeaderLen)
+	b[dstAddrOffset] = r.RemoteAddress[0]
+	b[srcAddrOffset] = f.id.LocalAddress[0]
+	b[protocolNumberOffset] = byte(params.Protocol)
+
+	return f.ep.WritePacket(r, gso, fwdTestNetNumber, pkt)
+}
+
+// WritePackets implements NetworkEndpoint.WritePackets. It always panics.
+func (f *fwdTestNetworkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+func (*fwdTestNetworkEndpoint) WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+func (*fwdTestNetworkEndpoint) Close() {}
+
+// fwdTestNetworkProtocol is a network-layer protocol that implements Address
+// resolution.
+type fwdTestNetworkProtocol struct {
+	addrCache              *linkAddrCache
+	addrResolveDelay       time.Duration
+	onLinkAddressResolved  func(cache *linkAddrCache, addr tcpip.Address)
+	onResolveStaticAddress func(tcpip.Address) (tcpip.LinkAddress, bool)
+}
+
+func (f *fwdTestNetworkProtocol) Number() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+func (f *fwdTestNetworkProtocol) MinimumPacketSize() int {
+	return fwdTestNetHeaderLen
+}
+
+func (f *fwdTestNetworkProtocol) DefaultPrefixLen() int {
+	return fwdTestNetDefaultPrefixLen
+}
+
+func (*fwdTestNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
+}
+
+func (*fwdTestNetworkProtocol) Parse(pkt *PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	netHeader, ok := pkt.Data.PullUp(fwdTestNetHeaderLen)
+	if !ok {
+		return 0, false, false
+	}
+	pkt.NetworkHeader = netHeader
+	pkt.Data.TrimFront(fwdTestNetHeaderLen)
+	return tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), true, true
+}
+
+func (f *fwdTestNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, ep LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
+	return &fwdTestNetworkEndpoint{
+		nicID:      nicID,
+		id:         NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen:  addrWithPrefix.PrefixLen,
+		proto:      f,
+		dispatcher: dispatcher,
+		ep:         ep,
+	}, nil
+}
+
+func (f *fwdTestNetworkProtocol) SetOption(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (f *fwdTestNetworkProtocol) Option(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func (f *fwdTestNetworkProtocol) Close() {}
+
+func (f *fwdTestNetworkProtocol) Wait() {}
+
+func (f *fwdTestNetworkProtocol) LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error {
+	if f.addrCache != nil && f.onLinkAddressResolved != nil {
+		time.AfterFunc(f.addrResolveDelay, func() {
+			f.onLinkAddressResolved(f.addrCache, addr)
+		})
+	}
+	return nil
+}
+
+func (f *fwdTestNetworkProtocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if f.onResolveStaticAddress != nil {
+		return f.onResolveStaticAddress(addr)
+	}
+	return "", false
+}
+
+func (f *fwdTestNetworkProtocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return fwdTestNetNumber
+}
+
+// fwdTestPacketInfo holds all the information about an outbound packet.
+type fwdTestPacketInfo struct {
+	RemoteLinkAddress tcpip.LinkAddress
+	LocalLinkAddress  tcpip.LinkAddress
+	Pkt               *PacketBuffer
+}
+
+type fwdTestLinkEndpoint struct {
+	dispatcher NetworkDispatcher
+	mtu        uint32
+	linkAddr   tcpip.LinkAddress
+
+	// C is where outbound packets are queued.
+	C chan fwdTestPacketInfo
+}
+
+// InjectInbound injects an inbound packet.
+func (e *fwdTestLinkEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	e.InjectLinkAddr(protocol, "", pkt)
+}
+
+// InjectLinkAddr injects an inbound packet with a remote link address.
+func (e *fwdTestLinkEndpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *PacketBuffer) {
+	e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
+}
+
+// Attach saves the stack network-layer dispatcher for use later when packets
+// are injected.
+func (e *fwdTestLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *fwdTestLinkEndpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
+// during construction.
+func (e *fwdTestLinkEndpoint) MTU() uint32 {
+	return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. It reports only
+// CapabilityResolutionRequired.
+func (*fwdTestLinkEndpoint) Capabilities() LinkEndpointCapabilities {
+	return CapabilityResolutionRequired
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (*fwdTestLinkEndpoint) GSOMaxSize() uint32 {
+	return 1 << 15
+}
+
+// MaxHeaderLength returns the maximum size of the link layer header. Given it
+// doesn't have a header, it just returns 0.
+func (*fwdTestLinkEndpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (e *fwdTestLinkEndpoint) LinkAddress() tcpip.LinkAddress {
+	return e.linkAddr
+}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket. It queues the packet
+// on e.C without blocking; if e.C is full the packet is dropped and nil returned.
+func (e *fwdTestLinkEndpoint) WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		RemoteLinkAddress: r.RemoteLinkAddress,
+		LocalLinkAddress:  r.LocalLinkAddress,
+		Pkt:               pkt,
+	}
+	select {
+	case e.C <- p:
+	default:
+	}
+	return nil
+}
+
+// WritePackets stores outbound packets into the channel.
+func (e *fwdTestLinkEndpoint) WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	n := 0
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		e.WritePacket(r, gso, protocol, pkt)
+		n++
+	}
+
+	return n, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *fwdTestLinkEndpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+	p := fwdTestPacketInfo{
+		Pkt: &PacketBuffer{Data: vv},
+	}
+
+	select {
+	case e.C <- p:
+	default:
+	}
+
+	return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*fwdTestLinkEndpoint) Wait() {}
+
+func fwdTestNetFactory(t *testing.T, proto *fwdTestNetworkProtocol) (ep1, ep2 *fwdTestLinkEndpoint) {
+	// Create a stack with the network protocol and two NICs.
+	s := New(Options{
+		NetworkProtocols: []NetworkProtocol{proto},
+	})
+
+	proto.addrCache = s.linkAddrCache
+
+	// Enable forwarding.
+	s.SetForwarding(true)
+
+	// NIC 1 has the link address "a", and added the network address 1.
+	ep1 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "a",
+	}
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatal("CreateNIC #1 failed:", err)
+	}
+	if err := s.AddAddress(1, fwdTestNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress #1 failed:", err)
+	}
+
+	// NIC 2 has the link address "b", and added the network address 2.
+	ep2 = &fwdTestLinkEndpoint{
+		C:        make(chan fwdTestPacketInfo, 300),
+		mtu:      fwdTestNetDefaultMTU,
+		linkAddr: "b",
+	}
+	if err := s.CreateNIC(2, ep2); err != nil {
+		t.Fatal("CreateNIC #2 failed:", err)
+	}
+	if err := s.AddAddress(2, fwdTestNetNumber, "\x02"); err != nil {
+		t.Fatal("AddAddress #2 failed:", err)
+	}
+
+	// Route all packets to NIC 2.
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, NIC: 2}})
+	}
+
+	return ep1, ep2
+}
+
+func TestForwardingWithStaticResolver(t *testing.T) {
+	// Create a network protocol with a static resolver.
+	proto := &fwdTestNetworkProtocol{
+		onResolveStaticAddress:
+		// The network address 3 is resolved to the link address "c".
+		func(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+			if addr == "\x03" {
+				return "c", true
+			}
+			return "", false
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	default:
+		t.Fatal("packet not forwarded")
+	}
+
+	// Test that the static address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithFakeResolver(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any address will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	case <-time.After(time.Second):
+		t.Fatal("packet not forwarded")
+	}
+
+	// Test that the address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithNoResolver(t *testing.T) {
+	// Create a network protocol without a resolver.
+	proto := &fwdTestNetworkProtocol{}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 3 on NIC 1. With no resolver
+	// configured, it must not be forwarded to NIC 2.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	select {
+	case <-ep2.C:
+		t.Fatal("Packet should not be forwarded")
+	case <-time.After(time.Second): // Nothing arrived within a second: pass.
+	}
+}
+
+func TestForwardingWithFakeResolverPartialTimeout(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Only packets to address 3 will be resolved to the
+			// link address "c".
+			if addr == "\x03" {
+				cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+			}
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject an inbound packet to address 4 on NIC 1. This packet should
+	// not be forwarded.
+	buf := buffer.NewView(30)
+	buf[dstAddrOffset] = 4
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	// Inject an inbound packet to address 3 on NIC 1, and see if it is
+	// forwarded to NIC 2.
+	buf = buffer.NewView(30)
+	buf[dstAddrOffset] = 3
+	ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	var p fwdTestPacketInfo
+
+	select {
+	case p = <-ep2.C:
+	case <-time.After(time.Second):
+		t.Fatal("packet not forwarded")
+	}
+
+	if p.Pkt.NetworkHeader[dstAddrOffset] != 3 {
+		t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", p.Pkt.NetworkHeader[dstAddrOffset])
+	}
+
+	// Test that the address resolution happened correctly.
+	if p.RemoteLinkAddress != "c" {
+		t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+	}
+	if p.LocalLinkAddress != "b" {
+		t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+	}
+}
+
+func TestForwardingWithFakeResolverTwoPackets(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	// Inject two inbound packets to address 3 on NIC 1.
+	for i := 0; i < 2; i++ {
+		buf := buffer.NewView(30)
+		buf[dstAddrOffset] = 3
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < 2; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		if p.Pkt.NetworkHeader[dstAddrOffset] != 3 {
+			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want = 3", p.Pkt.NetworkHeader[dstAddrOffset])
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
+
+func TestForwardingWithFakeResolverManyPackets(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	for i := 0; i < maxPendingPacketsPerResolution+5; i++ {
+		// Inject inbound 'maxPendingPacketsPerResolution + 5' packets on NIC 1.
+		buf := buffer.NewView(30)
+		buf[dstAddrOffset] = 3
+		// Set the packet sequence number.
+		binary.BigEndian.PutUint16(buf[fwdTestNetHeaderLen:], uint16(i))
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < maxPendingPacketsPerResolution; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		if b := p.Pkt.Header.View(); b[dstAddrOffset] != 3 {
+			t.Fatalf("got b[dstAddrOffset] = %d, want = 3", b[dstAddrOffset])
+		}
+		seqNumBuf, ok := p.Pkt.Data.PullUp(2) // The sequence number is a uint16 (2 bytes).
+		if !ok {
+			t.Fatalf("p.Pkt.Data is too short to hold a sequence number: %d", p.Pkt.Data.Size())
+		}
+
+		// The first 5 packets should not be forwarded so the sequence number should
+		// start with 5.
+		want := uint16(i + 5)
+		if n := binary.BigEndian.Uint16(seqNumBuf); n != want {
+			t.Fatalf("got the packet #%d, want = #%d", n, want)
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
+
+func TestForwardingWithFakeResolverManyResolutions(t *testing.T) {
+	// Create a network protocol with a fake resolver.
+	proto := &fwdTestNetworkProtocol{
+		addrResolveDelay: 500 * time.Millisecond,
+		onLinkAddressResolved: func(cache *linkAddrCache, addr tcpip.Address) {
+			// Any packets will be resolved to the link address "c".
+			cache.add(tcpip.FullAddress{NIC: 2, Addr: addr}, "c")
+		},
+	}
+
+	ep1, ep2 := fwdTestNetFactory(t, proto)
+
+	for i := 0; i < maxPendingResolutions+5; i++ {
+		// Inject inbound 'maxPendingResolutions + 5' packets on NIC 1.
+		// Each packet has a different destination address (3 to
+		// maxPendingResolutions + 7).
+		buf := buffer.NewView(30)
+		buf[dstAddrOffset] = byte(3 + i)
+		ep1.InjectInbound(fwdTestNetNumber, &PacketBuffer{
+			Data: buf.ToVectorisedView(),
+		})
+	}
+
+	for i := 0; i < maxPendingResolutions; i++ {
+		var p fwdTestPacketInfo
+
+		select {
+		case p = <-ep2.C:
+		case <-time.After(time.Second):
+			t.Fatal("packet not forwarded")
+		}
+
+		// The first 5 packets (address 3 to 7) should not be forwarded
+		// because their address resolutions are interrupted.
+		if p.Pkt.NetworkHeader[dstAddrOffset] < 8 {
+			t.Fatalf("got p.Pkt.NetworkHeader[dstAddrOffset] = %d, want p.Pkt.NetworkHeader[dstAddrOffset] >= 8", p.Pkt.NetworkHeader[dstAddrOffset])
+		}
+
+		// Test that the address resolution happened correctly.
+		if p.RemoteLinkAddress != "c" {
+			t.Fatalf("got p.RemoteLinkAddress = %s, want = c", p.RemoteLinkAddress)
+		}
+		if p.LocalLinkAddress != "b" {
+			t.Fatalf("got p.LocalLinkAddress = %s, want = b", p.LocalLinkAddress)
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/icmp_rate_limit.go b/pkg/tcpip/stack/icmp_rate_limit.go
new file mode 100644
index 000000000..3a20839da
--- /dev/null
+++ b/pkg/tcpip/stack/icmp_rate_limit.go
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"golang.org/x/time/rate"
+)
+
+const (
+	// icmpLimit is the default maximum number of ICMP messages permitted per
+	// second by this rate limiter.
+	icmpLimit = 1000
+
+	// icmpBurst is the default number of ICMP messages that can be sent in a single
+	// burst.
+	icmpBurst = 50
+)
+
+// ICMPRateLimiter is a global rate limiter that controls the generation of
+// ICMP messages generated by the stack.
+type ICMPRateLimiter struct {
+	*rate.Limiter
+}
+
+// NewICMPRateLimiter returns a rate limiter for ICMP messages generated by the
+// stack, configured with the default icmpLimit rate and icmpBurst burst size.
+func NewICMPRateLimiter() *ICMPRateLimiter {
+	return &ICMPRateLimiter{Limiter: rate.NewLimiter(icmpLimit, icmpBurst)}
+}
diff --git a/pkg/tcpip/stack/iptables.go b/pkg/tcpip/stack/iptables.go
new file mode 100644
index 000000000..974d77c36
--- /dev/null
+++ b/pkg/tcpip/stack/iptables.go
@@ -0,0 +1,367 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// Names of the tables that DefaultTables creates.
+const (
+	TablenameNat    = "nat"
+	TablenameMangle = "mangle"
+	TablenameFilter = "filter"
+)
+
+// Chain names as defined by net/ipv4/netfilter/ip_tables.c.
+const (
+	ChainNamePrerouting  = "PREROUTING"
+	ChainNameInput       = "INPUT"
+	ChainNameForward     = "FORWARD"
+	ChainNameOutput      = "OUTPUT"
+	ChainNamePostrouting = "POSTROUTING"
+)
+
+// HookUnset is the rule index recorded for a hook whose entrypoint or
+// underflow has not been set, as in EmptyFilterTable and EmptyNatTable.
+const HookUnset = -1
+
+// DefaultTables returns the nat, mangle and filter tables. Every built-in
+// chain is a single accept rule; each table ends with an ErrorTarget rule.
+func DefaultTables() *IPTables {
+	// TODO(gvisor.dev/issue/170): We may be able to swap out some strings for
+	// iotas.
+	return &IPTables{
+		tables: map[string]Table{
+			TablenameNat: {
+				Rules: []Rule{
+					{Target: AcceptTarget{}},
+					{Target: AcceptTarget{}},
+					{Target: AcceptTarget{}},
+					{Target: AcceptTarget{}},
+					{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
+				},
+				Underflows: map[Hook]int{
+					Prerouting:  0,
+					Input:       1,
+					Output:      2,
+					Postrouting: 3,
+				},
+				UserChains: map[string]int{},
+			},
+			TablenameMangle: {
+				Rules: []Rule{
+					{Target: AcceptTarget{}},
+					{Target: AcceptTarget{}},
+					{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
+				},
+				Underflows: map[Hook]int{
+					Prerouting: 0,
+					Output:     1,
+				},
+				UserChains: map[string]int{},
+			},
+			TablenameFilter: {
+				Rules: []Rule{
+					{Target: AcceptTarget{}},
+					{Target: AcceptTarget{}},
+					{Target: AcceptTarget{}},
+					{Target: ErrorTarget{}},
+				},
+				BuiltinChains: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				Underflows: map[Hook]int{
+					Input:   0,
+					Forward: 1,
+					Output:  2,
+				},
+				UserChains: map[string]int{},
+			},
+		},
+		priorities: map[Hook][]string{
+			Input:      {TablenameNat, TablenameFilter},
+			Prerouting: {TablenameMangle, TablenameNat},
+			Output:     {TablenameMangle, TablenameNat, TablenameFilter},
+		},
+		connections: ConnTrack{
+			conns: make(map[tupleID]tuple),
+		},
+	}
+}
+
+// EmptyFilterTable builds a filter table that has no rules yet: the Input,
+// Forward and Output hooks have both their entrypoint and their underflow
+// set to HookUnset, and there are no user chains.
+func EmptyFilterTable() Table {
+	filterHooks := []Hook{Input, Forward, Output}
+	tbl := Table{
+		Rules:         []Rule{},
+		BuiltinChains: make(map[Hook]int, len(filterHooks)),
+		Underflows:    make(map[Hook]int, len(filterHooks)),
+		UserChains:    map[string]int{},
+	}
+	// Mark every hook's entrypoint and underflow as unset.
+	for _, h := range filterHooks {
+		tbl.BuiltinChains[h] = HookUnset
+		tbl.Underflows[h] = HookUnset
+	}
+	return tbl
+}
+
+// EmptyNatTable builds a nat table that has no rules yet: the Prerouting,
+// Input, Output and Postrouting hooks have both their entrypoint and their
+// underflow set to HookUnset, and there are no user chains.
+func EmptyNatTable() Table {
+	natHooks := []Hook{Prerouting, Input, Output, Postrouting}
+	tbl := Table{
+		Rules:         []Rule{},
+		BuiltinChains: make(map[Hook]int, len(natHooks)),
+		Underflows:    make(map[Hook]int, len(natHooks)),
+		UserChains:    map[string]int{},
+	}
+
+	// Mark every hook's entrypoint and underflow as unset.
+	for _, h := range natHooks {
+		tbl.BuiltinChains[h] = HookUnset
+		tbl.Underflows[h] = HookUnset
+	}
+
+	return tbl
+}
+
+// GetTable returns the table stored under name; the bool reports presence.
+func (it *IPTables) GetTable(name string) (Table, bool) {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	tbl, found := it.tables[name]
+	return tbl, found
+}
+
+// ReplaceTable stores table under name and marks the tables as modified.
+func (it *IPTables) ReplaceTable(name string, table Table) {
+	it.mu.Lock()
+	defer it.mu.Unlock()
+	it.modified = true
+	it.tables[name] = table
+}
+
+// GetPriorities returns the stored (not copied) table-name list for hook.
+func (it *IPTables) GetPriorities(hook Hook) []string {
+	it.mu.RLock()
+	defer it.mu.RUnlock()
+	return it.priorities[hook]
+}
+
+// A chainVerdict is what a table decides should be done with a packet.
+type chainVerdict int
+
+const (
+	// chainAccept indicates the packet should continue through netstack.
+	chainAccept chainVerdict = iota
+
+	// chainDrop indicates the packet should be dropped.
+	chainDrop
+
+	// chainReturn indicates the packet should return to the calling chain
+	// or the underflow rule of a builtin chain.
+	chainReturn
+)
+
+// Check runs pkt through the rules for hook, visiting tables in the hook's
+// priority order. It returns true when the packet should continue traversing
+// the network stack and false when it should be dropped.
+//
+// Precondition: pkt.NetworkHeader is set.
+func (it *IPTables) Check(hook Hook, pkt *PacketBuffer, gso *GSO, r *Route, address tcpip.Address, nicName string) bool {
+	// Many users never configure iptables. Spare them the cost of rule
+	// traversal if rules have never been set.
+	it.mu.RLock()
+	if !it.modified {
+		it.mu.RUnlock()
+		return true
+	}
+	it.mu.RUnlock()
+
+	// Hand the packet to connection tracking first. Packets are manipulated
+	// only if connection and matching NAT rule exists.
+	it.connections.handlePacket(pkt, hook, gso, r)
+
+	// Go through each table listed for the hook, in priority order.
+	for _, tablename := range it.GetPriorities(hook) {
+		table, _ := it.GetTable(tablename)
+		ruleIdx := table.BuiltinChains[hook]
+		switch verdict := it.checkChain(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+		// If the table returns Accept, move on to the next table.
+		case chainAccept:
+			continue
+		// The Drop verdict is final.
+		case chainDrop:
+			return false
+		case chainReturn:
+			// Any Return from a built-in chain means we have to
+			// run the underflow rule's target to get the verdict.
+			underflow := table.Rules[table.Underflows[hook]]
+			switch v, _ := underflow.Target.Action(pkt, &it.connections, hook, gso, r, address); v {
+			case RuleAccept:
+				continue
+			case RuleDrop:
+				return false
+			case RuleJump, RuleReturn:
+				panic("Underflows should only return RuleAccept or RuleDrop.")
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", v))
+			}
+
+		default:
+			panic(fmt.Sprintf("Unknown verdict %v.", verdict))
+		}
+	}
+
+	// Every table returned Accept (or the hook has no tables at all).
+	return true
+}
+
+// CheckPackets runs each packet of pkts not yet marked NatDone through the
+// rules for hook. drop holds the packets that should not go forward and
+// natPkts those that became NatDone; either map is nil when it has no entries.
+//
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pk.NetworkHeader will always be set as a
+// precondition.
+//
+// NOTE: unlike the Check API the returned map contains packets that should be
+// dropped.
+func (it *IPTables) CheckPackets(hook Hook, pkts PacketBufferList, gso *GSO, r *Route, nicName string) (drop map[*PacketBuffer]struct{}, natPkts map[*PacketBuffer]struct{}) {
+	for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+		if !pkt.NatDone {
+			if ok := it.Check(hook, pkt, gso, r, "", nicName); !ok {
+				if drop == nil {
+					drop = make(map[*PacketBuffer]struct{})
+				}
+				drop[pkt] = struct{}{}
+			}
+			if pkt.NatDone {
+				if natPkts == nil {
+					natPkts = make(map[*PacketBuffer]struct{})
+				}
+				natPkts[pkt] = struct{}{}
+			}
+		}
+	}
+	return drop, natPkts
+}
+
+// checkChain walks table's rules from ruleIdx until one yields a verdict.
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a precondition.
+func (it *IPTables) checkChain(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) chainVerdict {
+	// Start from ruleIdx and walk the list of rules until a rule gives us
+	// a verdict.
+	for ruleIdx < len(table.Rules) {
+		switch verdict, jumpTo := it.checkRule(hook, pkt, table, ruleIdx, gso, r, address, nicName); verdict {
+		case RuleAccept:
+			return chainAccept
+
+		case RuleDrop:
+			return chainDrop
+
+		case RuleReturn:
+			return chainReturn
+
+		case RuleJump:
+			// "Jumping" to the next rule just means we're continuing on
+			// down the list; any other index is walked as a nested chain.
+			if jumpTo == ruleIdx+1 {
+				ruleIdx++
+				continue
+			}
+			switch verdict := it.checkChain(hook, pkt, table, jumpTo, gso, r, address, nicName); verdict {
+			case chainAccept:
+				return chainAccept
+			case chainDrop:
+				return chainDrop
+			case chainReturn:
+				ruleIdx++
+				continue
+			default:
+				panic(fmt.Sprintf("Unknown verdict: %d", verdict))
+			}
+
+		default:
+			panic(fmt.Sprintf("Unknown verdict: %d", verdict))
+		}
+
+	}
+
+	// We got through the entire table without a decision. Default to DROP
+	// for safety.
+	return chainDrop
+}
+
+// checkRule tests pkt against rule ruleIdx of table and, on a match, runs its target.
+// Precondition: pkt is a IPv4 packet of at least length header.IPv4MinimumSize.
+// TODO(gvisor.dev/issue/170): pkt.NetworkHeader will always be set as a precondition.
+func (it *IPTables) checkRule(hook Hook, pkt *PacketBuffer, table Table, ruleIdx int, gso *GSO, r *Route, address tcpip.Address, nicName string) (RuleVerdict, int) {
+	rule := table.Rules[ruleIdx]
+
+	// If pkt.NetworkHeader hasn't been set yet, it will be contained in
+	// pkt.Data.
+	if pkt.NetworkHeader == nil {
+		var ok bool
+		pkt.NetworkHeader, ok = pkt.Data.PullUp(header.IPv4MinimumSize)
+		if !ok {
+			// Precondition has been violated.
+			panic(fmt.Sprintf("iptables checks require IPv4 headers of at least %d bytes", header.IPv4MinimumSize))
+		}
+	}
+
+	// Check whether the packet matches the IP header filter.
+	if !rule.Filter.match(header.IPv4(pkt.NetworkHeader), hook, nicName) {
+		// Continue on to the next rule.
+		return RuleJump, ruleIdx + 1
+	}
+
+	// Go through each rule matcher. If they all match, run the rule target.
+	// NOTE(review): matchers get "" as interface name, not nicName - confirm.
+	for _, matcher := range rule.Matchers {
+		matches, hotdrop := matcher.Match(hook, pkt, "")
+		if hotdrop {
+			return RuleDrop, 0
+		}
+		if !matches {
+			// Continue on to the next rule.
+			return RuleJump, ruleIdx + 1
+		}
+	}
+
+	// All the matchers matched, so run the target.
+	return rule.Target.Action(pkt, &it.connections, hook, gso, r, address)
+}
diff --git a/pkg/tcpip/stack/iptables_targets.go b/pkg/tcpip/stack/iptables_targets.go
new file mode 100644
index 000000000..d43f60c67
--- /dev/null
+++ b/pkg/tcpip/stack/iptables_targets.go
@@ -0,0 +1,164 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// AcceptTarget accepts packets.
+type AcceptTarget struct{}
+
+// Action implements Target.Action. It always returns RuleAccept.
+func (AcceptTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	return RuleAccept, 0
+}
+
+// DropTarget drops packets.
+type DropTarget struct{}
+
+// Action implements Target.Action. It always returns RuleDrop.
+func (DropTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	return RuleDrop, 0
+}
+
+// ErrorTarget logs a debug message and drops the packet. It represents a
+// target that should be unreachable.
+type ErrorTarget struct{}
+
+// Action implements Target.Action. It logs, then returns RuleDrop.
+func (ErrorTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	log.Debugf("ErrorTarget triggered.")
+	return RuleDrop, 0
+}
+
+// UserChainTarget marks a rule as the beginning of a user chain.
+type UserChainTarget struct {
+	Name string
+}
+
+// Action implements Target.Action. It always panics.
+func (UserChainTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	panic("UserChainTarget should never be called.")
+}
+
+// ReturnTarget returns from the current chain. If the chain is a built-in, the
+// hook's underflow should be called.
+type ReturnTarget struct{}
+
+// Action implements Target.Action. It always returns RuleReturn.
+func (ReturnTarget) Action(*PacketBuffer, *ConnTrack, Hook, *GSO, *Route, tcpip.Address) (RuleVerdict, int) {
+	return RuleReturn, 0
+}
+
+// RedirectTarget redirects the packet by modifying the destination port/IP.
+// Min and Max values for IP and Ports in the struct indicate the range of
+// values which can be used to redirect.
+type RedirectTarget struct {
+	// TODO(gvisor.dev/issue/170): Other flags need to be added after
+	// we support them.
+	// RangeProtoSpecified flag indicates single port is specified to
+	// redirect.
+	RangeProtoSpecified bool
+
+	// MinIP indicates address used to redirect; Action overwrites its copy.
+	MinIP tcpip.Address
+
+	// MaxIP indicates address used to redirect; Action overwrites its copy.
+	MaxIP tcpip.Address
+
+	// MinPort indicates port used to redirect; Action's UDP path writes it.
+	MinPort uint16
+
+	// MaxPort indicates port used to redirect.
+	MaxPort uint16
+}
+
+// Action implements Target.Action. It rewrites the destination of UDP packets
+// in place and hands TCP packets to connection tracking; packets of any other
+// transport protocol are dropped. Hooks other than Output/Prerouting panic.
+// TODO(gvisor.dev/issue/170): Parse headers without copying.
+func (rt RedirectTarget) Action(pkt *PacketBuffer, ct *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int) {
+	// Packet is already manipulated.
+	if pkt.NatDone {
+		return RuleAccept, 0
+	}
+
+	// Drop the packet if network and transport header are not set.
+	if pkt.NetworkHeader == nil || pkt.TransportHeader == nil {
+		return RuleDrop, 0
+	}
+
+	// Redirect to localhost (127.0.0.1) in Output and to the passed-in address
+	// in Prerouting. rt is a value receiver, so only this call's copy changes.
+	switch hook {
+	case Output:
+		rt.MinIP = tcpip.Address([]byte{127, 0, 0, 1})
+		rt.MaxIP = tcpip.Address([]byte{127, 0, 0, 1})
+	case Prerouting:
+		rt.MinIP = address
+		rt.MaxIP = address
+	default:
+		panic("redirect target is supported only on output and prerouting hooks")
+	}
+
+	// TODO(gvisor.dev/issue/170): Check Flags in RedirectTarget if
+	// we need to change dest address (for OUTPUT chain) or ports.
+	netHeader := header.IPv4(pkt.NetworkHeader)
+	switch protocol := netHeader.TransportProtocol(); protocol {
+	case header.UDPProtocolNumber:
+		udpHeader := header.UDP(pkt.TransportHeader)
+		udpHeader.SetDestinationPort(rt.MinPort)
+
+		// On Output, clear the UDP checksum and recompute it below.
+		if hook == Output {
+			udpHeader.SetChecksum(0)
+			hdr := &pkt.Header
+			length := uint16(pkt.Data.Size()+hdr.UsedLength()) - uint16(netHeader.HeaderLength())
+
+			// Only calculate the checksum if offloading isn't supported.
+			if r.Capabilities()&CapabilityTXChecksumOffload == 0 {
+				xsum := r.PseudoHeaderChecksum(protocol, length)
+				for _, v := range pkt.Data.Views() {
+					xsum = header.Checksum(v, xsum)
+				}
+				udpHeader.SetChecksum(0)
+				udpHeader.SetChecksum(^udpHeader.CalculateChecksum(xsum))
+			}
+		}
+		// Change destination address and refresh the IPv4 header checksum.
+		netHeader.SetDestinationAddress(rt.MinIP)
+		netHeader.SetChecksum(0)
+		netHeader.SetChecksum(^netHeader.CalculateChecksum())
+		pkt.NatDone = true
+	case header.TCPProtocolNumber:
+		if ct == nil {
+			return RuleAccept, 0
+		}
+
+		// Set up connection for matching NAT rule. Only the first
+		// packet of the connection comes here. Other packets will be
+		// manipulated in connection tracking.
+		if conn := ct.createConnFor(pkt, hook, rt); conn != nil {
+			ct.handlePacket(pkt, hook, gso, r)
+		}
+	default:
+		return RuleDrop, 0
+	}
+
+	return RuleAccept, 0
+}
diff --git a/pkg/tcpip/stack/iptables_types.go b/pkg/tcpip/stack/iptables_types.go
new file mode 100644
index 000000000..c528ec381
--- /dev/null
+++ b/pkg/tcpip/stack/iptables_types.go
@@ -0,0 +1,253 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"strings"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// A Hook specifies one of the hooks built into the network stack.
+//
+//                      Userspace app          Userspace app
+//                            ^                      |
+//                            |                      v
+//                         [Input]               [Output]
+//                            ^                      |
+//                            |                      v
+//                            |                   routing
+//                            |                      |
+//                            |                      v
+// ----->[Prerouting]----->routing----->[Forward]---------[Postrouting]----->
+type Hook uint
+
+// These values correspond to values in include/uapi/linux/netfilter.h.
+const (
+	// Prerouting happens before a packet is routed to applications or to
+	// be forwarded.
+	Prerouting Hook = iota
+
+	// Input happens before a packet reaches an application.
+	Input
+
+	// Forward happens once it's decided that a packet should be forwarded
+	// to another host.
+	Forward
+
+	// Output happens after a packet is written by an application to be
+	// sent out.
+	Output
+
+	// Postrouting happens just before a packet goes out on the wire.
+	Postrouting
+
+	// NumHooks is the total number of hooks; it is not itself a hook.
+	NumHooks
+)
+
+// A RuleVerdict is what a rule decides should be done with a packet.
+type RuleVerdict int
+
+const (
+	// RuleAccept indicates the packet should continue through netstack.
+	RuleAccept RuleVerdict = iota
+
+	// RuleDrop indicates the packet should be dropped.
+	RuleDrop
+
+	// RuleJump indicates traversal continues at the rule index returned with it.
+	RuleJump
+
+	// RuleReturn indicates the packet should return to the previous chain.
+	RuleReturn
+)
+
+// IPTables holds all the tables for a netstack.
+type IPTables struct {
+	// mu protects tables, priorities, and modified.
+	mu sync.RWMutex
+
+	// tables maps table names to tables. User tables have arbitrary names.
+	// mu needs to be locked for accessing.
+	tables map[string]Table
+
+	// priorities maps each hook to a list of table names. The order of the
+	// list is the order in which each table should be visited for that
+	// hook. mu needs to be locked for accessing.
+	priorities map[Hook][]string
+
+	// modified is whether tables have been modified at least once. While it
+	// is false Check accepts every packet without traversing rules, eliding
+	// the iptables overhead for workloads that don't utilize iptables.
+	modified bool
+
+	connections ConnTrack // connection tracking state handed to rule targets
+}
+
+// A Table defines a set of chains and hooks into the network stack. It is
+// really just a list of rules.
+type Table struct {
+	// Rules holds the rules that make up the table.
+	Rules []Rule
+
+	// BuiltinChains maps builtin chains to their entrypoint rule in Rules.
+	BuiltinChains map[Hook]int
+
+	// Underflows maps builtin chains to their underflow rule in Rules
+	// (i.e. the rule to execute if the chain returns without a verdict).
+	Underflows map[Hook]int
+
+	// UserChains holds the user-defined chains, keyed by name. Users can
+	// give their chains arbitrary names.
+	UserChains map[string]int
+}
+
+// ValidHooks returns a bitmap of the builtin hooks for the given table.
+func (table *Table) ValidHooks() uint32 {
+	hooks := uint32(0)
+	for hook := range table.BuiltinChains {
+		hooks |= 1 << hook
+	}
+	return hooks
+}
+
+// A Rule is a packet processing rule. It has two pieces: a selection of
+// packets (the IP header Filter plus zero or more Matchers, all of which must
+// match; a rule with no matchers is limited only by its Filter) and a Target
+// to run on the packets selected.
+type Rule struct {
+	// Filter holds basic IP filtering fields common to every rule.
+	Filter IPHeaderFilter
+
+	// Matchers is the list of matchers for this rule.
+	Matchers []Matcher
+
+	// Target is the action to invoke if all the matchers match the packet.
+	Target Target
+}
+
+// IPHeaderFilter holds basic IP filtering data common to every rule.
+type IPHeaderFilter struct {
+	// Protocol matches the transport protocol; zero matches any protocol.
+	Protocol tcpip.TransportProtocolNumber
+
+	// Dst matches the destination IP address; empty matches any address.
+	Dst tcpip.Address
+
+	// DstMask masks bits of the destination IP address when comparing with
+	// Dst.
+	DstMask tcpip.Address
+
+	// DstInvert inverts the meaning of the destination IP check, i.e. when
+	// true the filter will match packets that fail the destination
+	// comparison.
+	DstInvert bool
+
+	// Src matches the source IP address; empty matches any address.
+	Src tcpip.Address
+
+	// SrcMask masks bits of the source IP address when comparing with Src.
+	SrcMask tcpip.Address
+
+	// SrcInvert inverts the meaning of the source IP check, i.e. when true the
+	// filter will match packets that fail the source comparison.
+	SrcInvert bool
+
+	// OutputInterface matches the name of the outgoing interface for the
+	// packet; a trailing '+' makes it a prefix match, empty matches any.
+	OutputInterface string
+
+	// OutputInterfaceMask masks the characters of the interface name when
+	// comparing with OutputInterface. NOTE(review): match never reads it.
+	OutputInterfaceMask string
+
+	// OutputInterfaceInvert inverts the meaning of outgoing interface check,
+	// i.e. when true the filter will match packets that fail the outgoing
+	// interface comparison.
+	OutputInterfaceInvert bool
+}
+
+// match returns whether hdr matches the filter.
+func (fl IPHeaderFilter) match(hdr header.IPv4, hook Hook, nicName string) bool {
+	// TODO(gvisor.dev/issue/170): Support other fields of the filter.
+	// Check the transport protocol.
+	if fl.Protocol != 0 && fl.Protocol != hdr.TransportProtocol() {
+		return false
+	}
+
+	// Check the source and destination IPs.
+	if !filterAddress(hdr.DestinationAddress(), fl.DstMask, fl.Dst, fl.DstInvert) || !filterAddress(hdr.SourceAddress(), fl.SrcMask, fl.Src, fl.SrcInvert) {
+		return false
+	}
+
+	// Check the output interface.
+	// TODO(gvisor.dev/issue/170): Add the check for FORWARD and POSTROUTING
+	// hooks after supported.
+	if hook == Output {
+		n := len(fl.OutputInterface)
+		if n == 0 {
+			return true
+		}
+
+		// If the interface name ends with '+', any interface which begins
+		// with the name should be matched.
+		ifName := fl.OutputInterface
+		matches := true
+		if strings.HasSuffix(ifName, "+") {
+			matches = strings.HasPrefix(nicName, ifName[:n-1])
+		} else {
+			matches = nicName == ifName
+		}
+		return fl.OutputInterfaceInvert != matches
+	}
+
+	return true
+}
+
+// filterAddress reports whether addr&mask equals filterAddr byte for byte,
+// negated when invert is set. An empty filterAddr matches every address.
+// Bytes are indexed explicitly: tcpip.Address is a string, and ranging over
+// it steps by UTF-8 rune, skipping octets (e.g. the 168 in 194.168.0.0).
+func filterAddress(addr, mask, filterAddr tcpip.Address, invert bool) bool {
+	for i := 0; i < len(filterAddr); i++ {
+		if addr[i]&mask[i] != filterAddr[i] {
+			return invert
+		}
+	}
+	return !invert
+}
+
+// A Matcher is the interface for matching packets.
+type Matcher interface {
+	// Name returns the name of the Matcher.
+	Name() string
+
+	// Match returns whether the packet matches and whether the packet
+	// should be "hotdropped", i.e. dropped immediately. This is usually
+	// used for suspicious packets. checkRule drops on hotdrop without
+	// consulting matches.
+	// Precondition: packet.NetworkHeader is set.
+	Match(hook Hook, packet *PacketBuffer, interfaceName string) (matches bool, hotdrop bool)
+}
+
+// A Target is the interface for taking an action for a packet.
+type Target interface {
+	// Action takes an action on the packet and returns a verdict on how
+	// traversal should (or should not) continue. If the verdict is
+	// RuleJump, the int is the index of the rule to jump to.
+	Action(packet *PacketBuffer, connections *ConnTrack, hook Hook, gso *GSO, r *Route, address tcpip.Address) (RuleVerdict, int)
+}
diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go
new file mode 100644
index 000000000..403557fd7
--- /dev/null
+++ b/pkg/tcpip/stack/linkaddrcache.go
@@ -0,0 +1,295 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const linkAddrCacheSize = 512 // max cache entries
+
+// linkAddrCache is a fixed-sized cache mapping IP addresses to link addresses.
+//
+// Entries are kept in a map and on an LRU list; once linkAddrCacheSize entries
+// exist, the least recently used one is recycled for each new address.
+// This struct is safe for concurrent use.
+type linkAddrCache struct {
+	// ageLimit is how long a cache entry is valid for.
+	ageLimit time.Duration
+
+	// resolutionTimeout is the amount of time to wait for a link request to
+	// resolve an address.
+	resolutionTimeout time.Duration
+
+	// resolutionAttempts is the number of times an address is attempted to be
+	// resolved before failing.
+	resolutionAttempts int
+
+	cache struct {
+		sync.Mutex
+		table map[tcpip.FullAddress]*linkAddrEntry
+		lru   linkAddrEntryList
+	}
+}
+
+// entryState is the resolution state of a single entry in the cache.
+type entryState int
+
+const (
+	// incomplete means that the address is not resolved yet; a request to
+	// resolve it may be outstanding. This is the initial state.
+	incomplete entryState = iota
+	// ready means that the address has been resolved and can be used.
+	ready
+	// failed means that address resolution timed out and the address
+	// could not be resolved.
+	failed
+)
+
+// String implements fmt.Stringer. Known states print as their name; any
+// other value prints as "unknown(<n>)".
+func (s entryState) String() string {
+	names := [...]string{
+		incomplete: "incomplete",
+		ready:      "ready",
+		failed:     "failed",
+	}
+	if s < 0 || int(s) >= len(names) {
+		return fmt.Sprintf("unknown(%d)", s)
+	}
+	return names[s]
+}
+
+// A linkAddrEntry is an entry in the linkAddrCache.
+// This struct is thread-compatible.
+type linkAddrEntry struct {
+	linkAddrEntryEntry
+
+	addr       tcpip.FullAddress
+	linkAddr   tcpip.LinkAddress
+	expiration time.Time
+	s          entryState
+
+	// wakers is a set of waiters for address resolution result. Anytime
+	// state transitions out of incomplete these waiters are notified.
+	wakers map[*sleep.Waker]struct{}
+
+	// done is used to allow callers to wait on address resolution. It is
+	// closed when the state transitions out of incomplete, and is nil
+	// while no resolution is in progress.
+	done chan struct{}
+}
+
+// changeState sets the entry's state to ns, notifying any waiters.
+//
+// The entry's expiration is bumped up to the greater of itself and the passed
+// expiration; the zero value indicates immediate expiration, and is set
+// unconditionally - this is an implementation detail that allows for entries
+// to be reused.
+func (e *linkAddrEntry) changeState(ns entryState, expiration time.Time) {
+	// Notify whoever is waiting on address resolution when transitioning
+	// out of incomplete: assert every waker and close the done channel.
+	if e.s == incomplete && ns != incomplete {
+		for w := range e.wakers {
+			w.Assert()
+		}
+		e.wakers = nil
+		if ch := e.done; ch != nil {
+			close(ch)
+		}
+		e.done = nil
+	}
+
+	if expiration.IsZero() || expiration.After(e.expiration) {
+		e.expiration = expiration
+	}
+	e.s = ns
+}
+
+func (e *linkAddrEntry) removeWaker(w *sleep.Waker) {
+	delete(e.wakers, w) // no-op when w is absent or wakers is nil
+}
+
+// add records v as the link address of k and marks the entry ready.
+func (c *linkAddrCache) add(k tcpip.FullAddress, v tcpip.LinkAddress) {
+	// The deadline is computed before taking the lock: an entry ages from the
+	// moment the mapping was learned, not from the moment it happened to be
+	// stored in the cache.
+	deadline := time.Now().Add(c.ageLimit)
+
+	c.cache.Lock()
+	e := c.getOrCreateEntryLocked(k)
+	e.linkAddr = v
+
+	e.changeState(ready, deadline)
+	c.cache.Unlock()
+}
+
+// getOrCreateEntryLocked retrieves a cache entry associated with k. The
+// returned entry is always refreshed in the cache (it is reachable via the
+// map, and its place is bumped in LRU).
+//
+// If a matching entry exists in the cache, it is returned. If no matching
+// entry exists and the cache is full, an existing entry is evicted via LRU,
+// reset to state incomplete, and returned. If no matching entry exists and the
+// cache is not full, a new entry with state incomplete is allocated and
+// returned. As the name says, the caller must hold c.cache's lock.
+func (c *linkAddrCache) getOrCreateEntryLocked(k tcpip.FullAddress) *linkAddrEntry {
+	if entry, ok := c.cache.table[k]; ok {
+		c.cache.lru.Remove(entry)
+		c.cache.lru.PushFront(entry)
+		return entry
+	}
+	var entry *linkAddrEntry
+	if len(c.cache.table) == linkAddrCacheSize {
+		entry = c.cache.lru.Back()
+
+		delete(c.cache.table, entry.addr)
+		c.cache.lru.Remove(entry)
+
+		// Wake waiters and mark the soon-to-be-reused entry as expired. Note
+		// that the state passed doesn't matter when the zero time is passed.
+		entry.changeState(failed, time.Time{})
+	} else {
+		entry = new(linkAddrEntry)
+	}
+
+	*entry = linkAddrEntry{
+		addr: k,
+		s:    incomplete,
+	}
+	c.cache.table[k] = entry
+	c.cache.lru.PushFront(entry)
+	return entry
+}
+
+// get returns k's link address if known; otherwise it starts resolution and returns ErrWouldBlock.
+func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
+	if linkRes != nil {
+		if addr, ok := linkRes.ResolveStaticAddress(k.Addr); ok {
+			return addr, nil, nil
+		}
+	}
+
+	c.cache.Lock()
+	defer c.cache.Unlock()
+	entry := c.getOrCreateEntryLocked(k)
+	switch s := entry.s; s {
+	case ready, failed:
+		if !time.Now().After(entry.expiration) {
+			// Not expired: answer from the cache, success or failure.
+			switch s {
+			case ready:
+				return entry.linkAddr, nil, nil
+			case failed:
+				return entry.linkAddr, nil, tcpip.ErrNoLinkAddress
+			default:
+				panic(fmt.Sprintf("invalid cache entry state: %s", s))
+			}
+		}
+
+		entry.changeState(incomplete, time.Time{})
+		fallthrough
+	case incomplete:
+		if waker != nil {
+			if entry.wakers == nil {
+				entry.wakers = make(map[*sleep.Waker]struct{})
+			}
+			entry.wakers[waker] = struct{}{}
+		}
+
+		if entry.done == nil {
+			// Address resolution needs to be initiated; that needs a resolver.
+			if linkRes == nil {
+				return entry.linkAddr, nil, tcpip.ErrNoLinkAddress
+			}
+
+			entry.done = make(chan struct{})
+			go c.startAddressResolution(k, linkRes, localAddr, linkEP, entry.done) // S/R-SAFE: link non-savable; wakers dropped synchronously.
+		}
+
+		return entry.linkAddr, entry.done, tcpip.ErrWouldBlock
+	default:
+		panic(fmt.Sprintf("invalid cache entry state: %s", s))
+	}
+}
+
+// removeWaker unregisters waker from k's entry, if k is still cached.
+func (c *linkAddrCache) removeWaker(k tcpip.FullAddress, waker *sleep.Waker) {
+	c.cache.Lock()
+	defer c.cache.Unlock()
+	e, cached := c.cache.table[k]
+	if cached {
+		e.removeWaker(waker)
+	}
+}
+
+// startAddressResolution sends a link address request for k, then waits
+// resolutionTimeout before asking checkLinkRequest whether to retry. It
+// returns when checkLinkRequest says to stop or when done is signalled.
+func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, done <-chan struct{}) {
+	for attempt := 0; ; attempt++ {
+		linkRes.LinkAddressRequest(k.Addr, localAddr, linkEP)
+		select {
+		case <-done:
+			return
+		case now := <-time.After(c.resolutionTimeout):
+			if c.checkLinkRequest(now, k, attempt) {
+				return
+			}
+		}
+	}
+}
+
+// checkLinkRequest checks whether the previous attempt to resolve k succeeded
+// and marks the entry failed once attempts run out. It returns true if the
+// requests can stop, false if another request should be sent.
+func (c *linkAddrCache) checkLinkRequest(now time.Time, k tcpip.FullAddress, attempt int) bool {
+	c.cache.Lock()
+	defer c.cache.Unlock()
+	entry, ok := c.cache.table[k]
+	if !ok {
+		// Entry was evicted from the cache.
+		return true
+	}
+	switch s := entry.s; s {
+	case ready, failed:
+		// Entry was made ready by resolver or failed. Either way we're done.
+	case incomplete:
+		if attempt+1 < c.resolutionAttempts {
+			// No response yet, need to send another ARP request.
+			return false
+		}
+		// Max number of retries reached, mark entry as failed.
+		entry.changeState(failed, now.Add(c.ageLimit))
+	default:
+		panic(fmt.Sprintf("invalid cache entry state: %s", s))
+	}
+	return true
+}
+
+// newLinkAddrCache returns an empty cache configured with the given limits.
+func newLinkAddrCache(ageLimit, resolutionTimeout time.Duration, resolutionAttempts int) *linkAddrCache {
+	c := new(linkAddrCache)
+	c.ageLimit = ageLimit
+	c.resolutionTimeout = resolutionTimeout
+	c.resolutionAttempts = resolutionAttempts
+	c.cache.table = make(map[tcpip.FullAddress]*linkAddrEntry, linkAddrCacheSize)
+	return c
+}
diff --git a/pkg/tcpip/stack/linkaddrcache_test.go b/pkg/tcpip/stack/linkaddrcache_test.go
new file mode 100644
index 000000000..1baa498d0
--- /dev/null
+++ b/pkg/tcpip/stack/linkaddrcache_test.go
@@ -0,0 +1,277 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+type testaddr struct {
+	addr     tcpip.FullAddress // address used as the cache key in these tests
+	linkAddr tcpip.LinkAddress // link address the tests expect to get back for addr
+}
+
+var testAddrs = func() []testaddr {
+	// 4*linkAddrCacheSize distinct entries: "Addr%06d" addresses on NIC 1, each
+	// paired with that address string prefixed by "Link" as its link address.
+	list := make([]testaddr, 4*linkAddrCacheSize)
+	for n := range list {
+		name := fmt.Sprintf("Addr%06d", n)
+		list[n].addr = tcpip.FullAddress{NIC: 1, Addr: tcpip.Address(name)}
+		list[n].linkAddr = tcpip.LinkAddress("Link" + name)
+	}
+	return list
+}()
+
+type testLinkAddressResolver struct {
+	cache                *linkAddrCache // cache that fakeRequest adds resolved entries to
+	delay                time.Duration  // wait before a request is answered via fakeRequest
+	onLinkAddressRequest func()         // optional hook run on every LinkAddressRequest call
+}
+
+func (r *testLinkAddressResolver) LinkAddressRequest(addr, _ tcpip.Address, _ LinkEndpoint) *tcpip.Error {
+	time.AfterFunc(r.delay, func() { r.fakeRequest(addr) }) // fake reply arrives after r.delay
+	if hook := r.onLinkAddressRequest; hook != nil {
+		hook() // optional per-request callback set by the test
+	}
+	return nil
+}
+
+func (r *testLinkAddressResolver) fakeRequest(addr tcpip.Address) {
+	// Only addresses listed in testAddrs resolve; the first match is cached.
+	for i := range testAddrs {
+		if known := testAddrs[i]; known.addr.Addr == addr {
+			r.cache.add(known.addr, known.linkAddr)
+			return
+		}
+	}
+}
+
+func (*testLinkAddressResolver) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if addr != "broadcast" {
+		return "", false // no static mapping for any other address
+	}
+	return "mac_broadcast", true
+}
+
+func (*testLinkAddressResolver) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return tcpip.NetworkProtocolNumber(1) // fixed protocol number reported by this test resolver
+}
+
+func getBlocking(c *linkAddrCache, addr tcpip.FullAddress, linkRes LinkAddressResolver) (tcpip.LinkAddress, *tcpip.Error) {
+	w := sleep.Waker{}
+	s := sleep.Sleeper{}
+	s.AddWaker(&w, 123)
+	defer s.Done()
+
+	for {
+		if got, _, err := c.get(addr, linkRes, "", nil, &w); err != tcpip.ErrWouldBlock {
+			return got, err
+		}
+		s.Fetch(true)
+	}
+}
+
+// TestCacheOverflow adds more entries than the cache holds and checks eviction.
+func TestCacheOverflow(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, 1*time.Second, 3)
+	// Insert from the last test address down to the first, reading each one back.
+	for i := len(testAddrs) - 1; i >= 0; i-- {
+		ta := testAddrs[i]
+		c.add(ta.addr, ta.linkAddr)
+		got, _, err := c.get(ta.addr, nil, "", nil, nil)
+		if err != nil {
+			t.Errorf("insert %d, c.get(%q)=%q, got error: %v", i, string(ta.addr.Addr), got, err)
+		}
+		if got != ta.linkAddr {
+			t.Errorf("insert %d, c.get(%q)=%q, want %q", i, string(ta.addr.Addr), got, ta.linkAddr)
+		}
+	}
+	// Expect to find at least half of the most recent entries.
+	for i, ta := range testAddrs[:linkAddrCacheSize/2] {
+		got, _, err := c.get(ta.addr, nil, "", nil, nil)
+		if err != nil {
+			t.Errorf("check %d, c.get(%q)=%q, got error: %v", i, string(ta.addr.Addr), got, err)
+		}
+		if got != ta.linkAddr {
+			t.Errorf("check %d, c.get(%q)=%q, want %q", i, string(ta.addr.Addr), got, ta.linkAddr)
+		}
+	}
+	// The earliest entries should no longer be in the cache.
+	for i := len(testAddrs) - 1; i >= len(testAddrs)-linkAddrCacheSize; i-- {
+		if _, _, err := c.get(testAddrs[i].addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress {
+			t.Errorf("check %d, c.get(%q), got error: %v, want: error ErrNoLinkAddress", i, string(testAddrs[i].addr.Addr), err)
+		}
+	}
+}
+
+// TestCacheConcurrent has several goroutines fill the cache at the same time.
+func TestCacheConcurrent(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, 1*time.Second, 3)
+
+	var wg sync.WaitGroup
+	wg.Add(16)
+	for r := 0; r < 16; r++ {
+		go func() {
+			defer wg.Done()
+			for _, ta := range testAddrs {
+				c.add(ta.addr, ta.linkAddr)
+				c.get(ta.addr, nil, "", nil, nil) // make work for gotsan
+			}
+		}()
+	}
+	wg.Wait()
+
+	// Every goroutine adds the same sequence, which is larger than the cache, so
+	// eviction must leave the last entry present and the first one missing.
+	last := testAddrs[len(testAddrs)-1]
+	got, _, err := c.get(last.addr, nil, "", nil, nil)
+	if err != nil {
+		t.Errorf("c.get(%q)=%q, got error: %v", string(last.addr.Addr), got, err)
+	}
+	if got != last.linkAddr {
+		t.Errorf("c.get(%q)=%q, want %q", string(last.addr.Addr), got, last.linkAddr)
+	}
+
+	first := testAddrs[0]
+	if _, _, err := c.get(first.addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress {
+		t.Errorf("c.get(%q), got error: %v, want: error ErrNoLinkAddress", string(first.addr.Addr), err)
+	}
+}
+
+// TestCacheAgeLimit checks that an entry older than the age limit is not returned.
+func TestCacheAgeLimit(t *testing.T) {
+	c := newLinkAddrCache(1*time.Millisecond, 1*time.Second, 3)
+	c.add(testAddrs[0].addr, testAddrs[0].linkAddr)
+	time.Sleep(50 * time.Millisecond) // far longer than the 1ms age limit
+	if _, _, err := c.get(testAddrs[0].addr, nil, "", nil, nil); err != tcpip.ErrNoLinkAddress {
+		t.Errorf("c.get(%q), got error: %v, want: error ErrNoLinkAddress", string(testAddrs[0].addr.Addr), err)
+	}
+}
+
+// TestCacheReplace checks that adding a link address for an address that is
+// already cached replaces the stored value: after each add, get must return
+// the link address that was added most recently.
+func TestCacheReplace(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, 1*time.Second, 3)
+	ta := testAddrs[0]
+	replacement := ta.linkAddr + "2"
+
+	// First store the original link address, then overwrite it with the
+	// replacement; both rounds run the same add-then-get checks.
+	for _, want := range []tcpip.LinkAddress{ta.linkAddr, replacement} {
+		c.add(ta.addr, want)
+		// The value read back must be the one that was just added.
+		got, _, err := c.get(ta.addr, nil, "", nil, nil)
+		if err != nil {
+			t.Errorf("c.get(%q)=%q, got error: %v", string(ta.addr.Addr), got, err)
+		}
+		if got != want {
+			t.Errorf("c.get(%q)=%q, want %q", string(ta.addr.Addr), got, want)
+		}
+	}
+}
+
+// TestCacheResolution resolves every test address through the fake resolver.
+func TestCacheResolution(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, 250*time.Millisecond, 1)
+	linkRes := &testLinkAddressResolver{cache: c}
+	for idx, want := range testAddrs {
+		got, err := getBlocking(c, want.addr, linkRes)
+		if err != nil {
+			t.Errorf("check %d, c.get(%q)=%q, got error: %v", idx, string(want.addr.Addr), got, err)
+		}
+		if got != want.linkAddr {
+			t.Errorf("check %d, c.get(%q)=%q, want %q", idx, string(want.addr.Addr), got, want.linkAddr)
+		}
+	}
+	// Once resolved, the address must stay cached and never yield ErrWouldBlock.
+	last := testAddrs[len(testAddrs)-1]
+	for n := 0; n < 10; n++ {
+		got, _, err := c.get(last.addr, linkRes, "", nil, nil)
+		if err != nil {
+			t.Errorf("c.get(%q)=%q, got error: %v", string(last.addr.Addr), got, err)
+		}
+		if got != last.linkAddr {
+			t.Errorf("c.get(%q)=%q, want %q", string(last.addr.Addr), got, last.linkAddr)
+		}
+	}
+}
+
+// TestCacheResolutionFailed checks the request count when resolution fails.
+func TestCacheResolutionFailed(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, 10*time.Millisecond, 5)
+	var requestCount uint32
+	linkRes := &testLinkAddressResolver{
+		cache:                c,
+		onLinkAddressRequest: func() { atomic.AddUint32(&requestCount, 1) },
+	}
+
+	// First, make sure that a known address resolves.
+	known := testAddrs[0]
+	got, err := getBlocking(c, known.addr, linkRes)
+	if err != nil {
+		t.Errorf("c.get(%q)=%q, got error: %v", string(known.addr.Addr), got, err)
+	}
+	if got != known.linkAddr {
+		t.Errorf("c.get(%q)=%q, want %q", string(known.addr.Addr), got, known.linkAddr)
+	}
+
+	// Appending "2" gives an address that should not match any testAddrs entry.
+	unknown := known.addr
+	unknown.Addr += "2"
+	before := atomic.LoadUint32(&requestCount)
+	if _, err := getBlocking(c, unknown, linkRes); err != tcpip.ErrNoLinkAddress {
+		t.Errorf("c.get(%q), got error: %v, want: error ErrNoLinkAddress", string(unknown.Addr), err)
+	}
+	if got, want := int(atomic.LoadUint32(&requestCount)-before), c.resolutionAttempts; got != want {
+		t.Errorf("got link address request count = %d, want = %d", got, want)
+	}
+}
+
+func TestCacheResolutionTimeout(t *testing.T) {
+	resolverDelay := 500 * time.Millisecond
+	expiration := resolverDelay / 10
+	c := newLinkAddrCache(expiration, 1*time.Millisecond, 3)
+	linkRes := &testLinkAddressResolver{cache: c, delay: resolverDelay}
+
+	e := testAddrs[0]
+	if _, err := getBlocking(c, e.addr, linkRes); err != tcpip.ErrNoLinkAddress {
+		t.Errorf("c.get(%q), got error: %v, want: error ErrNoLinkAddress", string(e.addr.Addr), err)
+	}
+}
+
+// TestStaticResolution checks that get returns the link address of a statically
+// resolvable address right away, although the fake resolver's reply is a minute away.
+func TestStaticResolution(t *testing.T) {
+	c := newLinkAddrCache(1<<63-1, time.Millisecond, 1)
+	linkRes := &testLinkAddressResolver{cache: c, delay: time.Minute}
+
+	key := tcpip.FullAddress{Addr: "broadcast"}
+	want := tcpip.LinkAddress("mac_broadcast")
+	got, _, err := c.get(key, linkRes, "", nil, nil)
+	if err != nil {
+		t.Errorf("c.get(%q)=%q, got error: %v", string(key.Addr), string(got), err)
+	}
+	if got != want {
+		t.Errorf("c.get(%q)=%q, want %q", string(key.Addr), string(got), string(want))
+	}
+}
diff --git a/pkg/tcpip/stack/ndp.go b/pkg/tcpip/stack/ndp.go
new file mode 100644
index 000000000..e28c23d66
--- /dev/null
+++ b/pkg/tcpip/stack/ndp.go
@@ -0,0 +1,1981 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"log"
+	"math/rand"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+const (
+	// defaultDupAddrDetectTransmits is the default number of NDP Neighbor
+	// Solicitation messages to send when doing Duplicate Address Detection
+	// for a tentative address.
+	//
+	// Default = 1 (from RFC 4862 section 5.1)
+	defaultDupAddrDetectTransmits = 1
+
+	// defaultRetransmitTimer is the default amount of time to wait between
+	// sending NDP Neighbor Solicitation messages.
+	//
+	// Default = 1s (from RFC 4861 section 10).
+	defaultRetransmitTimer = time.Second
+
+	// defaultMaxRtrSolicitations is the default number of Router
+	// Solicitation messages to send when a NIC becomes enabled.
+	//
+	// Default = 3 (from RFC 4861 section 10).
+	defaultMaxRtrSolicitations = 3
+
+	// defaultRtrSolicitationInterval is the default amount of time between
+	// sending Router Solicitation messages.
+	//
+	// Default = 4s (from RFC 4861 section 10).
+	defaultRtrSolicitationInterval = 4 * time.Second
+
+	// defaultMaxRtrSolicitationDelay is the default maximum amount of time
+	// to wait before sending the first Router Solicitation message.
+	//
+	// Default = 1s (from RFC 4861 section 10).
+	defaultMaxRtrSolicitationDelay = time.Second
+
+	// defaultHandleRAs is the default configuration for whether or not to
+	// handle incoming Router Advertisements as a host.
+	defaultHandleRAs = true
+
+	// defaultDiscoverDefaultRouters is the default configuration for
+	// whether or not to discover default routers from incoming Router
+	// Advertisements, as a host.
+	defaultDiscoverDefaultRouters = true
+
+	// defaultDiscoverOnLinkPrefixes is the default configuration for
+	// whether or not to discover on-link prefixes from incoming Router
+	// Advertisements' Prefix Information option, as a host.
+	defaultDiscoverOnLinkPrefixes = true
+
+	// defaultAutoGenGlobalAddresses is the default configuration for
+	// whether or not to generate global IPv6 addresses in response to
+	// receiving a new Prefix Information option with its Autonomous
+	// Address AutoConfiguration flag set, as a host.
+	//
+	// Default = true.
+	defaultAutoGenGlobalAddresses = true
+
+	// minimumRetransmitTimer is the minimum amount of time to wait between
+	// sending NDP Neighbor Solicitation messages. Note, RFC 4861 does
+	// not impose a minimum Retransmit Timer, but we do here to make sure
+	// the messages are not sent all at once. We also come to this value
+	// because in the RetransmitTimer field of a Router Advertisement, a
+	// value of 0 means unspecified, so the smallest valid value is 1.
+	// Note, the unit of the RetransmitTimer field in the Router
+	// Advertisement is milliseconds.
+	minimumRetransmitTimer = time.Millisecond
+
+	// minimumRtrSolicitationInterval is the minimum amount of time to wait
+	// between sending Router Solicitation messages. This limit is imposed
+	// to make sure that Router Solicitation messages are not sent all at
+	// once, defeating the purpose of sending the initial few messages.
+	minimumRtrSolicitationInterval = 500 * time.Millisecond
+
+	// minimumMaxRtrSolicitationDelay is the minimum amount of time to wait
+	// before sending the first Router Solicitation message. It is 0 because
+	// we cannot have a negative delay.
+	minimumMaxRtrSolicitationDelay = 0
+
+	// MaxDiscoveredDefaultRouters is the maximum number of discovered
+	// default routers. The stack should stop discovering new routers after
+	// discovering MaxDiscoveredDefaultRouters routers.
+	//
+	// This value MUST be at minimum 2 as per RFC 4861 section 6.3.4, and
+	// SHOULD be more.
+	MaxDiscoveredDefaultRouters = 10
+
+	// MaxDiscoveredOnLinkPrefixes is the maximum number of discovered
+	// on-link prefixes. The stack should stop discovering new on-link
+	// prefixes after discovering MaxDiscoveredOnLinkPrefixes on-link
+	// prefixes.
+	MaxDiscoveredOnLinkPrefixes = 10
+
+	// validPrefixLenForAutoGen is the expected prefix length that an
+	// address can be generated for. Must be 64 bits as the interface
+	// identifier (IID) is 64 bits and an IPv6 address is 128 bits, so
+	// 128 - 64 = 64.
+	validPrefixLenForAutoGen = 64
+
+	// defaultAutoGenTempGlobalAddresses is the default configuration for whether
+	// or not to generate temporary SLAAC addresses.
+	defaultAutoGenTempGlobalAddresses = true
+
+	// defaultMaxTempAddrValidLifetime is the default maximum valid lifetime
+	// for temporary SLAAC addresses generated as part of RFC 4941.
+	//
+	// Default = 7 days (from RFC 4941 section 5).
+	defaultMaxTempAddrValidLifetime = 7 * 24 * time.Hour
+
+	// defaultMaxTempAddrPreferredLifetime is the default preferred lifetime
+	// for temporary SLAAC addresses generated as part of RFC 4941.
+	//
+	// Default = 1 day (from RFC 4941 section 5).
+	defaultMaxTempAddrPreferredLifetime = 24 * time.Hour
+
+	// defaultRegenAdvanceDuration is the default duration before the deprecation
+	// of a temporary address when a new address will be generated.
+	//
+	// Default = 5s (from RFC 4941 section 5).
+	defaultRegenAdvanceDuration = 5 * time.Second
+
+	// minRegenAdvanceDuration is the minimum duration before the deprecation
+	// of a temporary address when a new address will be generated.
+	minRegenAdvanceDuration = time.Duration(0)
+
+	// maxSLAACAddrLocalRegenAttempts is the maximum number of times to attempt
+	// SLAAC address regenerations in response to a NIC-local conflict.
+	maxSLAACAddrLocalRegenAttempts = 10
+)
+
+var (
+	// MinPrefixInformationValidLifetimeForUpdate is the minimum Valid
+	// Lifetime needed to update the valid lifetime of an address generated
+	// by SLAAC.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// Min = 2hrs.
+	MinPrefixInformationValidLifetimeForUpdate = 2 * time.Hour
+
+	// MaxDesyncFactor is the upper bound for the preferred lifetime's desync
+	// factor for temporary SLAAC addresses.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// Must be greater than 0.
+	//
+	// Max = 10m (from RFC 4941 section 5).
+	MaxDesyncFactor = 10 * time.Minute
+
+	// MinMaxTempAddrPreferredLifetime is the minimum value allowed for the
+	// maximum preferred lifetime for temporary SLAAC addresses.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// This value guarantees that a temporary address will be preferred for at
+	// least 1hr if the SLAAC prefix is valid for at least that time.
+	MinMaxTempAddrPreferredLifetime = defaultRegenAdvanceDuration + MaxDesyncFactor + time.Hour
+
+	// MinMaxTempAddrValidLifetime is the minimum value allowed for the
+	// maximum valid lifetime for temporary SLAAC addresses.
+	//
+	// This is exported as a variable (instead of a constant) so tests
+	// can update it to a smaller value.
+	//
+	// This value guarantees that a temporary address will be valid for at least
+	// 2hrs if the SLAAC prefix is valid for at least that time.
+	MinMaxTempAddrValidLifetime = 2 * time.Hour
+)
+
+// DHCPv6ConfigurationFromNDPRA is a configuration available via DHCPv6 that an
+// NDP Router Advertisement informed the Stack about.
+type DHCPv6ConfigurationFromNDPRA int
+
+const (
+	_ DHCPv6ConfigurationFromNDPRA = iota // skip 0 so that the named values start at 1
+
+	// DHCPv6NoConfiguration indicates that no configurations are available via
+	// DHCPv6.
+	DHCPv6NoConfiguration
+
+	// DHCPv6ManagedAddress indicates that addresses are available via DHCPv6.
+	//
+	// DHCPv6ManagedAddress also implies DHCPv6OtherConfigurations because DHCPv6
+	// will return all available configuration information.
+	DHCPv6ManagedAddress
+
+	// DHCPv6OtherConfigurations indicates that other configuration information is
+	// available via DHCPv6.
+	//
+	// Other configurations are configurations other than addresses. Examples of
+	// other configurations are recursive DNS server lists, DNS search lists and
+	// default gateways.
+	DHCPv6OtherConfigurations
+)
+
+// NDPDispatcher is the interface integrators of netstack must implement to
+// receive and handle NDP related events.
+type NDPDispatcher interface {
+	// OnDuplicateAddressDetectionStatus will be called when the DAD process
+	// for an address (addr) on a NIC (with ID nicID) completes. resolved
+	// will be set to true if DAD completed successfully (no duplicate addr
+	// detected); false otherwise (addr was detected to be a duplicate on
+	// the link the NIC is a part of, or it was stopped for some other
+	// reason, such as the address being removed). If an error occurred
+	// during DAD, err will be set and resolved must be ignored.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error)
+
+	// OnDefaultRouterDiscovered will be called when a new default router is
+	// discovered. Implementations must return true if the newly discovered
+	// router should be remembered.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool
+
+	// OnDefaultRouterInvalidated will be called when a discovered default
+	// router that was remembered is invalidated.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address)
+
+	// OnOnLinkPrefixDiscovered will be called when a new on-link prefix is
+	// discovered. Implementations must return true if the newly discovered
+	// on-link prefix should be remembered.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool
+
+	// OnOnLinkPrefixInvalidated will be called when a discovered on-link
+	// prefix that was remembered is invalidated.
+	//
+	// This function is not permitted to block indefinitely. This function
+	// is also not permitted to call into the stack.
+	OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet)
+
+	// OnAutoGenAddress will be called when a new prefix with its
+	// autonomous address-configuration flag set has been received and SLAAC
+	// has been performed. Implementations may prevent the stack from
+	// assigning the address to the NIC by returning false.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddress(tcpip.NICID, tcpip.AddressWithPrefix) bool
+
+	// OnAutoGenAddressDeprecated will be called when an auto-generated
+	// address (as part of SLAAC) has been deprecated, but is still
+	// considered valid. Note, if an address is invalidated at the same
+	// time it is deprecated, the deprecation event MAY be omitted.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddressDeprecated(tcpip.NICID, tcpip.AddressWithPrefix)
+
+	// OnAutoGenAddressInvalidated will be called when an auto-generated
+	// address (as part of SLAAC) has been invalidated.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnAutoGenAddressInvalidated(tcpip.NICID, tcpip.AddressWithPrefix)
+
+	// OnRecursiveDNSServerOption will be called when an NDP option with
+	// recursive DNS servers has been received. Note, addrs may contain
+	// link-local addresses.
+	//
+	// It is up to the caller to use the DNS Servers only for their valid
+	// lifetime. OnRecursiveDNSServerOption may be called for new or
+	// already known DNS servers. If called with known DNS servers, their
+	// valid lifetimes must be refreshed to lifetime (it may be increased,
+	// decreased, or completely invalidated when lifetime = 0).
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration)
+
+	// OnDNSSearchListOption will be called when an NDP option with a DNS
+	// search list has been received.
+	//
+	// It is up to the caller to use the domain names in the search list
+	// for only their valid lifetime. OnDNSSearchListOption may be called
+	// with new or already known domain names. If called with known domain
+	// names, their valid lifetimes must be refreshed to lifetime (it may
+	// be increased, decreased or completely invalidated when lifetime = 0).
+	OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration)
+
+	// OnDHCPv6Configuration will be called with an updated configuration that is
+	// available via DHCPv6 for a specified NIC.
+	//
+	// This function is not permitted to block indefinitely. It must not
+	// call functions on the stack itself.
+	OnDHCPv6Configuration(tcpip.NICID, DHCPv6ConfigurationFromNDPRA)
+}
+
+// NDPConfigurations is the NDP configurations for the netstack.
+type NDPConfigurations struct {
+	// The number of Neighbor Solicitation messages to send when doing
+	// Duplicate Address Detection for a tentative address.
+	//
+	// Note, a value of zero effectively disables DAD.
+	DupAddrDetectTransmits uint8
+
+	// The amount of time to wait between sending Neighbor Solicitation
+	// messages.
+	//
+	// Must be greater than or equal to 1ms.
+	RetransmitTimer time.Duration
+
+	// The number of Router Solicitation messages to send when the NIC
+	// becomes enabled.
+	MaxRtrSolicitations uint8
+
+	// The amount of time between transmitting Router Solicitation messages.
+	//
+	// Must be greater than or equal to 0.5s.
+	RtrSolicitationInterval time.Duration
+
+	// The maximum amount of time before transmitting the first Router
+	// Solicitation message.
+	//
+	// Must be greater than or equal to 0s.
+	MaxRtrSolicitationDelay time.Duration
+
+	// HandleRAs determines whether or not Router Advertisements will be
+	// processed.
+	HandleRAs bool
+
+	// DiscoverDefaultRouters determines whether or not default routers will
+	// be discovered from Router Advertisements. This configuration is
+	// ignored if HandleRAs is false.
+	DiscoverDefaultRouters bool
+
+	// DiscoverOnLinkPrefixes determines whether or not on-link prefixes
+	// will be discovered from Router Advertisements' Prefix Information
+	// option. This configuration is ignored if HandleRAs is false.
+	DiscoverOnLinkPrefixes bool
+
+	// AutoGenGlobalAddresses determines whether or not global IPv6
+	// addresses will be generated for a NIC in response to receiving a new
+	// Prefix Information option with its Autonomous Address
+	// AutoConfiguration flag set, as a host, as per RFC 4862 (SLAAC).
+	//
+	// Note, if an address was already generated for some unique prefix, as
+	// part of SLAAC, this option does not affect whether or not the
+	// lifetime(s) of the generated address changes; this option only
+	// affects the generation of new addresses as part of SLAAC.
+	AutoGenGlobalAddresses bool
+
+	// AutoGenAddressConflictRetries determines how many times to attempt to retry
+	// generation of a permanent auto-generated address in response to DAD
+	// conflicts.
+	//
+	// If the method used to generate the address does not support creating
+	// alternative addresses (e.g. IIDs based on the modified EUI64 of a NIC's
+	// MAC address), then no attempt will be made to resolve the conflict.
+	AutoGenAddressConflictRetries uint8
+
+	// AutoGenTempGlobalAddresses determines whether or not temporary SLAAC
+	// addresses will be generated for a NIC as part of SLAAC privacy extensions,
+	// RFC 4941.
+	//
+	// Ignored if AutoGenGlobalAddresses is false.
+	AutoGenTempGlobalAddresses bool
+
+	// MaxTempAddrValidLifetime is the maximum valid lifetime for temporary
+	// SLAAC addresses.
+	MaxTempAddrValidLifetime time.Duration
+
+	// MaxTempAddrPreferredLifetime is the maximum preferred lifetime for
+	// temporary SLAAC addresses.
+	MaxTempAddrPreferredLifetime time.Duration
+
+	// RegenAdvanceDuration is the duration before the deprecation of a temporary
+	// address when a new address will be generated.
+	RegenAdvanceDuration time.Duration
+}
+
+// DefaultNDPConfigurations returns an NDPConfigurations populated with
+// default values.
+func DefaultNDPConfigurations() NDPConfigurations {
+	return NDPConfigurations{
+		DupAddrDetectTransmits:       defaultDupAddrDetectTransmits,
+		RetransmitTimer:              defaultRetransmitTimer,
+		MaxRtrSolicitations:          defaultMaxRtrSolicitations,
+		RtrSolicitationInterval:      defaultRtrSolicitationInterval,
+		MaxRtrSolicitationDelay:      defaultMaxRtrSolicitationDelay,
+		HandleRAs:                    defaultHandleRAs,
+		DiscoverDefaultRouters:       defaultDiscoverDefaultRouters,
+		DiscoverOnLinkPrefixes:       defaultDiscoverOnLinkPrefixes,
+		AutoGenGlobalAddresses:       defaultAutoGenGlobalAddresses,
+		AutoGenTempGlobalAddresses:   defaultAutoGenTempGlobalAddresses,
+		MaxTempAddrValidLifetime:     defaultMaxTempAddrValidLifetime,
+		MaxTempAddrPreferredLifetime: defaultMaxTempAddrPreferredLifetime,
+		RegenAdvanceDuration:         defaultRegenAdvanceDuration,
+	}
+}
+
+// validate clamps out-of-range values in c. Solicitation timers below their
+// minimum revert to the defaults; temporary address limits move to their minimum.
+func (c *NDPConfigurations) validate() {
+	// Neighbor/Router Solicitation timing: an undersized value becomes the default.
+	if c.RetransmitTimer < minimumRetransmitTimer {
+		c.RetransmitTimer = defaultRetransmitTimer
+	}
+	if c.RtrSolicitationInterval < minimumRtrSolicitationInterval {
+		c.RtrSolicitationInterval = defaultRtrSolicitationInterval
+	}
+	if c.MaxRtrSolicitationDelay < minimumMaxRtrSolicitationDelay {
+		c.MaxRtrSolicitationDelay = defaultMaxRtrSolicitationDelay
+	}
+
+	// Temporary address lifetimes: the valid lifetime is fixed up first because
+	// the preferred lifetime is then checked against it.
+	if c.MaxTempAddrValidLifetime < MinMaxTempAddrValidLifetime {
+		c.MaxTempAddrValidLifetime = MinMaxTempAddrValidLifetime
+	}
+	switch pref := c.MaxTempAddrPreferredLifetime; {
+	case pref < MinMaxTempAddrPreferredLifetime, pref > c.MaxTempAddrValidLifetime:
+		c.MaxTempAddrPreferredLifetime = MinMaxTempAddrPreferredLifetime
+	}
+	if c.RegenAdvanceDuration < minRegenAdvanceDuration {
+		c.RegenAdvanceDuration = minRegenAdvanceDuration
+	}
+}
+
+// ndpState is the per-interface NDP state.
+type ndpState struct {
+	// The NIC this ndpState is for.
+	nic *NIC
+
+	// configs is the per-interface NDP configurations.
+	configs NDPConfigurations
+
+	// Per-address DAD state, keyed by the address DAD is being performed for.
+	dad map[tcpip.Address]dadState
+
+	// The default routers discovered through Router Advertisements.
+	defaultRouters map[tcpip.Address]defaultRouterState
+
+	rtrSolicit struct {
+		// The timer used to send the next router solicitation message.
+		timer *time.Timer
+
+		// Used to let the Router Solicitation timer know that it has been stopped.
+		//
+		// Must only be read from or written to while protected by the lock of
+		// the NIC this ndpState is associated with. MUST be set when the timer is
+		// set.
+		done *bool
+	}
+
+	// The on-link prefixes discovered through Router Advertisements' Prefix
+	// Information option.
+	onLinkPrefixes map[tcpip.Subnet]onLinkPrefixState
+
+	// The SLAAC prefixes discovered through Router Advertisements' Prefix
+	// Information option.
+	slaacPrefixes map[tcpip.Subnet]slaacPrefixState
+
+	// The last learned DHCPv6 configuration from an NDP RA.
+	dhcpv6Configuration DHCPv6ConfigurationFromNDPRA
+
+	// temporaryIIDHistory is the history value used to generate a new temporary
+	// IID.
+	temporaryIIDHistory [header.IIDSize]byte
+
+	// temporaryAddressDesyncFactor is the preferred lifetime's desync factor for
+	// temporary SLAAC addresses.
+	temporaryAddressDesyncFactor time.Duration
+}
+
+// dadState holds the Duplicate Address Detection timer and the flag used to
+// signal to the DAD timer that DAD should stop.
+type dadState struct {
+	// The DAD timer to send the next NS message, or resolve the address.
+	timer *time.Timer
+
+	// Used to let the DAD timer know that it has been stopped.
+	//
+	// Must only be read from or written to while protected by the lock of
+	// the NIC this dadState is associated with.
+	done *bool
+}
+
+// defaultRouterState holds data associated with a default router discovered
+// through a Router Advertisement (RA).
+type defaultRouterState struct {
+	// Timer to invalidate the default router.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
+}
+
+// onLinkPrefixState holds data associated with an on-link prefix discovered by
+// a Router Advertisement's Prefix Information option (PI) when the NDP
+// configurations were configured to do so.
+type onLinkPrefixState struct {
+	// Timer to invalidate the on-link prefix.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
+}
+
+// tempSLAACAddrState holds state associated with a temporary SLAAC address.
+type tempSLAACAddrState struct {
+	// Timer to deprecate the temporary SLAAC address.
+	//
+	// Must not be nil.
+	deprecationTimer *tcpip.CancellableTimer
+
+	// Timer to invalidate the temporary SLAAC address.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
+
+	// Timer to regenerate the temporary SLAAC address.
+	//
+	// Must not be nil.
+	regenTimer *tcpip.CancellableTimer
+
+	createdAt time.Time // NOTE(review): presumably when the address was generated; confirm
+
+	// The address's endpoint.
+	//
+	// Must not be nil.
+	ref *referencedNetworkEndpoint
+
+	// Has a new temporary SLAAC address already been regenerated?
+	regenerated bool
+}
+
+// slaacPrefixState holds state associated with a SLAAC prefix.
+type slaacPrefixState struct {
+	// Timer to deprecate the prefix.
+	//
+	// Must not be nil.
+	deprecationTimer *tcpip.CancellableTimer
+
+	// Timer to invalidate the prefix.
+	//
+	// Must not be nil.
+	invalidationTimer *tcpip.CancellableTimer
+
+	// Nonzero only when the address is not valid forever.
+	validUntil time.Time
+
+	// Nonzero only when the address is not preferred forever.
+	preferredUntil time.Time
+
+	// State associated with the stable address generated for the prefix.
+	stableAddr struct {
+		// The address's endpoint.
+		//
+		// May only be nil when the address is being (re-)generated. Otherwise,
+		// must not be nil as all SLAAC prefixes must have a stable address.
+		ref *referencedNetworkEndpoint
+
+		// The number of times an address has been generated locally where the NIC
+		// already had the generated address.
+		localGenerationFailures uint8
+	}
+
+	// The temporary (short-lived) addresses generated for the SLAAC prefix.
+	tempAddrs map[tcpip.Address]tempSLAACAddrState
+
+	// The next two fields are used by both stable and temporary addresses
+	// generated for a SLAAC prefix. This is safe as only 1 address will be
+	// in the generation and DAD process at any time. That is, no two addresses
+	// will be generated at the same time for a given SLAAC prefix.
+
+	// The number of times an address has been generated and added to the NIC.
+	//
+	// Addresses may be regenerated in response to DAD conflicts.
+	generationAttempts uint8
+
+	// The maximum number of times to attempt regeneration of a SLAAC address
+	// in response to DAD conflicts.
+	maxGenerationAttempts uint8
+}
+
+// startDuplicateAddressDetection performs Duplicate Address Detection (DAD)
+// for addr.
+//
+// If the NIC's NDP configurations request zero DAD transmissions, ref is
+// promoted to permanent immediately and the NDP dispatcher (if any) is told
+// that DAD resolved. Otherwise a timer is started that sends one NDP Neighbor
+// Solicitation each time it fires and re-arms itself with the NIC's
+// RetransmitTimer until all transmissions have been sent, at which point ref
+// is promoted to permanent and the dispatcher is notified.
+//
+// Returns tcpip.ErrAddressFamilyNotSupported if addr is not a unicast IPv6
+// address. Panics if ref is not tentative or if DAD is already being performed
+// for addr.
+//
+// This function must only be called by IPv6 addresses that are currently
+// tentative.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) startDuplicateAddressDetection(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
+	// addr must be a valid unicast IPv6 address.
+	if !header.IsV6UnicastAddress(addr) {
+		return tcpip.ErrAddressFamilyNotSupported
+	}
+
+	if ref.getKind() != permanentTentative {
+		// The endpoint should be marked as tentative since we are starting DAD.
+		panic(fmt.Sprintf("ndpdad: addr %s is not tentative on NIC(%d)", addr, ndp.nic.ID()))
+	}
+
+	// Should not attempt to perform DAD on an address that is currently in the
+	// DAD process.
+	if _, ok := ndp.dad[addr]; ok {
+		// Should never happen because we should only ever call this function for
+		// newly created addresses. If we attempted to "add" an address that already
+		// existed, we would get an error since we attempted to add a duplicate
+		// address, or its reference count would have been increased without doing
+		// the work that would have been done for an address that was brand new.
+		// See NIC.addAddressLocked.
+		panic(fmt.Sprintf("ndpdad: already performing DAD for addr %s on NIC(%d)", addr, ndp.nic.ID()))
+	}
+
+	remaining := ndp.configs.DupAddrDetectTransmits
+	if remaining == 0 {
+		ref.setKind(permanent)
+
+		// Consider DAD to have resolved even if no DAD messages were actually
+		// transmitted.
+		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, true, nil)
+		}
+
+		return nil
+	}
+
+	// done is shared with stopDuplicateAddressDetection through dadState so the
+	// timer callback can tell that DAD was stopped while it was waiting for, or
+	// had temporarily released, the NIC's lock.
+	var done bool
+	var timer *time.Timer
+	// We initially start a timer to fire immediately because some of the DAD work
+	// cannot be done while holding the NIC's lock. This is effectively the same
+	// as starting a goroutine but we use a timer that fires immediately so we can
+	// reset it for the next DAD iteration.
+	timer = time.AfterFunc(0, func() {
+		ndp.nic.mu.Lock()
+		defer ndp.nic.mu.Unlock()
+
+		if done {
+			// If we reach this point, it means that the DAD timer fired after
+			// another goroutine already obtained the NIC lock and stopped DAD
+			// before this function obtained the NIC lock. Simply return here and do
+			// nothing further.
+			return
+		}
+
+		if ref.getKind() != permanentTentative {
+			// The endpoint should still be marked as tentative since we are still
+			// performing DAD on it.
+			panic(fmt.Sprintf("ndpdad: addr %s is no longer tentative on NIC(%d)", addr, ndp.nic.ID()))
+		}
+
+		dadDone := remaining == 0
+
+		var err *tcpip.Error
+		if !dadDone {
+			// Use the unspecified address as the source address when performing DAD.
+			srcRef := ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint)
+
+			// Do not hold the lock when sending packets which may be a long running
+			// task or may block link address resolution. We know this is safe
+			// because immediately after obtaining the lock again, we check if DAD
+			// has been stopped before doing any work with the NIC. Note, DAD would be
+			// stopped if the NIC was disabled or removed, or if the address was
+			// removed.
+			ndp.nic.mu.Unlock()
+			err = ndp.sendDADPacket(addr, srcRef)
+			ndp.nic.mu.Lock()
+		}
+
+		if done {
+			// If we reach this point, it means that DAD was stopped after we released
+			// the NIC's lock to send the packet and before we obtained it again.
+			return
+		}
+
+		if dadDone {
+			// DAD has resolved.
+			ref.setKind(permanent)
+		} else if err == nil {
+			// DAD is not done and we had no errors when sending the last NDP NS,
+			// schedule the next DAD timer.
+			//
+			// Use the NIC's own NDP configurations for the retransmit interval, the
+			// same configurations the number of transmissions was read from.
+			remaining--
+			timer.Reset(ndp.configs.RetransmitTimer)
+			return
+		}
+
+		// At this point we know that either DAD is done or we hit an error sending
+		// the last NDP NS. Either way, clean up addr's DAD state and let the
+		// integrator know DAD has completed.
+		delete(ndp.dad, addr)
+
+		if ndpDisp := ndp.nic.stack.ndpDisp; ndpDisp != nil {
+			ndpDisp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, dadDone, err)
+		}
+
+		// If DAD resolved for a stable SLAAC address, attempt generation of a
+		// temporary SLAAC address.
+		if dadDone && ref.configType == slaac {
+			// Reset the generation attempts counter as we are starting the generation
+			// of a new address for the SLAAC prefix.
+			ndp.regenerateTempSLAACAddr(ref.addrWithPrefix().Subnet(), true /* resetGenAttempts */)
+		}
+	})
+
+	ndp.dad[addr] = dadState{
+		timer: timer,
+		done:  &done,
+	}
+
+	return nil
+}
+
+// sendDADPacket sends a NS message to see if any nodes on ndp's NIC's link owns
+// addr.
+//
+// The NS is sent to addr's solicited-node multicast address over a route
+// built from ref, which the DAD caller creates for the unspecified address.
+// On success the ICMPv6 NeighborSolicit sent counter is incremented; if
+// writing the packet fails the Dropped counter is incremented and the error
+// is returned.
+//
+// If the route cannot be resolved because the NIC is unknown or in an invalid
+// state (it may have been removed or disabled concurrently), the error is
+// returned. Any other resolution error, or a resolution that is not immediate,
+// causes a panic.
+//
+// addr must be a tentative IPv6 address on ndp's NIC.
+//
+// The NIC ndp belongs to MUST NOT be locked.
+func (ndp *ndpState) sendDADPacket(addr tcpip.Address, ref *referencedNetworkEndpoint) *tcpip.Error {
+	snmc := header.SolicitedNodeAddr(addr)
+
+	r := makeRoute(header.IPv6ProtocolNumber, ref.ep.ID().LocalAddress, snmc, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+	defer r.Release()
+
+	// Route should resolve immediately since snmc is a multicast address so a
+	// remote link address can be calculated without a resolution process.
+	if c, err := r.Resolve(nil); err != nil {
+		// Do not consider the NIC being unknown or disabled as a fatal error.
+		// Since this method is required to be called when the NIC is not locked,
+		// the NIC could have been disabled or removed by another goroutine.
+		if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState {
+			return err
+		}
+
+		panic(fmt.Sprintf("ndp: error when resolving route to send NDP NS for DAD (%s -> %s on NIC(%d)): %s", header.IPv6Any, snmc, ndp.nic.ID(), err))
+	} else if c != nil {
+		panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP NS for DAD (%s -> %s on NIC(%d))", header.IPv6Any, snmc, ndp.nic.ID()))
+	}
+
+	// Build a minimal Neighbor Solicitation (no options) targeting addr.
+	hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + header.ICMPv6NeighborSolicitMinimumSize)
+	pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+	pkt.SetType(header.ICMPv6NeighborSolicit)
+	ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+	ns.SetTargetAddress(addr)
+	pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+	sent := r.Stats().ICMP.V6PacketsSent
+	if err := r.WritePacket(nil,
+		NetworkHeaderParams{
+			Protocol: header.ICMPv6ProtocolNumber,
+			TTL:      header.NDPHopLimit,
+			TOS:      DefaultTOS,
+		}, &PacketBuffer{Header: hdr},
+	); err != nil {
+		sent.Dropped.Increment()
+		return err
+	}
+	sent.NeighborSolicit.Increment()
+
+	return nil
+}
+
+// stopDuplicateAddressDetection aborts the Duplicate Address Detection process
+// that is running for addr, if any.
+//
+// Stopping DAD does not resolve it: the address stays tentative unless some
+// other external event settles the matter (an NA from the true owner of addr,
+// or an NS for addr from another node trying to use it). Callers are expected
+// to deal with that; normally addr is removed from the NIC right after this
+// function returns, or the address has just resolved successfully.
+//
+// When DAD was in progress, the NDP dispatcher (if any) is told that DAD did
+// not resolve.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) stopDuplicateAddressDetection(addr tcpip.Address) {
+	state, inProgress := ndp.dad[addr]
+	if !inProgress {
+		// Nothing to stop for addr.
+		return
+	}
+
+	if t := state.timer; t != nil {
+		t.Stop()
+
+		// Tell a timer callback that may already be waiting for the NIC's lock
+		// that it must not do any further DAD work.
+		*state.done = true
+	}
+
+	delete(ndp.dad, addr)
+
+	disp := ndp.nic.stack.ndpDisp
+	if disp == nil {
+		return
+	}
+
+	// Let the integrator know DAD did not resolve.
+	disp.OnDuplicateAddressDetectionStatus(ndp.nic.ID(), addr, false, nil)
+}
+
+// handleRA processes a Router Advertisement that arrived on the NIC this ndp
+// is for, sent by the router with address ip. It is a no-op when the NIC is
+// not configured to handle RAs or when the stack is forwarding.
+//
+// Processing consists of reporting DHCPv6 configuration changes to the NDP
+// dispatcher, maintaining the set of discovered default routers, and handling
+// the Recursive DNS Server, DNS Search List and Prefix Information options
+// carried by the RA.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) handleRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
+	// Router interface status is not tracked per interface; it is a stack-wide
+	// setting, so the stack's forwarding flag tells us whether the NIC is a
+	// routing interface (which must not process RAs).
+	if !ndp.configs.HandleRAs || ndp.nic.stack.forwarding {
+		return
+	}
+
+	// The DHCPv6 configuration is only of interest to the dispatcher, which is
+	// informed on changes; nothing else is done with it.
+	if disp := ndp.nic.stack.ndpDisp; disp != nil {
+		var advertised DHCPv6ConfigurationFromNDPRA
+		if ra.ManagedAddrConfFlag() {
+			advertised = DHCPv6ManagedAddress
+		} else if ra.OtherConfFlag() {
+			advertised = DHCPv6OtherConfigurations
+		} else {
+			advertised = DHCPv6NoConfiguration
+		}
+
+		if advertised != ndp.dhcpv6Configuration {
+			ndp.dhcpv6Configuration = advertised
+			disp.OnDHCPv6Configuration(ndp.nic.ID(), advertised)
+		}
+	}
+
+	// Default router discovery, if enabled.
+	if ndp.configs.DiscoverDefaultRouters {
+		router, known := ndp.defaultRouters[ip]
+		lifetime := ra.RouterLifetime()
+
+		if known {
+			if lifetime == 0 {
+				// The router must no longer be used as a default router.
+				ndp.invalidateDefaultRouter(ip)
+			} else {
+				// Already discovered; restart its invalidation timer with the
+				// newly advertised lifetime.
+				router.invalidationTimer.StopLocked()
+				router.invalidationTimer.Reset(lifetime)
+				ndp.defaultRouters[ip] = router
+			}
+		} else if lifetime != 0 && len(ndp.defaultRouters) < MaxDiscoveredDefaultRouters {
+			// A new default router, and we still have room to remember it.
+			ndp.rememberDefaultRouter(ip, lifetime)
+		}
+	}
+
+	// TODO(b/141556115): Do (RetransTimer, ReachableTime)) Parameter
+	//                    Discovery.
+
+	// The options are known to be valid as far as wire format is concerned
+	// since we got the Router Advertisement, as documented by this fn, so
+	// iterator errors are not checked.
+	it, _ := ra.Options().Iter(false)
+	for {
+		opt, done, _ := it.Next()
+		if done {
+			break
+		}
+
+		switch opt := opt.(type) {
+		case header.NDPRecursiveDNSServer:
+			if disp := ndp.nic.stack.ndpDisp; disp != nil {
+				addrs, _ := opt.Addresses()
+				disp.OnRecursiveDNSServerOption(ndp.nic.ID(), addrs, opt.Lifetime())
+			}
+
+		case header.NDPDNSSearchList:
+			if disp := ndp.nic.stack.ndpDisp; disp != nil {
+				domainNames, _ := opt.DomainNames()
+				disp.OnDNSSearchListOption(ndp.nic.ID(), domainNames, opt.Lifetime())
+			}
+
+		case header.NDPPrefixInformation:
+			subnet := opt.Subnet()
+
+			// Skip link-local prefixes as per RFC 4861 section 6.3.4 and RFC 4862
+			// section 5.5.3.b (for SLAAC), and skip prefixes of length 0 which are
+			// invalid as all IPv6 addresses cannot be on-link.
+			if header.IsV6LinkLocalAddress(subnet.ID()) || subnet.Prefix() == 0 {
+				continue
+			}
+
+			if opt.OnLinkFlag() {
+				ndp.handleOnLinkPrefixInformation(opt)
+			}
+
+			if opt.AutonomousAddressConfigurationFlag() {
+				ndp.handleAutonomousPrefixInformation(opt)
+			}
+		}
+
+		// TODO(b/141556115): Do (MTU) Parameter Discovery.
+	}
+}
+
+// invalidateDefaultRouter forgets the discovered default router ip: its
+// invalidation timer is stopped, it is removed from the set of default routers
+// and the NDP dispatcher (if any) is notified. Does nothing if ip is not a
+// known default router.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateDefaultRouter(ip tcpip.Address) {
+	router, known := ndp.defaultRouters[ip]
+	if !known {
+		// The router is no longer discovered; nothing to invalidate.
+		return
+	}
+
+	router.invalidationTimer.StopLocked()
+	delete(ndp.defaultRouters, ip)
+
+	disp := ndp.nic.stack.ndpDisp
+	if disp == nil {
+		return
+	}
+
+	// Let the integrator know a discovered default router is invalidated.
+	disp.OnDefaultRouterInvalidated(ndp.nic.ID(), ip)
+}
+
+// rememberDefaultRouter records the newly discovered default router with IPv6
+// link-local address ip and schedules its invalidation after lifetime rl.
+//
+// The router is only remembered if an NDP dispatcher is present and it agrees
+// to remember the router.
+//
+// The router identified by ip MUST NOT already be known by the NIC.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) rememberDefaultRouter(ip tcpip.Address, rl time.Duration) {
+	// Without an integrator, or when the integrator declines the router after
+	// being informed of its discovery, there is nothing to remember.
+	disp := ndp.nic.stack.ndpDisp
+	if disp == nil || !disp.OnDefaultRouterDiscovered(ndp.nic.ID(), ip) {
+		return
+	}
+
+	timer := tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+		ndp.invalidateDefaultRouter(ip)
+	})
+	timer.Reset(rl)
+
+	ndp.defaultRouters[ip] = defaultRouterState{invalidationTimer: timer}
+}
+
+// rememberOnLinkPrefix records the newly discovered on-link prefix with
+// lifetime l. The prefix's invalidation timer is only started when l is
+// finite.
+//
+// The prefix is only remembered if an NDP dispatcher is present and it agrees
+// to remember the prefix.
+//
+// The prefix identified by prefix MUST NOT already be known.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) rememberOnLinkPrefix(prefix tcpip.Subnet, l time.Duration) {
+	// Without an integrator, or when the integrator declines the prefix after
+	// being informed of its discovery, there is nothing to remember.
+	disp := ndp.nic.stack.ndpDisp
+	if disp == nil || !disp.OnOnLinkPrefixDiscovered(ndp.nic.ID(), prefix) {
+		return
+	}
+
+	timer := tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+		ndp.invalidateOnLinkPrefix(prefix)
+	})
+	if l < header.NDPInfiniteLifetime {
+		timer.Reset(l)
+	}
+
+	ndp.onLinkPrefixes[prefix] = onLinkPrefixState{invalidationTimer: timer}
+}
+
+// invalidateOnLinkPrefix forgets the discovered on-link prefix: its
+// invalidation timer is stopped, it is removed from the set of on-link
+// prefixes and the NDP dispatcher (if any) is notified. Does nothing if prefix
+// is not a known on-link prefix.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateOnLinkPrefix(prefix tcpip.Subnet) {
+	state, known := ndp.onLinkPrefixes[prefix]
+	if !known {
+		// The prefix is no longer discovered; nothing to invalidate.
+		return
+	}
+
+	state.invalidationTimer.StopLocked()
+	delete(ndp.onLinkPrefixes, prefix)
+
+	disp := ndp.nic.stack.ndpDisp
+	if disp == nil {
+		return
+	}
+
+	// Let the integrator know a discovered on-link prefix is invalidated.
+	disp.OnOnLinkPrefixInvalidated(ndp.nic.ID(), prefix)
+}
+
+// handleOnLinkPrefixInformation handles a Prefix Information option with
+// its on-link flag set, as per RFC 4861 section 6.3.4.
+//
+// Depending on whether the prefix is already known and on the advertised valid
+// lifetime, the prefix is ignored, remembered, invalidated, or has its
+// invalidation timer refreshed.
+//
+// handleOnLinkPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the on-link flag is set.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) handleOnLinkPrefixInformation(pi header.NDPPrefixInformation) {
+	prefix := pi.Subnet()
+	state, known := ndp.onLinkPrefixes[prefix]
+	vl := pi.ValidLifetime()
+
+	switch {
+	case !known && vl == 0:
+		// An unknown prefix with a zero valid lifetime; just ignore it.
+		return
+
+	case !known:
+		// A new on-link prefix. Only remember it if discovery is enabled and we
+		// currently know about less than MaxDiscoveredOnLinkPrefixes on-link
+		// prefixes.
+		if ndp.configs.DiscoverOnLinkPrefixes && len(ndp.onLinkPrefixes) < MaxDiscoveredOnLinkPrefixes {
+			ndp.rememberOnLinkPrefix(prefix, vl)
+		}
+		return
+
+	case vl == 0:
+		// A known prefix that is no longer to be considered on-link.
+		ndp.invalidateOnLinkPrefix(prefix)
+		return
+	}
+
+	// A known prefix with a new non-zero valid lifetime: restart the
+	// invalidation timer, leaving it stopped if the new lifetime is infinite.
+	state.invalidationTimer.StopLocked()
+	if vl < header.NDPInfiniteLifetime {
+		state.invalidationTimer.Reset(vl)
+	}
+
+	ndp.onLinkPrefixes[prefix] = state
+}
+
+// handleAutonomousPrefixInformation handles a Prefix Information option with
+// its autonomous flag set, as per RFC 4862 section 5.5.3.
+//
+// For a prefix that already has SLAAC state the lifetimes are refreshed; for a
+// new prefix SLAAC is performed if ndp is configured to auto-generate global
+// addresses.
+//
+// handleAutonomousPrefixInformation assumes that the prefix this pi is for is
+// not the link-local prefix and the autonomous flag is set.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) handleAutonomousPrefixInformation(pi header.NDPPrefixInformation) {
+	validLifetime := pi.ValidLifetime()
+	preferredLifetime := pi.PreferredLifetime()
+
+	// As per RFC 4862 section 5.5.3.c, silently ignore the Prefix Information
+	// option if the preferred lifetime is greater than the valid lifetime.
+	if preferredLifetime > validLifetime {
+		return
+	}
+
+	prefix := pi.Subnet()
+	state, known := ndp.slaacPrefixes[prefix]
+
+	switch {
+	case known:
+		// SLAAC state is already maintained for prefix; as per RFC 4862 section
+		// 5.5.3.e, refresh its lifetimes.
+		ndp.refreshSLAACPrefixLifetimes(prefix, &state, preferredLifetime, validLifetime)
+		ndp.slaacPrefixes[prefix] = state
+
+	case ndp.configs.AutoGenGlobalAddresses:
+		// prefix is a new SLAAC prefix and ndp is configured to auto-generate
+		// new addresses via SLAAC; do the work outlined by RFC 4862 section
+		// 5.5.3.d.
+		ndp.doSLAAC(prefix, preferredLifetime, validLifetime)
+	}
+}
+
+// doSLAAC sets up SLAAC state for prefix: it generates a stable address with
+// the provided lifetimes, arms the prefix's deprecation and invalidation
+// timers and, if the stable address is already assigned, generates a
+// temporary address as well.
+//
+// Nothing is done when vl is zero, when the prefix length is not valid for
+// address auto-generation, or when no address could be generated.
+//
+// pl is the new preferred lifetime. vl is the new valid lifetime.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) doSLAAC(prefix tcpip.Subnet, pl, vl time.Duration) {
+	// As per RFC 4862 section 5.5.3.d, a new prefix with a zero valid lifetime
+	// is ignored, and the prefix length must be valid to generate an IPv6
+	// address from an interface identifier (IID).
+	if vl == 0 || prefix.Prefix() != validPrefixLenForAutoGen {
+		return
+	}
+
+	onDeprecate := func() {
+		current, found := ndp.slaacPrefixes[prefix]
+		if !found {
+			panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the deprecated SLAAC prefix %s", prefix))
+		}
+
+		ndp.deprecateSLAACAddress(current.stableAddr.ref)
+	}
+	onInvalidate := func() {
+		current, found := ndp.slaacPrefixes[prefix]
+		if !found {
+			panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for the invalidated SLAAC prefix %s", prefix))
+		}
+
+		ndp.invalidateSLAACPrefix(prefix, current)
+	}
+
+	prefixState := slaacPrefixState{
+		deprecationTimer:      tcpip.NewCancellableTimer(&ndp.nic.mu, onDeprecate),
+		invalidationTimer:     tcpip.NewCancellableTimer(&ndp.nic.mu, onInvalidate),
+		tempAddrs:             make(map[tcpip.Address]tempSLAACAddrState),
+		maxGenerationAttempts: ndp.configs.AutoGenAddressConflictRetries + 1,
+	}
+
+	start := time.Now()
+	finitePreferred := pl < header.NDPInfiniteLifetime
+
+	// The time the address is preferred until must be known before the address
+	// is generated.
+	if finitePreferred {
+		prefixState.preferredUntil = start.Add(pl)
+	}
+
+	if !ndp.generateSLAACAddr(prefix, &prefixState) {
+		// No address could be generated for the prefix, so there is no reason to
+		// maintain state or timers for it.
+		return
+	}
+
+	// Arm the initial deprecation and invalidation timers for the prefix.
+	if finitePreferred && pl != 0 {
+		prefixState.deprecationTimer.Reset(pl)
+	}
+
+	if vl < header.NDPInfiniteLifetime {
+		prefixState.invalidationTimer.Reset(vl)
+		prefixState.validUntil = start.Add(vl)
+	}
+
+	// If the stable address is assigned (DAD resolved), generate a temporary
+	// address, resetting the generation attempts counter as this starts the
+	// generation of a new address for the SLAAC prefix.
+	if prefixState.stableAddr.ref.getKind() == permanent {
+		ndp.generateTempSLAACAddr(prefix, &prefixState, true /* resetGenAttempts */)
+	}
+
+	ndp.slaacPrefixes[prefix] = prefixState
+}
+
+// addSLAACAddr adds the SLAAC address addr to the NIC as a permanent IPv6
+// address with the given configuration type and deprecation status.
+//
+// Returns the endpoint for the added address, or nil if there is no NDP
+// dispatcher or the dispatcher asked for the address not to be added. Panics
+// if adding the address to the NIC fails.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) addSLAACAddr(addr tcpip.AddressWithPrefix, configType networkEndpointConfigType, deprecated bool) *referencedNetworkEndpoint {
+	// Inform the integrator that we have a new SLAAC address; it may tell us
+	// not to add the address.
+	disp := ndp.nic.stack.ndpDisp
+	if disp == nil || !disp.OnAutoGenAddress(ndp.nic.ID(), addr) {
+		return nil
+	}
+
+	toAdd := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: addr,
+	}
+
+	endpoint, err := ndp.nic.addAddressLocked(toAdd, FirstPrimaryEndpoint, permanent, configType, deprecated)
+	if err == nil {
+		return endpoint
+	}
+
+	panic(fmt.Sprintf("ndp: error when adding SLAAC address %+v: %s", toAdd, err))
+}
+
+// generateSLAACAddr generates a stable SLAAC address for prefix and adds it to
+// the NIC.
+//
+// The interface identifier is generated with opaque IIDs when the stack is
+// configured for them, otherwise from the modified EUI-64 of the NIC's link
+// address. Generation is retried locally (up to
+// maxSLAACAddrLocalRegenAttempts times) while the NIC already has the
+// generated address.
+//
+// The address is added as deprecated only if the prefix has a finite preferred
+// lifetime that has already elapsed; a zero state.preferredUntil means the
+// prefix is preferred forever.
+//
+// Returns true if an address was successfully generated, in which case
+// state.stableAddr.ref is set and state.generationAttempts is incremented.
+//
+// Panics if the prefix is not a SLAAC prefix or it already has an address.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) generateSLAACAddr(prefix tcpip.Subnet, state *slaacPrefixState) bool {
+	if r := state.stableAddr.ref; r != nil {
+		panic(fmt.Sprintf("ndp: SLAAC prefix %s already has a permenant address %s", prefix, r.addrWithPrefix()))
+	}
+
+	// If we have already reached the maximum address generation attempts for the
+	// prefix, do not generate another address.
+	if state.generationAttempts == state.maxGenerationAttempts {
+		return false
+	}
+
+	var generatedAddr tcpip.AddressWithPrefix
+	addrBytes := []byte(prefix.ID())
+
+	for i := 0; ; i++ {
+		// If we were unable to generate an address after the maximum SLAAC address
+		// local regeneration attempts, do nothing further.
+		if i == maxSLAACAddrLocalRegenAttempts {
+			return false
+		}
+
+		dadCounter := state.generationAttempts + state.stableAddr.localGenerationFailures
+		if oIID := ndp.nic.stack.opaqueIIDOpts; oIID.NICNameFromID != nil {
+			addrBytes = header.AppendOpaqueInterfaceIdentifier(
+				addrBytes[:header.IIDOffsetInIPv6Address],
+				prefix,
+				oIID.NICNameFromID(ndp.nic.ID(), ndp.nic.name),
+				dadCounter,
+				oIID.SecretKey,
+			)
+		} else if dadCounter == 0 {
+			// Modified-EUI64 based IIDs have no way to resolve DAD conflicts, so if
+			// the DAD counter is non-zero, we cannot use this method.
+			//
+			// Only attempt to generate an interface-specific IID if we have a valid
+			// link address.
+			//
+			// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+			// LinkEndpoint.LinkAddress) before reaching this point.
+			linkAddr := ndp.nic.linkEP.LinkAddress()
+			if !header.IsValidUnicastEthernetAddress(linkAddr) {
+				return false
+			}
+
+			// Generate an address within prefix from the modified EUI-64 of ndp's
+			// NIC's Ethernet MAC address.
+			header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, addrBytes[header.IIDOffsetInIPv6Address:])
+		} else {
+			// We have no way to regenerate an address in response to an address
+			// conflict when addresses are not generated with opaque IIDs.
+			return false
+		}
+
+		generatedAddr = tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(addrBytes),
+			PrefixLen: validPrefixLenForAutoGen,
+		}
+
+		if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) {
+			break
+		}
+
+		state.stableAddr.localGenerationFailures++
+	}
+
+	// preferredUntil is the zero time when the prefix is preferred forever, in
+	// which case the address must not be considered deprecated. Checking only
+	// time.Since would treat the zero time as a deadline in the distant past.
+	deprecated := !state.preferredUntil.IsZero() && time.Since(state.preferredUntil) >= 0
+
+	if ref := ndp.addSLAACAddr(generatedAddr, slaac, deprecated); ref != nil {
+		state.stableAddr.ref = ref
+		state.generationAttempts++
+		return true
+	}
+
+	return false
+}
+
+// regenerateSLAACAddr generates a new stable address for an existing SLAAC
+// prefix.
+//
+// If a new address cannot be generated the prefix is invalidated, as there is
+// no reason to maintain state for a SLAAC prefix without an address.
+//
+// Panics if no SLAAC state exists for prefix.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) regenerateSLAACAddr(prefix tcpip.Subnet) {
+	prefixState, known := ndp.slaacPrefixes[prefix]
+	if !known {
+		panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate address for %s", prefix))
+	}
+
+	if !ndp.generateSLAACAddr(prefix, &prefixState) {
+		// No permanent address could be generated for the prefix.
+		ndp.invalidateSLAACPrefix(prefix, prefixState)
+		return
+	}
+
+	// Persist the updated state (new endpoint and generation counters).
+	ndp.slaacPrefixes[prefix] = prefixState
+}
+
+// generateTempSLAACAddr generates a new temporary SLAAC address for prefix,
+// adds it to the NIC and records it, together with its deprecation,
+// invalidation and regeneration timers, in prefixState.tempAddrs.
+//
+// prefixState is modified in place; the caller is responsible for storing it
+// back into ndp.slaacPrefixes.
+//
+// If resetGenAttempts is true, the prefix's generation counter will be reset.
+//
+// Returns true if a new address was generated.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) generateTempSLAACAddr(prefix tcpip.Subnet, prefixState *slaacPrefixState, resetGenAttempts bool) bool {
+	// Are we configured to auto-generate new temporary global addresses for the
+	// prefix?
+	if !ndp.configs.AutoGenTempGlobalAddresses || prefix == header.IPv6LinkLocalPrefix.Subnet() {
+		return false
+	}
+
+	if resetGenAttempts {
+		prefixState.generationAttempts = 0
+		prefixState.maxGenerationAttempts = ndp.configs.AutoGenAddressConflictRetries + 1
+	}
+
+	// If we have already reached the maximum address generation attempts for the
+	// prefix, do not generate another address.
+	if prefixState.generationAttempts == prefixState.maxGenerationAttempts {
+		return false
+	}
+
+	stableAddr := prefixState.stableAddr.ref.ep.ID().LocalAddress
+	now := time.Now()
+
+	// As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary
+	// address is the lower of the valid lifetime of the stable address or the
+	// maximum temporary address valid lifetime.
+	//
+	// A zero validUntil means the prefix is valid forever, so only the
+	// configured maximum applies.
+	vl := ndp.configs.MaxTempAddrValidLifetime
+	if prefixState.validUntil != (time.Time{}) {
+		if prefixVL := prefixState.validUntil.Sub(now); vl > prefixVL {
+			vl = prefixVL
+		}
+	}
+
+	if vl <= 0 {
+		// Cannot create an address without a valid lifetime.
+		return false
+	}
+
+	// As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary
+	// address is the lower of the preferred lifetime of the stable address or the
+	// maximum temporary address preferred lifetime - the temporary address desync
+	// factor.
+	//
+	// A zero preferredUntil means the prefix is preferred forever, so only the
+	// configured maximum applies.
+	pl := ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor
+	if prefixState.preferredUntil != (time.Time{}) {
+		if prefixPL := prefixState.preferredUntil.Sub(now); pl > prefixPL {
+			// Respect the preferred lifetime of the prefix, as per RFC 4941 section
+			// 3.3 step 4.
+			pl = prefixPL
+		}
+	}
+
+	// As per RFC 4941 section 3.3 step 5, a temporary address is created only if
+	// the calculated preferred lifetime is greater than the advance regeneration
+	// duration. In particular, we MUST NOT create a temporary address with a zero
+	// Preferred Lifetime.
+	if pl <= ndp.configs.RegenAdvanceDuration {
+		return false
+	}
+
+	// Attempt to generate a new address that is not already assigned to the NIC.
+	var generatedAddr tcpip.AddressWithPrefix
+	for i := 0; ; i++ {
+		// If we were unable to generate an address after the maximum SLAAC address
+		// local regeneration attempts, do nothing further.
+		if i == maxSLAACAddrLocalRegenAttempts {
+			return false
+		}
+
+		generatedAddr = header.GenerateTempIPv6SLAACAddr(ndp.temporaryIIDHistory[:], stableAddr)
+		if !ndp.nic.hasPermanentAddrLocked(generatedAddr.Address) {
+			break
+		}
+	}
+
+	// As per RFC 4941 section 3.3 step 5, we MUST NOT create a temporary
+	// address with a zero preferred lifetime. The checks above ensure this
+	// so we know the address is not deprecated.
+	ref := ndp.addSLAACAddr(generatedAddr, slaacTemp, false /* deprecated */)
+	if ref == nil {
+		return false
+	}
+
+	// Each timer callback looks the prefix and temporary address state up again
+	// when it fires instead of capturing it, as the state stored in the maps may
+	// have been replaced since the timers were created.
+	state := tempSLAACAddrState{
+		deprecationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			prefixState, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to deprecate temporary address %s", prefix, generatedAddr))
+			}
+
+			tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a tempAddr entry to deprecate temporary address %s", generatedAddr))
+			}
+
+			ndp.deprecateSLAACAddress(tempAddrState.ref)
+		}),
+		invalidationTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			prefixState, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to invalidate temporary address %s", prefix, generatedAddr))
+			}
+
+			tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a tempAddr entry to invalidate temporary address %s", generatedAddr))
+			}
+
+			ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, generatedAddr.Address, tempAddrState)
+		}),
+		regenTimer: tcpip.NewCancellableTimer(&ndp.nic.mu, func() {
+			prefixState, ok := ndp.slaacPrefixes[prefix]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry for %s to regenerate temporary address after %s", prefix, generatedAddr))
+			}
+
+			tempAddrState, ok := prefixState.tempAddrs[generatedAddr.Address]
+			if !ok {
+				panic(fmt.Sprintf("ndp: must have a tempAddr entry to regenerate temporary address after %s", generatedAddr))
+			}
+
+			// If an address has already been regenerated for this address, don't
+			// regenerate another address.
+			if tempAddrState.regenerated {
+				return
+			}
+
+			// Reset the generation attempts counter as we are starting the generation
+			// of a new address for the SLAAC prefix.
+			tempAddrState.regenerated = ndp.generateTempSLAACAddr(prefix, &prefixState, true /* resetGenAttempts */)
+			prefixState.tempAddrs[generatedAddr.Address] = tempAddrState
+			ndp.slaacPrefixes[prefix] = prefixState
+		}),
+		createdAt: now,
+		ref:       ref,
+	}
+
+	// The regeneration timer fires RegenAdvanceDuration before the address is
+	// deprecated; pl is known to be greater than RegenAdvanceDuration here.
+	state.deprecationTimer.Reset(pl)
+	state.invalidationTimer.Reset(vl)
+	state.regenTimer.Reset(pl - ndp.configs.RegenAdvanceDuration)
+
+	prefixState.generationAttempts++
+	prefixState.tempAddrs[generatedAddr.Address] = state
+
+	return true
+}
+
+// regenerateTempSLAACAddr attempts to generate a new temporary address for an
+// existing SLAAC prefix and stores the resulting prefix state, whether or not
+// an address was generated.
+//
+// If resetGenAttempts is true, the prefix's generation counter is reset first.
+//
+// Panics if no SLAAC state exists for prefix.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) regenerateTempSLAACAddr(prefix tcpip.Subnet, resetGenAttempts bool) {
+	prefixState, known := ndp.slaacPrefixes[prefix]
+	if !known {
+		panic(fmt.Sprintf("ndp: SLAAC prefix state not found to regenerate temporary address for %s", prefix))
+	}
+
+	// The outcome is deliberately ignored; the state is written back either way
+	// since the generation counters may have changed.
+	_ = ndp.generateTempSLAACAddr(prefix, &prefixState, resetGenAttempts)
+	ndp.slaacPrefixes[prefix] = prefixState
+}
+
+// refreshSLAACPrefixLifetimes refreshes the lifetimes of a SLAAC prefix and of
+// its temporary addresses. pl is the new preferred lifetime and vl the new
+// valid lifetime; prefixState is updated in place through the pointer.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) refreshSLAACPrefixLifetimes(prefix tcpip.Subnet, prefixState *slaacPrefixState, pl, vl time.Duration) {
+	// If the preferred lifetime is zero, then the prefix should be deprecated.
+	deprecated := pl == 0
+	if deprecated {
+		ndp.deprecateSLAACAddress(prefixState.stableAddr.ref)
+	} else {
+		prefixState.stableAddr.ref.deprecated = false
+	}
+
+	// If prefix was preferred for some finite lifetime before, stop the
+	// deprecation timer so it can be reset.
+	prefixState.deprecationTimer.StopLocked()
+
+	now := time.Now()
+
+	// Reset the deprecation timer if prefix has a finite preferred lifetime.
+	if pl < header.NDPInfiniteLifetime {
+		if !deprecated {
+			prefixState.deprecationTimer.Reset(pl)
+		}
+		prefixState.preferredUntil = now.Add(pl)
+	} else {
+		prefixState.preferredUntil = time.Time{}
+	}
+
+	// As per RFC 4862 section 5.5.3.e, update the valid lifetime for prefix:
+	//
+	// 1) If the received Valid Lifetime is greater than 2 hours or greater than
+	//    RemainingLifetime, set the valid lifetime of the prefix to the
+	//    advertised Valid Lifetime.
+	//
+	// 2) If RemainingLifetime is less than or equal to 2 hours, ignore the
+	//    advertised Valid Lifetime.
+	//
+	// 3) Otherwise, reset the valid lifetime of the prefix to 2 hours.
+
+	if vl >= header.NDPInfiniteLifetime {
+		// Handle the infinite valid lifetime separately as we do not keep a timer
+		// in this case.
+		prefixState.invalidationTimer.StopLocked()
+		prefixState.validUntil = time.Time{}
+	} else {
+		var effectiveVl time.Duration
+		var rl time.Duration
+
+		// If the prefix was originally set to be valid forever, assume the
+		// remaining time to be the maximum possible value.
+		if prefixState.validUntil == (time.Time{}) {
+			rl = header.NDPInfiniteLifetime
+		} else {
+			rl = time.Until(prefixState.validUntil)
+		}
+
+		if vl > MinPrefixInformationValidLifetimeForUpdate || vl > rl {
+			effectiveVl = vl
+		} else if rl > MinPrefixInformationValidLifetimeForUpdate {
+			effectiveVl = MinPrefixInformationValidLifetimeForUpdate
+		}
+
+		if effectiveVl != 0 {
+			prefixState.invalidationTimer.StopLocked()
+			prefixState.invalidationTimer.Reset(effectiveVl)
+			prefixState.validUntil = now.Add(effectiveVl)
+		}
+	}
+
+	// If DAD is not yet complete on the stable address, there is no need to do
+	// work with temporary addresses.
+	if prefixState.stableAddr.ref.getKind() != permanent {
+		return
+	}
+
+	// Note, we do not need to update the entries in the temporary address map
+	// after updating the timers because the timers are held as pointers.
+	var regenForAddr tcpip.Address
+	allAddressesRegenerated := true
+	for tempAddr, tempAddrState := range prefixState.tempAddrs {
+		// As per RFC 4941 section 3.3 step 4, the valid lifetime of a temporary
+		// address is the lower of the valid lifetime of the stable address or the
+		// maximum temporary address valid lifetime. Note, the valid lifetime of a
+		// temporary address is relative to the address's creation time.
+		validUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrValidLifetime)
+		if prefixState.validUntil != (time.Time{}) && validUntil.Sub(prefixState.validUntil) > 0 {
+			validUntil = prefixState.validUntil
+		}
+
+		// If the address is no longer valid, invalidate it immediately. Otherwise,
+		// reset the invalidation timer.
+		newValidLifetime := validUntil.Sub(now)
+		if newValidLifetime <= 0 {
+			ndp.invalidateTempSLAACAddr(prefixState.tempAddrs, tempAddr, tempAddrState)
+			continue
+		}
+		tempAddrState.invalidationTimer.StopLocked()
+		tempAddrState.invalidationTimer.Reset(newValidLifetime)
+
+		// As per RFC 4941 section 3.3 step 4, the preferred lifetime of a temporary
+		// address is the lower of the preferred lifetime of the stable address or
+		// the maximum temporary address preferred lifetime - the temporary address
+		// desync factor. Note, the preferred lifetime of a temporary address is
+		// relative to the address's creation time.
+		preferredUntil := tempAddrState.createdAt.Add(ndp.configs.MaxTempAddrPreferredLifetime - ndp.temporaryAddressDesyncFactor)
+		if prefixState.preferredUntil != (time.Time{}) && preferredUntil.Sub(prefixState.preferredUntil) > 0 {
+			preferredUntil = prefixState.preferredUntil
+		}
+
+		// If the address is no longer preferred, deprecate it immediately.
+		// Otherwise, reset the deprecation timer.
+		newPreferredLifetime := preferredUntil.Sub(now)
+		tempAddrState.deprecationTimer.StopLocked()
+		if newPreferredLifetime <= 0 {
+			ndp.deprecateSLAACAddress(tempAddrState.ref)
+		} else {
+			tempAddrState.ref.deprecated = false
+			tempAddrState.deprecationTimer.Reset(newPreferredLifetime)
+		}
+
+		tempAddrState.regenTimer.StopLocked()
+		// Only addresses that have not been regenerated yet need a regen timer.
+		if !tempAddrState.regenerated {
+			allAddressesRegenerated = false
+
+			if newPreferredLifetime <= ndp.configs.RegenAdvanceDuration {
+				// The new preferred lifetime is less than the advance regeneration
+				// duration so regenerate an address for this temporary address
+				// immediately after we finish iterating over the temporary addresses.
+				regenForAddr = tempAddr
+			} else {
+				tempAddrState.regenTimer.Reset(newPreferredLifetime - ndp.configs.RegenAdvanceDuration)
+			}
+		}
+	}
+
+	// Generate a new temporary address if all of the existing temporary addresses
+	// have been regenerated, or we need to immediately regenerate an address
+	// due to an update in preferred lifetime.
+	//
+	// If each temporary address has already been regenerated, no new temporary
+	// address will be generated. To ensure continuation of temporary SLAAC
+	// addresses, we manually try to regenerate an address here.
+	if len(regenForAddr) != 0 || allAddressesRegenerated {
+		// Reset the generation attempts counter as we are starting the generation
+		// of a new address for the SLAAC prefix.
+		if state, ok := prefixState.tempAddrs[regenForAddr]; ndp.generateTempSLAACAddr(prefix, prefixState, true /* resetGenAttempts */) && ok {
+			state.regenerated = true
+			prefixState.tempAddrs[regenForAddr] = state
+		}
+	}
+}
+
+// deprecateSLAACAddress flags ref as deprecated and, if the stack has an NDP
+// dispatcher, tells it that the address behind ref was deprecated.
+//
+// Calling it for a ref that is already deprecated is a no-op.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) deprecateSLAACAddress(ref *referencedNetworkEndpoint) {
+	if ref.deprecated {
+		return
+	}
+
+	ref.deprecated = true
+	if disp := ndp.nic.stack.ndpDisp; disp != nil {
+		disp.OnAutoGenAddressDeprecated(ndp.nic.ID(), ref.addrWithPrefix())
+	}
+}
+
+// invalidateSLAACPrefix removes the prefix's stable address from the NIC (when
+// one is still attached) and then releases the prefix's remaining resources.
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateSLAACPrefix(prefix tcpip.Subnet, state slaacPrefixState) {
+	if stableRef := state.stableAddr.ref; stableRef != nil {
+		// The prefix is being invalidated right here, so the address removal
+		// must not trigger another SLAAC invalidation of the prefix.
+		if removeErr := ndp.nic.removePermanentIPv6EndpointLocked(stableRef, false /* allowSLAACInvalidation */); removeErr != nil {
+			panic(fmt.Sprintf("ndp: error removing stable SLAAC address %s: %s", stableRef.addrWithPrefix(), removeErr))
+		}
+	}
+
+	ndp.cleanupSLAACPrefixResources(prefix, state)
+}
+
+// cleanupSLAACAddrResourcesAndNotify reports addr as invalidated to the NDP
+// dispatcher (if any) and releases the SLAAC prefix state tied to addr.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidatePrefix bool) {
+	if disp := ndp.nic.stack.ndpDisp; disp != nil {
+		disp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr)
+	}
+
+	subnet := addr.Subnet()
+	prefixState, found := ndp.slaacPrefixes[subnet]
+	if !found || prefixState.stableAddr.ref == nil || addr.Address != prefixState.stableAddr.ref.ep.ID().LocalAddress {
+		return
+	}
+
+	if !invalidatePrefix {
+		// The prefix itself stays valid: only detach the stable address from
+		// it and store the updated state back.
+		prefixState.stableAddr.ref = nil
+		ndp.slaacPrefixes[subnet] = prefixState
+		return
+	}
+
+	ndp.cleanupSLAACPrefixResources(subnet, prefixState)
+}
+
+// cleanupSLAACPrefixResources cleans up a SLAAC prefix's timers and entry.
+//
+// NOTE(review): an unknown prefix does not panic here (delete is a no-op); confirm intent.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupSLAACPrefixResources(prefix tcpip.Subnet, state slaacPrefixState) {
+	// Invalidate all temporary addresses.
+	for tempAddr, tempAddrState := range state.tempAddrs {
+		ndp.invalidateTempSLAACAddr(state.tempAddrs, tempAddr, tempAddrState)
+	}
+
+	state.stableAddr.ref = nil
+	state.deprecationTimer.StopLocked()
+	state.invalidationTimer.StopLocked()
+	delete(ndp.slaacPrefixes, prefix)
+}
+
+// invalidateTempSLAACAddr removes a temporary SLAAC address from the NIC and
+// releases its timers and tempAddrs entry. Panics if the removal fails.
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) invalidateTempSLAACAddr(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) {
+	// The address is already being invalidated here, so ask the removal path
+	// not to perform SLAAC invalidation for it again.
+	if err := ndp.nic.removePermanentIPv6EndpointLocked(tempAddrState.ref, false /* allowSLAACInvalidation */); err != nil {
+		panic(fmt.Sprintf("error removing temporary SLAAC address %s: %s", tempAddrState.ref.addrWithPrefix(), err))
+	}
+
+	ndp.cleanupTempSLAACAddrResources(tempAddrs, tempAddr, tempAddrState)
+}
+
+// cleanupTempSLAACAddrResourcesAndNotify reports addr as invalidated to the NDP
+// dispatcher (if any) and, when invalidateAddr is set, releases addr's state.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupTempSLAACAddrResourcesAndNotify(addr tcpip.AddressWithPrefix, invalidateAddr bool) {
+	if disp := ndp.nic.stack.ndpDisp; disp != nil {
+		disp.OnAutoGenAddressInvalidated(ndp.nic.ID(), addr)
+	}
+
+	if !invalidateAddr {
+		return
+	}
+
+	subnet := addr.Subnet()
+	prefixState, found := ndp.slaacPrefixes[subnet]
+	if !found {
+		panic(fmt.Sprintf("ndp: must have a slaacPrefixes entry to clean up temp addr %s resources", addr))
+	}
+
+	addrState, found := prefixState.tempAddrs[addr.Address]
+	if !found {
+		panic(fmt.Sprintf("ndp: must have a tempAddr entry to clean up temp addr %s resources", addr))
+	}
+
+	ndp.cleanupTempSLAACAddrResources(prefixState.tempAddrs, addr.Address, addrState)
+}
+
+// cleanupTempSLAACAddrResources stops a temporary SLAAC address's deprecation,
+// invalidation and regeneration timers and deletes its entry from tempAddrs.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupTempSLAACAddrResources(tempAddrs map[tcpip.Address]tempSLAACAddrState, tempAddr tcpip.Address, tempAddrState tempSLAACAddrState) {
+	tempAddrState.deprecationTimer.StopLocked()
+	tempAddrState.invalidationTimer.StopLocked()
+	tempAddrState.regenTimer.StopLocked()
+	delete(tempAddrs, tempAddr)
+}
+
+// cleanupState tears down the state that ndp has learned or generated:
+// SLAAC prefixes (with their addresses), discovered on-link prefixes and
+// discovered default routers. It also resets ndp.dhcpv6Configuration.
+//
+// hostOnly selects host-specific cleanup. It MUST be true when ndp's NIC is
+// transitioning from a host to a router.
+//
+// When hostOnly is true the link-local SLAAC prefix is skipped, since routers
+// are also expected to generate a link-local address.
+//
+// cleanupState panics if any state that should have been removed remains.
+//
+// The NIC that ndp belongs to MUST be locked.
+func (ndp *ndpState) cleanupState(hostOnly bool) {
+	llSubnet := header.IPv6LinkLocalPrefix.Subnet()
+	keptLinkLocal := 0
+	for prefix, prefixState := range ndp.slaacPrefixes {
+		// Routers are also expected to generate a link-local address (RFC 4862
+		// section 5), so the link-local prefix survives a host-only cleanup and
+		// is only counted here.
+		if hostOnly && prefix == llSubnet {
+			keptLinkLocal++
+			continue
+		}
+
+		ndp.invalidateSLAACPrefix(prefix, prefixState)
+	}
+
+	if remaining := len(ndp.slaacPrefixes); remaining != keptLinkLocal {
+		panic(fmt.Sprintf("ndp: still have non-linklocal SLAAC prefixes after cleaning up; found = %d prefixes, of which %d are link-local", remaining, keptLinkLocal))
+	}
+
+	for prefix := range ndp.onLinkPrefixes {
+		ndp.invalidateOnLinkPrefix(prefix)
+	}
+
+	if remaining := len(ndp.onLinkPrefixes); remaining != 0 {
+		panic(fmt.Sprintf("ndp: still have discovered on-link prefixes after cleaning up; found = %d", remaining))
+	}
+
+	for router := range ndp.defaultRouters {
+		ndp.invalidateDefaultRouter(router)
+	}
+
+	if remaining := len(ndp.defaultRouters); remaining != 0 {
+		panic(fmt.Sprintf("ndp: still have discovered default routers after cleaning up; found = %d", remaining))
+	}
+
+	ndp.dhcpv6Configuration = 0
+}
+
+// startSolicitingRouters starts soliciting routers, as per RFC 4861 section
+// 6.3.7. If routers are already being solicited, this function does nothing.
+//
+// The NIC ndp belongs to MUST be locked.
+func (ndp *ndpState) startSolicitingRouters() {
+	if ndp.rtrSolicit.timer != nil {
+		// We are already soliciting routers.
+		return
+	}
+
+	remaining := ndp.configs.MaxRtrSolicitations
+	if remaining == 0 {
+		return
+	}
+
+	// Calculate the random delay before sending our first RS, as per RFC
+	// 4861 section 6.3.7.
+	var delay time.Duration
+	if ndp.configs.MaxRtrSolicitationDelay > 0 {
+		delay = time.Duration(rand.Int63n(int64(ndp.configs.MaxRtrSolicitationDelay)))
+	}
+
+	var done bool
+	ndp.rtrSolicit.done = &done
+	ndp.rtrSolicit.timer = time.AfterFunc(delay, func() {
+		ndp.nic.mu.Lock()
+		if done {
+			// If we reach this point, it means that the RS timer fired after another
+			// goroutine already obtained the NIC lock and stopped solicitations.
+			// Simply return here and do nothing further.
+			ndp.nic.mu.Unlock()
+			return
+		}
+
+		// As per RFC 4861 section 4.1, the source of the RS is an address assigned
+		// to the sending interface, or the unspecified address if no address is
+		// assigned to the sending interface.
+		ref := ndp.nic.primaryIPv6EndpointRLocked(header.IPv6AllRoutersMulticastAddress)
+		if ref == nil {
+			ref = ndp.nic.getRefOrCreateTempLocked(header.IPv6ProtocolNumber, header.IPv6Any, NeverPrimaryEndpoint)
+		}
+		ndp.nic.mu.Unlock()
+
+		localAddr := ref.ep.ID().LocalAddress
+		r := makeRoute(header.IPv6ProtocolNumber, localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.linkEP.LinkAddress(), ref, false, false)
+		defer r.Release()
+
+		// Route should resolve immediately since
+		// header.IPv6AllRoutersMulticastAddress is a multicast address so a
+		// remote link address can be calculated without a resolution process.
+		if c, err := r.Resolve(nil); err != nil {
+			// Do not consider the NIC being unknown or disabled as a fatal error.
+			// Since this method is required to be called when the NIC is not locked,
+			// the NIC could have been disabled or removed by another goroutine.
+			if err == tcpip.ErrUnknownNICID || err == tcpip.ErrInvalidEndpointState {
+				return
+			}
+
+			panic(fmt.Sprintf("ndp: error when resolving route to send NDP RS (%s -> %s on NIC(%d)): %s", localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID(), err))
+		} else if c != nil {
+			panic(fmt.Sprintf("ndp: route resolution not immediate for route to send NDP RS (%s -> %s on NIC(%d))", localAddr, header.IPv6AllRoutersMulticastAddress, ndp.nic.ID()))
+		}
+
+		// As per RFC 4861 section 4.1, an NDP RS SHOULD include the source
+		// link-layer address option if the source address of the NDP RS is
+		// specified. This option MUST NOT be included if the source address is
+		// unspecified.
+		//
+		// TODO(b/141011931): Validate a LinkEndpoint's link address (provided by
+		// LinkEndpoint.LinkAddress) before reaching this point.
+		var optsSerializer header.NDPOptionsSerializer
+		if localAddr != header.IPv6Any && header.IsValidUnicastEthernetAddress(r.LocalLinkAddress) {
+			optsSerializer = header.NDPOptionsSerializer{
+				header.NDPSourceLinkLayerAddressOption(r.LocalLinkAddress),
+			}
+		}
+		payloadSize := header.ICMPv6HeaderSize + header.NDPRSMinimumSize + int(optsSerializer.Length())
+		hdr := buffer.NewPrependable(int(r.MaxHeaderLength()) + payloadSize)
+		pkt := header.ICMPv6(hdr.Prepend(payloadSize))
+		pkt.SetType(header.ICMPv6RouterSolicit)
+		rs := header.NDPRouterSolicit(pkt.NDPPayload())
+		rs.Options().Serialize(optsSerializer)
+		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, buffer.VectorisedView{}))
+
+		sent := r.Stats().ICMP.V6PacketsSent
+		if err := r.WritePacket(nil,
+			NetworkHeaderParams{
+				Protocol: header.ICMPv6ProtocolNumber,
+				TTL:      header.NDPHopLimit,
+				TOS:      DefaultTOS,
+			}, &PacketBuffer{Header: hdr},
+		); err != nil {
+			sent.Dropped.Increment()
+			log.Printf("startSolicitingRouters: error writing NDP router solicit message on NIC(%d); err = %s", ndp.nic.ID(), err)
+			// Don't send any more messages if we had an error.
+			remaining = 0
+		} else {
+			sent.RouterSolicit.Increment()
+			remaining--
+		}
+
+		ndp.nic.mu.Lock()
+		switch {
+		case done:
+			// stopSolicitingRouters already cleared the solicitation state. The fields
+			// may since belong to a newer startSolicitingRouters call, so leave them
+			// untouched.
+		case remaining == 0:
+			ndp.rtrSolicit.timer = nil
+			ndp.rtrSolicit.done = nil
+		case ndp.rtrSolicit.timer != nil:
+			ndp.rtrSolicit.timer.Reset(ndp.configs.RtrSolicitationInterval)
+		}
+		ndp.nic.mu.Unlock()
+	})
+
+}
+
+// stopSolicitingRouters cancels any in-progress router solicitation. It is a
+// no-op when routers are not currently being solicited.
+//
+// The NIC ndp belongs to MUST be locked.
+func (ndp *ndpState) stopSolicitingRouters() {
+	timer := ndp.rtrSolicit.timer
+	if timer == nil {
+		return
+	}
+
+	*ndp.rtrSolicit.done = true
+	timer.Stop()
+	ndp.rtrSolicit.timer = nil
+	ndp.rtrSolicit.done = nil
+}
+
+// initializeTempAddrState initializes ndp.temporaryIIDHistory and, when
+// MaxDesyncFactor is non-zero, picks a random temporary address desync factor.
+func (ndp *ndpState) initializeTempAddrState() {
+	header.InitialTempIID(ndp.temporaryIIDHistory[:], ndp.nic.stack.tempIIDSeed, ndp.nic.ID())
+
+	if limit := int64(MaxDesyncFactor); limit != 0 {
+		ndp.temporaryAddressDesyncFactor = time.Duration(rand.Int63n(limit))
+	}
+}
diff --git a/pkg/tcpip/stack/ndp_test.go b/pkg/tcpip/stack/ndp_test.go
new file mode 100644
index 000000000..6f86abc98
--- /dev/null
+++ b/pkg/tcpip/stack/ndp_test.go
@@ -0,0 +1,5363 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack_test
+
+import (
+	"context"
+	"encoding/binary"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	addr1     = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+	addr2     = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+	addr3     = tcpip.Address("\x0a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03")
+	linkAddr1 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x06")
+	linkAddr2 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x07")
+	linkAddr3 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x08")
+	linkAddr4 = tcpip.LinkAddress("\x02\x02\x03\x04\x05\x09")
+
+	// Extra time to use when waiting for an async event to occur.
+	defaultAsyncPositiveEventTimeout = 10 * time.Second
+
+	// Extra time to use when waiting for an async event to not occur.
+	//
+	// Since a negative check is used to make sure an event did not happen, it is
+	// okay to use a smaller timeout compared to the positive case since execution
+	// stall in regards to the monotonic clock will not affect the expected
+	// outcome.
+	defaultAsyncNegativeEventTimeout = time.Second
+)
+
+var (
+	llAddr1 = header.LinkLocalAddr(linkAddr1)
+	llAddr2 = header.LinkLocalAddr(linkAddr2)
+	llAddr3 = header.LinkLocalAddr(linkAddr3)
+	llAddr4 = header.LinkLocalAddr(linkAddr4)
+	dstAddr = tcpip.FullAddress{
+		Addr: "\x0a\x0b\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01",
+		Port: 25,
+	}
+)
+
+func addrForSubnet(subnet tcpip.Subnet, linkAddr tcpip.LinkAddress) tcpip.AddressWithPrefix {
+	if !header.IsValidUnicastEthernetAddress(linkAddr) {
+		return tcpip.AddressWithPrefix{}
+	}
+
+	buf := []byte(subnet.ID())
+	iid := buf[header.IIDOffsetInIPv6Address:]
+	// Write linkAddr's modified EUI-64 into the IID part of the subnet ID copy.
+	header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr, iid)
+
+	return tcpip.AddressWithPrefix{Address: tcpip.Address(buf), PrefixLen: 64}
+}
+
+// prefixSubnetAddr builds a /64 prefix whose 8th byte is 8+offset and returns
+// it together with its tcpip.Subnet and the address addrForSubnet derives for
+// linkAddr in that subnet (zero value if linkAddr is not valid unicast).
+func prefixSubnetAddr(offset uint8, linkAddr tcpip.LinkAddress) (tcpip.AddressWithPrefix, tcpip.Subnet, tcpip.AddressWithPrefix) {
+	raw := []byte{1, 2, 3, 4, 5, 6, 7, 8 + offset, 0, 0, 0, 0, 0, 0, 0, 0}
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(raw),
+		PrefixLen: 64,
+	}
+
+	subnet := prefix.Subnet()
+	addr := addrForSubnet(subnet, linkAddr)
+	return prefix, subnet, addr
+}
+
+// ndpDADEvent is a set of parameters that was passed to
+// ndpDispatcher.OnDuplicateAddressDetectionStatus.
+type ndpDADEvent struct {
+	nicID    tcpip.NICID
+	addr     tcpip.Address
+	resolved bool
+	err      *tcpip.Error
+}
+
+type ndpRouterEvent struct {
+	nicID tcpip.NICID
+	addr  tcpip.Address
+	// true if router was discovered, false if invalidated.
+	discovered bool
+}
+
+type ndpPrefixEvent struct {
+	nicID  tcpip.NICID
+	prefix tcpip.Subnet
+	// true if prefix was discovered, false if invalidated.
+	discovered bool
+}
+
+type ndpAutoGenAddrEventType int
+
+const (
+	newAddr ndpAutoGenAddrEventType = iota
+	deprecatedAddr
+	invalidatedAddr
+)
+
+type ndpAutoGenAddrEvent struct {
+	nicID     tcpip.NICID
+	addr      tcpip.AddressWithPrefix
+	eventType ndpAutoGenAddrEventType
+}
+
+type ndpRDNSS struct {
+	addrs    []tcpip.Address
+	lifetime time.Duration
+}
+
+type ndpRDNSSEvent struct {
+	nicID tcpip.NICID
+	rdnss ndpRDNSS
+}
+
+type ndpDNSSLEvent struct {
+	nicID       tcpip.NICID
+	domainNames []string
+	lifetime    time.Duration
+}
+
+type ndpDHCPv6Event struct {
+	nicID         tcpip.NICID
+	configuration stack.DHCPv6ConfigurationFromNDPRA
+}
+
+var _ stack.NDPDispatcher = (*ndpDispatcher)(nil)
+
+// ndpDispatcher implements NDPDispatcher so tests can know when various NDP
+// related events happen for test purposes.
+type ndpDispatcher struct {
+	dadC                 chan ndpDADEvent
+	routerC              chan ndpRouterEvent
+	rememberRouter       bool
+	prefixC              chan ndpPrefixEvent
+	rememberPrefix       bool
+	autoGenAddrC         chan ndpAutoGenAddrEvent
+	rdnssC               chan ndpRDNSSEvent
+	dnsslC               chan ndpDNSSLEvent
+	routeTable           []tcpip.Route
+	dhcpv6ConfigurationC chan ndpDHCPv6Event
+}
+
+// OnDuplicateAddressDetectionStatus implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnDuplicateAddressDetectionStatus(nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) {
+	if c := n.dadC; c != nil {
+		c <- ndpDADEvent{
+			nicID:    nicID,
+			addr:     addr,
+			resolved: resolved,
+			err:      err,
+		}
+	}
+}
+
+// OnDefaultRouterDiscovered implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnDefaultRouterDiscovered(nicID tcpip.NICID, addr tcpip.Address) bool {
+	if c := n.routerC; c != nil {
+		c <- ndpRouterEvent{
+			nicID:      nicID,
+			addr:       addr,
+			discovered: true,
+		}
+	}
+
+	return n.rememberRouter
+}
+
+// OnDefaultRouterInvalidated implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnDefaultRouterInvalidated(nicID tcpip.NICID, addr tcpip.Address) {
+	if c := n.routerC; c != nil {
+		c <- ndpRouterEvent{
+			nicID:      nicID,
+			addr:       addr,
+			discovered: false,
+		}
+	}
+}
+
+// OnOnLinkPrefixDiscovered implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnOnLinkPrefixDiscovered(nicID tcpip.NICID, prefix tcpip.Subnet) bool {
+	if c := n.prefixC; c != nil {
+		c <- ndpPrefixEvent{
+			nicID:      nicID,
+			prefix:     prefix,
+			discovered: true,
+		}
+	}
+
+	return n.rememberPrefix
+}
+
+// OnOnLinkPrefixInvalidated implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnOnLinkPrefixInvalidated(nicID tcpip.NICID, prefix tcpip.Subnet) {
+	if c := n.prefixC; c != nil {
+		c <- ndpPrefixEvent{
+			nicID:      nicID,
+			prefix:     prefix,
+			discovered: false,
+		}
+	}
+}
+
+func (n *ndpDispatcher) OnAutoGenAddress(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) bool {
+	if c := n.autoGenAddrC; c != nil {
+		c <- ndpAutoGenAddrEvent{
+			nicID:     nicID,
+			addr:      addr,
+			eventType: newAddr,
+		}
+	}
+	return true
+}
+
+func (n *ndpDispatcher) OnAutoGenAddressDeprecated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
+	if c := n.autoGenAddrC; c != nil {
+		c <- ndpAutoGenAddrEvent{
+			nicID:     nicID,
+			addr:      addr,
+			eventType: deprecatedAddr,
+		}
+	}
+}
+
+func (n *ndpDispatcher) OnAutoGenAddressInvalidated(nicID tcpip.NICID, addr tcpip.AddressWithPrefix) {
+	if c := n.autoGenAddrC; c != nil {
+		c <- ndpAutoGenAddrEvent{
+			nicID:     nicID,
+			addr:      addr,
+			eventType: invalidatedAddr,
+		}
+	}
+}
+
+// OnRecursiveDNSServerOption implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnRecursiveDNSServerOption(nicID tcpip.NICID, addrs []tcpip.Address, lifetime time.Duration) {
+	if c := n.rdnssC; c != nil {
+		c <- ndpRDNSSEvent{
+			nicID: nicID,
+			rdnss: ndpRDNSS{
+				addrs:    addrs,
+				lifetime: lifetime,
+			},
+		}
+	}
+}
+
+// OnDNSSearchListOption implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnDNSSearchListOption(nicID tcpip.NICID, domainNames []string, lifetime time.Duration) {
+	if c := n.dnsslC; c != nil {
+		c <- ndpDNSSLEvent{
+			nicID:       nicID,
+			domainNames: domainNames,
+			lifetime:    lifetime,
+		}
+	}
+}
+
+// OnDHCPv6Configuration implements stack.NDPDispatcher.
+func (n *ndpDispatcher) OnDHCPv6Configuration(nicID tcpip.NICID, configuration stack.DHCPv6ConfigurationFromNDPRA) {
+	if c := n.dhcpv6ConfigurationC; c != nil {
+		c <- ndpDHCPv6Event{
+			nicID:         nicID,
+			configuration: configuration,
+		}
+	}
+}
+
+// channelLinkWithHeaderLength is a channel.Endpoint with a configurable
+// header length.
+type channelLinkWithHeaderLength struct {
+	*channel.Endpoint
+	headerLength uint16
+}
+
+func (l *channelLinkWithHeaderLength) MaxHeaderLength() uint16 {
+	return l.headerLength
+}
+
+// checkDADEvent returns a diff (empty on match) between e and the DAD event
+// expected for addr on the NIC with ID nicID with the given resolved and err.
+func checkDADEvent(e ndpDADEvent, nicID tcpip.NICID, addr tcpip.Address, resolved bool, err *tcpip.Error) string {
+	return cmp.Diff(ndpDADEvent{nicID: nicID, addr: addr, resolved: resolved, err: err}, e, cmp.AllowUnexported(e))
+}
+
+// TestDADDisabled checks that, with DAD left disabled (the default for an
+// empty stack.Options), an added address is reported resolved immediately.
+func TestDADDisabled(t *testing.T) {
+	const nicID = 1
+	disp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent, 1),
+	}
+	stackOpts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPDisp:          &disp,
+	}
+
+	linkEP := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stackOpts)
+	if err := s.CreateNIC(nicID, linkEP); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+	}
+
+	// DAD was never run, so the resolution event must already be queued on
+	// the buffered channel by the time AddAddress returns.
+	select {
+	case ev := <-disp.dadC:
+		if diff := checkDADEvent(ev, nicID, addr1, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected DAD event")
+	}
+	mainAddr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("stack.GetMainNICAddress(%d, %d) err = %s", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if mainAddr.Address != addr1 {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, mainAddr, addr1)
+	}
+
+	// No NDP NS messages may have been sent since DAD did not run.
+	if sentNS := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); sentNS != 0 {
+		t.Fatalf("got NeighborSolicit = %d, want = 0", sentNS)
+	}
+}
+
+// TestDADResolve tests that an address successfully resolves after performing
+// DAD for various values of DupAddrDetectTransmits and RetransmitTimer.
+// Included in the subtests is a test to make sure that an invalid
+// RetransmitTimer (<1ms) values get fixed to the default RetransmitTimer of 1s.
+// This tests also validates the NDP NS packet that is transmitted.
+func TestDADResolve(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name                    string
+		linkHeaderLen           uint16
+		dupAddrDetectTransmits  uint8
+		retransTimer            time.Duration
+		expectedRetransmitTimer time.Duration
+	}{
+		{
+			name:                    "1:1s:1s",
+			dupAddrDetectTransmits:  1,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "2:1s:1s",
+			linkHeaderLen:           1,
+			dupAddrDetectTransmits:  2,
+			retransTimer:            time.Second,
+			expectedRetransmitTimer: time.Second,
+		},
+		{
+			name:                    "1:2s:2s",
+			linkHeaderLen:           2,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            2 * time.Second,
+			expectedRetransmitTimer: 2 * time.Second,
+		},
+		// 0s is an invalid RetransmitTimer timer and will be fixed to
+		// the default RetransmitTimer value of 1s.
+		{
+			name:                    "1:0s:1s",
+			linkHeaderLen:           3,
+			dupAddrDetectTransmits:  1,
+			retransTimer:            0,
+			expectedRetransmitTimer: time.Second,
+		},
+	}
+
+	for _, test := range tests {
+		test := test
+
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent),
+			}
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
+			}
+			opts.NDPConfigs.RetransmitTimer = test.retransTimer
+			opts.NDPConfigs.DupAddrDetectTransmits = test.dupAddrDetectTransmits
+
+			e := channelLinkWithHeaderLength{
+				Endpoint:     channel.New(int(test.dupAddrDetectTransmits), 1280, linkAddr1),
+				headerLength: test.linkHeaderLen,
+			}
+			e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+			s := stack.New(opts)
+			if err := s.CreateNIC(nicID, &e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			// We add a default route so the call to FindRoute below will succeed
+			// once we have an assigned address.
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv6EmptySubnet,
+				Gateway:     addr3,
+				NIC:         nicID,
+			}})
+
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
+
+			// Address should not be considered bound to the NIC yet (DAD ongoing).
+			if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			} else if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			// Make sure the address does not resolve before the resolution time has
+			// passed.
+			time.Sleep(test.expectedRetransmitTimer*time.Duration(test.dupAddrDetectTransmits) - defaultAsyncNegativeEventTimeout)
+			if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+				t.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			} else if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
+			// Should not get a route even if we specify the local address as the
+			// tentative address.
+			{
+				r, err := s.FindRoute(nicID, "", addr2, header.IPv6ProtocolNumber, false)
+				if err != tcpip.ErrNoRoute {
+					t.Errorf("got FindRoute(%d, '', %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute)
+				}
+				r.Release()
+			}
+			{
+				r, err := s.FindRoute(nicID, addr1, addr2, header.IPv6ProtocolNumber, false)
+				if err != tcpip.ErrNoRoute {
+					t.Errorf("got FindRoute(%d, %s, %s, %d, false) = (%+v, %v), want = (_, %s)", nicID, addr1, addr2, header.IPv6ProtocolNumber, r, err, tcpip.ErrNoRoute)
+				}
+				r.Release()
+			}
+
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Wait for DAD to resolve.
+			select {
+			case <-time.After(defaultAsyncPositiveEventTimeout):
+				t.Fatal("timed out waiting for DAD resolution")
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr1, true, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			}
+			if addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+				t.Errorf("got stack.GetMainNICAddress(%d, %d) = (_, %s), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			} else if addr.Address != addr1 {
+				t.Errorf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, addr, addr1)
+			}
+			// Should get a route using the address now that it is resolved.
+			{
+				r, err := s.FindRoute(nicID, "", addr2, header.IPv6ProtocolNumber, false)
+				if err != nil {
+					t.Errorf("got FindRoute(%d, '', %s, %d, false): %s", nicID, addr2, header.IPv6ProtocolNumber, err)
+				} else if r.LocalAddress != addr1 {
+					t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, addr1)
+				}
+				r.Release()
+			}
+			{
+				r, err := s.FindRoute(nicID, addr1, addr2, header.IPv6ProtocolNumber, false)
+				if err != nil {
+					t.Errorf("got FindRoute(%d, %s, %s, %d, false): %s", nicID, addr1, addr2, header.IPv6ProtocolNumber, err)
+				} else if r.LocalAddress != addr1 {
+					t.Errorf("got r.LocalAddress = %s, want = %s", r.LocalAddress, addr1)
+				}
+				r.Release()
+			}
+
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			// Should not have sent any more NS messages.
+			if got := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); got != uint64(test.dupAddrDetectTransmits) {
+				t.Fatalf("got NeighborSolicit = %d, want = %d", got, test.dupAddrDetectTransmits)
+			}
+
+			// Validate the sent Neighbor Solicitation messages.
+			for i := uint8(0); i < test.dupAddrDetectTransmits; i++ {
+				p, _ := e.ReadContext(context.Background())
+
+				// Make sure its an IPv6 packet.
+				if p.Proto != header.IPv6ProtocolNumber {
+					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+				}
+
+				// Make sure the right remote link address is used.
+				snmc := header.SolicitedNodeAddr(addr1)
+				if want := header.EthernetAddressFromMulticastIPv6Address(snmc); p.Route.RemoteLinkAddress != want {
+					t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
+				}
+
+				// Check NDP NS packet.
+				//
+				// As per RFC 4861 section 4.3, a possible option is the Source Link
+				// Layer option, but this option MUST NOT be included when the source
+				// address of the packet is the unspecified address.
+				checker.IPv6(t, p.Pkt.Header.View(),
+					checker.SrcAddr(header.IPv6Any),
+					checker.DstAddr(snmc),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPNS(
+						checker.NDPNSTargetAddress(addr1),
+						checker.NDPNSOptions(nil),
+					))
+
+				if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+					t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+				}
+			}
+		})
+	}
+}
+
+// TestDADFail tests to make sure that the DAD process fails if another node is
+// detected to be performing DAD on the same address (receive an NS message from
+// a node doing DAD for the same address), or if another node is detected to own
+// the address already (receive an NA message for the tentative address).
+func TestDADFail(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name string
+		// makeBuf returns the serialized IPv6 packet, targeting tgt, that is
+		// injected into the NIC while DAD is in progress.
+		makeBuf func(tgt tcpip.Address) buffer.Prependable
+		// getStat returns the received-packet counter that the injected packet
+		// is expected to increment.
+		getStat func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter
+	}{
+		{
+			name: "RxSolicit",
+			makeBuf: func(tgt tcpip.Address) buffer.Prependable {
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + header.ICMPv6NeighborSolicitMinimumSize)
+				pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6NeighborSolicitMinimumSize))
+				pkt.SetType(header.ICMPv6NeighborSolicit)
+				ns := header.NDPNeighborSolicit(pkt.NDPPayload())
+				ns.SetTargetAddress(tgt)
+				// The NS is sent from the unspecified address to the target's
+				// solicited-node multicast address.
+				snmc := header.SolicitedNodeAddr(tgt)
+				pkt.SetChecksum(header.ICMPv6Checksum(pkt, header.IPv6Any, snmc, buffer.VectorisedView{}))
+				payloadLength := hdr.UsedLength()
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(payloadLength),
+					NextHeader:    uint8(icmp.ProtocolNumber6),
+					HopLimit:      255,
+					SrcAddr:       header.IPv6Any,
+					DstAddr:       snmc,
+				})
+
+				return hdr
+			},
+			getStat: func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return s.NeighborSolicit
+			},
+		},
+		{
+			name: "RxAdvert",
+			makeBuf: func(tgt tcpip.Address) buffer.Prependable {
+				naSize := header.ICMPv6NeighborAdvertMinimumSize + header.NDPLinkLayerAddressSize
+				hdr := buffer.NewPrependable(header.IPv6MinimumSize + naSize)
+				pkt := header.ICMPv6(hdr.Prepend(naSize))
+				pkt.SetType(header.ICMPv6NeighborAdvert)
+				na := header.NDPNeighborAdvert(pkt.NDPPayload())
+				na.SetSolicitedFlag(true)
+				na.SetOverrideFlag(true)
+				na.SetTargetAddress(tgt)
+				na.Options().Serialize(header.NDPOptionsSerializer{
+					header.NDPTargetLinkLayerAddressOption(linkAddr1),
+				})
+				// The NA is sent from the target address itself to the
+				// all-nodes multicast address.
+				pkt.SetChecksum(header.ICMPv6Checksum(pkt, tgt, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+				payloadLength := hdr.UsedLength()
+				ip := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+				ip.Encode(&header.IPv6Fields{
+					PayloadLength: uint16(payloadLength),
+					NextHeader:    uint8(icmp.ProtocolNumber6),
+					HopLimit:      255,
+					SrcAddr:       tgt,
+					DstAddr:       header.IPv6AllNodesMulticastAddress,
+				})
+
+				return hdr
+			},
+			getStat: func(s tcpip.ICMPv6ReceivedPacketStats) *tcpip.StatCounter {
+				return s.NeighborAdvert
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent, 1),
+			}
+			// Set the retransmit timer on ndpConfigs itself (rather than only
+			// on the copy stored in the stack options) so that the timeout
+			// computed below is derived from the configuration the stack
+			// actually uses.
+			ndpConfigs := stack.DefaultNDPConfigurations()
+			ndpConfigs.RetransmitTimer = 2 * time.Second
+			opts := stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs:       ndpConfigs,
+				NDPDisp:          &ndpDisp,
+			}
+
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(opts)
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
+
+			// Address should not be considered bound to the NIC yet
+			// (DAD ongoing).
+			addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			// Receive a packet to simulate multiple nodes owning or
+			// attempting to own the same address.
+			hdr := test.makeBuf(addr1)
+			e.InjectInbound(header.IPv6ProtocolNumber, &stack.PacketBuffer{
+				Data: hdr.View().ToVectorisedView(),
+			})
+
+			// The injected packet must have been counted exactly once.
+			stat := test.getStat(s.Stats().ICMP.V6PacketsReceived)
+			if got := stat.Value(); got != 1 {
+				t.Fatalf("got stat = %d, want = 1", got)
+			}
+
+			// Wait for DAD to fail and make sure the address did
+			// not get resolved.
+			select {
+			case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+				// If we don't get a failure event after the
+				// expected resolution time + extra 1s buffer,
+				// something is wrong.
+				t.Fatal("timed out waiting for DAD failure")
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr1, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			}
+			addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			// Attempting to add the address again should not fail if the address's
+			// state was cleaned up when DAD failed.
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
+		})
+	}
+}
+
+// TestDADStop tests that an in-progress DAD attempt is stopped, and a DAD
+// failure event is dispatched, when the tentative address is removed or when
+// the NIC it was added to is disabled or removed.
+func TestDADStop(t *testing.T) {
+	const nicID = 1
+
+	type stopCase struct {
+		name string
+		// stopFn performs the action that is expected to interrupt DAD.
+		stopFn func(t *testing.T, s *stack.Stack)
+		// skipFinalAddrCheck is set when the NIC can no longer be queried
+		// once stopFn has run.
+		skipFinalAddrCheck bool
+	}
+
+	cases := []stopCase{
+		{
+			// DAD should stop when the address is removed.
+			name: "Remove address",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.RemoveAddress(nicID, addr1); err != nil {
+					t.Fatalf("RemoveAddress(%d, %s): %s", nicID, addr1, err)
+				}
+			},
+		},
+		{
+			// DAD should stop when the NIC is disabled.
+			name: "Disable NIC",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
+		{
+			// DAD should stop when the NIC is removed. Its addresses cannot
+			// be inspected afterwards, so the final address check is skipped.
+			name: "Remove NIC",
+			stopFn: func(t *testing.T, s *stack.Stack) {
+				if err := s.RemoveNIC(nicID); err != nil {
+					t.Fatalf("RemoveNIC(%d): %s", nicID, err)
+				}
+			},
+			skipFinalAddrCheck: true,
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			disp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent, 1),
+			}
+			cfg := stack.NDPConfigurations{
+				RetransmitTimer:        time.Second,
+				DupAddrDetectTransmits: 2,
+			}
+			// Upper bound on how long a DAD event may take to arrive: the full
+			// expected resolution time plus an extra 1s of slack.
+			dadTimeout := time.Duration(cfg.DupAddrDetectTransmits)*cfg.RetransmitTimer + time.Second
+
+			ep := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &disp,
+				NDPConfigs:       cfg,
+			})
+			if err := s.CreateNIC(nicID, ep); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+			}
+
+			if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, header.IPv6ProtocolNumber, addr1, err)
+			}
+
+			// While DAD is still running the NIC must not report a main
+			// address.
+			var noAddr tcpip.AddressWithPrefix
+			got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+			}
+			if got != noAddr {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, noAddr)
+			}
+
+			tc.stopFn(t, s)
+
+			// Interrupting DAD is expected to produce a failure event.
+			select {
+			case ev := <-disp.dadC:
+				if diff := checkDADEvent(ev, nicID, addr1, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			case <-time.After(dadTimeout):
+				t.Fatal("timed out waiting for DAD failure")
+			}
+
+			if !tc.skipFinalAddrCheck {
+				// The address must still not be assigned after DAD stopped.
+				got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+				if err != nil {
+					t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+				}
+				if got != noAddr {
+					t.Errorf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, noAddr)
+				}
+			}
+
+			// At most one NS message may have gone out before DAD stopped.
+			if sent := s.Stats().ICMP.V6PacketsSent.NeighborSolicit.Value(); sent > 1 {
+				t.Errorf("got NeighborSolicit = %d, want <= 1", sent)
+			}
+		})
+	}
+}
+
+// TestSetNDPConfigurationFailsForBadNICID verifies that updating the NDP
+// configurations of a NIC ID the stack does not know about is rejected with
+// tcpip.ErrUnknownNICID.
+func TestSetNDPConfigurationFailsForBadNICID(t *testing.T) {
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	}
+	s := stack.New(opts)
+
+	// No NIC has been created on the stack, so NIC ID 1 does not exist.
+	err := s.SetNDPConfigurations(1, stack.NDPConfigurations{})
+	if err != tcpip.ErrUnknownNICID {
+		t.Fatalf("got s.SetNDPConfigurations = %v, want = %s", err, tcpip.ErrUnknownNICID)
+	}
+}
+
+// TestSetNDPConfigurations tests that we can update and use per-interface NDP
+// configurations without affecting the default NDP configurations or other
+// interfaces' configurations.
+//
+// NIC(1) has its NDP configurations updated to perform DAD, while NIC(2)
+// (created before the update) and NIC(3) (created after the update) keep the
+// stack's default configurations.
+func TestSetNDPConfigurations(t *testing.T) {
+	const nicID1 = 1
+	const nicID2 = 2
+	const nicID3 = 3
+
+	tests := []struct {
+		name                   string
+		dupAddrDetectTransmits uint8
+		// retransmitTimer is the value passed to SetNDPConfigurations.
+		retransmitTimer time.Duration
+		// expectedRetransmitTimer is the value the test expects DAD to
+		// actually use when computing the resolution time.
+		expectedRetransmitTimer time.Duration
+	}{
+		{
+			"OK",
+			1,
+			time.Second,
+			time.Second,
+		},
+		{
+			"Invalid Retransmit Timer",
+			1,
+			0,
+			time.Second,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				dadC: make(chan ndpDADEvent, 1),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPDisp:          &ndpDisp,
+			})
+
+			// expectDADEvent asserts that a successful DAD event for addr on
+			// nicID is already waiting on the dispatcher's channel; it does
+			// not block.
+			expectDADEvent := func(nicID tcpip.NICID, addr tcpip.Address) {
+				// Report failures at the call site rather than inside this
+				// closure.
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.dadC:
+					if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" {
+						t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatalf("expected DAD event for %s", addr)
+				}
+			}
+
+			// This NIC(1)'s NDP configurations will be updated to
+			// be different from the default.
+			if err := s.CreateNIC(nicID1, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+			}
+
+			// Created before updating NIC(1)'s NDP configurations
+			// but updating NIC(1)'s NDP configurations should not
+			// affect other existing NICs.
+			if err := s.CreateNIC(nicID2, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+			}
+
+			// Update the NDP configurations on NIC(1) to use DAD.
+			configs := stack.NDPConfigurations{
+				DupAddrDetectTransmits: test.dupAddrDetectTransmits,
+				RetransmitTimer:        test.retransmitTimer,
+			}
+			if err := s.SetNDPConfigurations(nicID1, configs); err != nil {
+				t.Fatalf("got SetNDPConfigurations(%d, _) = %s", nicID1, err)
+			}
+
+			// Created after updating NIC(1)'s NDP configurations
+			// but the stack's default NDP configurations should not
+			// have been updated.
+			if err := s.CreateNIC(nicID3, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID3, err)
+			}
+
+			// Add addresses for each NIC. NIC(2) and NIC(3) are expected to
+			// report their DAD events immediately.
+			if err := s.AddAddress(nicID1, header.IPv6ProtocolNumber, addr1); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID1, header.IPv6ProtocolNumber, addr1, err)
+			}
+			if err := s.AddAddress(nicID2, header.IPv6ProtocolNumber, addr2); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID2, header.IPv6ProtocolNumber, addr2, err)
+			}
+			expectDADEvent(nicID2, addr2)
+			if err := s.AddAddress(nicID3, header.IPv6ProtocolNumber, addr3); err != nil {
+				t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID3, header.IPv6ProtocolNumber, addr3, err)
+			}
+			expectDADEvent(nicID3, addr3)
+
+			// Address should not be considered bound to NIC(1) yet
+			// (DAD ongoing).
+			addr, err := s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			// Should get the address on NIC(2) and NIC(3)
+			// immediately since we should not have performed DAD on
+			// it as the stack was configured to not do DAD by
+			// default and we only updated the NDP configurations on
+			// NIC(1).
+			addr, err = s.GetMainNICAddress(nicID2, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID2, header.IPv6ProtocolNumber, err)
+			}
+			if addr.Address != addr2 {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID2, header.IPv6ProtocolNumber, addr, addr2)
+			}
+			addr, err = s.GetMainNICAddress(nicID3, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID3, header.IPv6ProtocolNumber, err)
+			}
+			if addr.Address != addr3 {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID3, header.IPv6ProtocolNumber, addr, addr3)
+			}
+
+			// Sleep until shortly (500ms) before the expected resolution
+			// time to make sure the address didn't resolve on NIC(1) yet.
+			const delta = 500 * time.Millisecond
+			time.Sleep(time.Duration(test.dupAddrDetectTransmits)*test.expectedRetransmitTimer - delta)
+			addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
+			}
+			if want := (tcpip.AddressWithPrefix{}); addr != want {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID1, header.IPv6ProtocolNumber, addr, want)
+			}
+
+			// Wait for DAD to resolve.
+			select {
+			case <-time.After(2 * delta):
+				// We should get a resolution event after 500ms
+				// (delta) since we wait for 500ms less than the
+				// expected resolution time above to make sure
+				// that the address did not yet resolve. Waiting
+				// for 1s (2x delta) without a resolution event
+				// means something is wrong.
+				t.Fatal("timed out waiting for DAD resolution")
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID1, addr1, true, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			}
+			addr, err = s.GetMainNICAddress(nicID1, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID1, header.IPv6ProtocolNumber, err)
+			}
+			if addr.Address != addr1 {
+				t.Fatalf("got stack.GetMainNICAddress(%d, %d) = %s, want = %s", nicID1, header.IPv6ProtocolNumber, addr, addr1)
+			}
+		})
+	}
+}
+
+// raBufWithOptsAndDHCPv6 returns a valid NDP Router Advertisement with options
+// and DHCPv6 configurations specified.
+func raBufWithOptsAndDHCPv6(ip tcpip.Address, rl uint16, managedAddress, otherConfigurations bool, optSer header.NDPOptionsSerializer) *stack.PacketBuffer {
+	icmpSize := header.ICMPv6HeaderSize + header.NDPRAMinimumSize + int(optSer.Length())
+	hdr := buffer.NewPrependable(header.IPv6MinimumSize + icmpSize)
+	pkt := header.ICMPv6(hdr.Prepend(icmpSize))
+	pkt.SetType(header.ICMPv6RouterAdvert)
+	pkt.SetCode(0)
+	raPayload := pkt.NDPPayload()
+	ra := header.NDPRouterAdvert(raPayload)
+	// Populate the Router Lifetime.
+	binary.BigEndian.PutUint16(raPayload[2:], rl)
+	// Populate the Managed Address flag field.
+	if managedAddress {
+		// The Managed Addresses flag field is the 7th bit of byte #1 (0-indexing)
+		// of the RA payload.
+		raPayload[1] |= (1 << 7)
+	}
+	// Populate the Other Configurations flag field.
+	if otherConfigurations {
+		// The Other Configurations flag field is the 6th bit of byte #1
+		// (0-indexing) of the RA payload.
+		raPayload[1] |= (1 << 6)
+	}
+	opts := ra.Options()
+	opts.Serialize(optSer)
+	pkt.SetChecksum(header.ICMPv6Checksum(pkt, ip, header.IPv6AllNodesMulticastAddress, buffer.VectorisedView{}))
+	payloadLength := hdr.UsedLength()
+	iph := header.IPv6(hdr.Prepend(header.IPv6MinimumSize))
+	iph.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(payloadLength),
+		NextHeader:    uint8(icmp.ProtocolNumber6),
+		HopLimit:      header.NDPHopLimit,
+		SrcAddr:       ip,
+		DstAddr:       header.IPv6AllNodesMulticastAddress,
+	})
+
+	return &stack.PacketBuffer{Data: hdr.View().ToVectorisedView()}
+}
+
+// raBufWithOpts returns a valid NDP Router Advertisement with options.
+//
+// Note, raBufWithOpts does not populate any of the RA fields other than the
+// Router Lifetime.
+func raBufWithOpts(ip tcpip.Address, rl uint16, optSer header.NDPOptionsSerializer) *stack.PacketBuffer {
+	return raBufWithOptsAndDHCPv6(ip, rl, false, false, optSer)
+}
+
+// raBufWithDHCPv6 returns a valid NDP Router Advertisement with DHCPv6 related
+// fields set.
+//
+// managedAddresses and otherConfigurations select which of the two DHCPv6
+// flags are set. The Router Lifetime is 0 and no options are included.
+//
+// Note, raBufWithDHCPv6 does not populate any of the RA fields other than the
+// DHCPv6 related ones.
+func raBufWithDHCPv6(ip tcpip.Address, managedAddresses, otherConfigurations bool) *stack.PacketBuffer {
+	return raBufWithOptsAndDHCPv6(ip, 0, managedAddresses, otherConfigurations, header.NDPOptionsSerializer{})
+}
+
+// raBuf returns a valid NDP Router Advertisement from ip without any options.
+//
+// Only the Router Lifetime (rl) field of the RA is populated.
+func raBuf(ip tcpip.Address, rl uint16) *stack.PacketBuffer {
+	noOpts := header.NDPOptionsSerializer{}
+	return raBufWithOpts(ip, rl, noOpts)
+}
+
+// raBufWithPI returns a valid NDP Router Advertisement with a single Prefix
+// Information option.
+//
+// Note, raBufWithPI does not populate any of the RA fields other than the
+// Router Lifetime.
+func raBufWithPI(ip tcpip.Address, rl uint16, prefix tcpip.AddressWithPrefix, onLink, auto bool, vl, pl uint32) *stack.PacketBuffer {
+	flags := uint8(0)
+	if onLink {
+		// The OnLink flag is the 7th bit in the flags byte.
+		flags |= 1 << 7
+	}
+	if auto {
+		// The Address Auto-Configuration flag is the 6th bit in the
+		// flags byte.
+		flags |= 1 << 6
+	}
+
+	// A valid header.NDPPrefixInformation must be 30 bytes.
+	buf := [30]byte{}
+	// The first byte in a header.NDPPrefixInformation is the Prefix Length
+	// field.
+	buf[0] = uint8(prefix.PrefixLen)
+	// The 2nd byte within a header.NDPPrefixInformation is the Flags field.
+	buf[1] = flags
+	// The Valid Lifetime field starts after the 2nd byte within a
+	// header.NDPPrefixInformation.
+	binary.BigEndian.PutUint32(buf[2:], vl)
+	// The Preferred Lifetime field starts after the 6th byte within a
+	// header.NDPPrefixInformation.
+	binary.BigEndian.PutUint32(buf[6:], pl)
+	// The Prefix Address field starts after the 14th byte within a
+	// header.NDPPrefixInformation.
+	copy(buf[14:], prefix.Address)
+	return raBufWithOpts(ip, rl, header.NDPOptionsSerializer{
+		header.NDPPrefixInformation(buf[:]),
+	})
+}
+
+// TestNoRouterDiscovery tests that router discovery will not be performed if
+// configured not to.
+func TestNoRouterDiscovery(t *testing.T) {
+	// Being configured to discover routers means handle and
+	// discover are set to true and forwarding is set to false.
+	// This tests all possible combinations of the configurations,
+	// except for the configuration where handle = true, discover =
+	// true and forwarding = false (the required configuration to do
+	// router discovery) - that will done in other tests.
+	for i := 0; i < 7; i++ {
+		handle := i&1 != 0
+		discover := i&2 != 0
+		forwarding := i&4 == 0
+
+		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverDefaultRouters(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				routerC: make(chan ndpRouterEvent, 1),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              handle,
+					DiscoverDefaultRouters: discover,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			s.SetForwarding(forwarding)
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Rx an RA with non-zero lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 1000))
+			select {
+			case <-ndpDisp.routerC:
+				t.Fatal("unexpectedly discovered a router when configured not to")
+			default:
+			}
+		})
+	}
+}
+
+// checkRouterEvent compares e against the router event expected for addr on
+// the NIC with ID 1 with the given discovered flag. It returns the cmp.Diff
+// output, which is empty when e matches.
+func checkRouterEvent(e ndpRouterEvent, addr tcpip.Address, discovered bool) string {
+	want := ndpRouterEvent{
+		nicID:      1,
+		addr:       addr,
+		discovered: discovered,
+	}
+	return cmp.Diff(want, e, cmp.AllowUnexported(e))
+}
+
+// TestRouterDiscoveryDispatcherNoRemember tests that a discovered router is
+// reported to the dispatcher but never invalidated later when the dispatcher
+// does not ask the stack to remember it.
+func TestRouterDiscoveryDispatcherNoRemember(t *testing.T) {
+	// rememberRouter is left at its zero value on this dispatcher.
+	disp := ndpDispatcher{
+		routerC: make(chan ndpRouterEvent, 1),
+	}
+	ep := channel.New(0, 1280, linkAddr1)
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: true,
+		},
+		NDPDisp: &disp,
+	}
+	s := stack.New(opts)
+
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// The RA's router should be reported as discovered right away.
+	const lifetimeSeconds = 1
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, lifetimeSeconds))
+	select {
+	case ev := <-disp.routerC:
+		if diff := checkRouterEvent(ev, llAddr2, true); diff != "" {
+			t.Errorf("router event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected router discovery event")
+	}
+
+	// Since the router should not have been remembered, no invalidation event
+	// may arrive, even after its advertised lifetime plus some buffer has
+	// elapsed.
+	quietPeriod := lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout
+	select {
+	case <-time.After(quietPeriod):
+	case <-disp.routerC:
+		t.Fatal("should not have received any router events")
+	}
+}
+
+// TestRouterDiscovery tests that routers advertised with a non-zero lifetime
+// are discovered, that lifetime updates for known routers take effect without
+// producing new events, and that routers are invalidated either by a
+// zero-lifetime RA or when their lifetime runs out.
+func TestRouterDiscovery(t *testing.T) {
+	disp := ndpDispatcher{
+		routerC:        make(chan ndpRouterEvent, 1),
+		rememberRouter: true,
+	}
+	ep := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: true,
+		},
+		NDPDisp: &disp,
+	})
+
+	// expectEvent requires a router event for addr to already be queued.
+	expectEvent := func(addr tcpip.Address, discovered bool) {
+		t.Helper()
+
+		select {
+		case ev := <-disp.routerC:
+			if diff := checkRouterEvent(ev, addr, discovered); diff != "" {
+				t.Errorf("router event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected router discovery event")
+		}
+	}
+
+	// expectNoEvent fails the test with msg if any router event is queued.
+	expectNoEvent := func(msg string) {
+		t.Helper()
+
+		select {
+		case <-disp.routerC:
+			t.Fatal(msg)
+		default:
+		}
+	}
+
+	// expectInvalidationWithin waits up to timeout for an invalidation event
+	// for addr.
+	expectInvalidationWithin := func(addr tcpip.Address, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case ev := <-disp.routerC:
+			if diff := checkRouterEvent(ev, addr, false); diff != "" {
+				t.Errorf("router event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for router discovery event")
+		}
+	}
+
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	const (
+		hugeLifetimeSeconds = 1000
+		l2LifetimeSeconds   = 2
+		l3LifetimeSeconds   = 6
+	)
+
+	// A zero-lifetime RA from an unknown router (lladdr2) must not result in
+	// a discovery.
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0))
+	expectNoEvent("unexpectedly discovered a router with 0 lifetime")
+
+	// lladdr2 advertises a huge lifetime and should be discovered.
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, hugeLifetimeSeconds))
+	expectEvent(llAddr2, true)
+
+	// A second router (lladdr3) advertises a non-zero lifetime and should be
+	// discovered as well.
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr3, l3LifetimeSeconds))
+	expectEvent(llAddr3, true)
+
+	// lladdr2 advertises again with a smaller lifetime; refreshing a known
+	// router is silent.
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, l2LifetimeSeconds))
+	expectNoEvent("Should not receive a router event when updating lifetimes for known routers")
+
+	// lladdr2 should now be invalidated according to its most recent
+	// (smaller) lifetime. Allow the lifetime plus an extra bit; no event
+	// within that window means something is wrong.
+	expectInvalidationWithin(llAddr2, l2LifetimeSeconds*time.Second+defaultAsyncPositiveEventTimeout)
+
+	// lladdr2 is rediscovered with a huge lifetime...
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, hugeLifetimeSeconds))
+	expectEvent(llAddr2, true)
+
+	// ...and a zero-lifetime RA from it should invalidate it immediately.
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(llAddr2, 0))
+	expectEvent(llAddr2, false)
+
+	// Finally wait for lladdr3's invalidation timer to fire. Allow its
+	// lifetime plus an extra bit; no event within that window means
+	// something is wrong.
+	expectInvalidationWithin(llAddr3, l3LifetimeSeconds*time.Second+defaultAsyncPositiveEventTimeout)
+}
+
+// TestRouterDiscoveryMaxRouters tests that discovery events are only produced
+// for the first stack.MaxDiscoveredDefaultRouters routers that send an RA, and
+// not for any router beyond that limit.
+func TestRouterDiscoveryMaxRouters(t *testing.T) {
+	disp := ndpDispatcher{
+		routerC:        make(chan ndpRouterEvent, 1),
+		rememberRouter: true,
+	}
+	ep := channel.New(0, 1280, linkAddr1)
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: true,
+		},
+		NDPDisp: &disp,
+	}
+	s := stack.New(opts)
+
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Send RAs from 2 more routers than the maximum. Each router gets a
+	// distinct link-local address derived from a link address whose last byte
+	// is the router's index.
+	for i := 1; i <= stack.MaxDiscoveredDefaultRouters+2; i++ {
+		routerLinkAddr := tcpip.LinkAddress([]byte{2, 2, 3, 4, 5, byte(i)})
+		routerAddr := header.LinkLocalAddr(routerLinkAddr)
+		withinLimit := i <= stack.MaxDiscoveredDefaultRouters
+
+		ep.InjectInbound(header.IPv6ProtocolNumber, raBuf(routerAddr, 5))
+
+		select {
+		case ev := <-disp.routerC:
+			if !withinLimit {
+				t.Fatal("should not have discovered a new router after we already discovered the max number of routers")
+			}
+			if diff := checkRouterEvent(ev, routerAddr, true); diff != "" {
+				t.Errorf("router event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			if withinLimit {
+				t.Fatal("expected router discovery event")
+			}
+		}
+	}
+}
+
+// TestNoPrefixDiscovery tests that prefix discovery will not be performed if
+// configured not to.
+func TestNoPrefixDiscovery(t *testing.T) {
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 64,
+	}
+
+	// Being configured to discover prefixes means handle and
+	// discover are set to true and forwarding is set to false.
+	// This tests all possible combinations of the configurations,
+	// except for the configuration where handle = true, discover =
+	// true and forwarding = false (the required configuration to do
+	// prefix discovery) - that will be done in other tests.
+	for i := 0; i < 7; i++ {
+		handle := i&1 != 0
+		discover := i&2 != 0
+		forwarding := i&4 == 0
+
+		t.Run(fmt.Sprintf("HandleRAs(%t), DiscoverOnLinkPrefixes(%t), Forwarding(%t)", handle, discover, forwarding), func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				prefixC: make(chan ndpPrefixEvent, 1),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              handle,
+					DiscoverOnLinkPrefixes: discover,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			s.SetForwarding(forwarding)
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Rx an RA with prefix with non-zero lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 10, 0))
+
+			select {
+			case <-ndpDisp.prefixC:
+				t.Fatal("unexpectedly discovered a prefix when configured not to")
+			default:
+			}
+		})
+	}
+}
+
+// checkPrefixEvent returns a diff between e and the expected event for prefix
+// on NIC 1 with the given discovered flag; an empty string means they match.
+func checkPrefixEvent(e ndpPrefixEvent, prefix tcpip.Subnet, discovered bool) string {
+	return cmp.Diff(ndpPrefixEvent{nicID: 1, prefix: prefix, discovered: discovered}, e, cmp.AllowUnexported(e))
+}
+
+// TestPrefixDiscoveryDispatcherNoRemember tests that the stack does not
+// remember a discovered on-link prefix when the dispatcher asks it not to.
+func TestPrefixDiscoveryDispatcherNoRemember(t *testing.T) {
+	prefix, subnet, _ := prefixSubnetAddr(0, "")
+
+	ndpDisp := ndpDispatcher{
+		prefixC: make(chan ndpPrefixEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: false,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Receive an RA with prefix that we should not remember.
+	const lifetimeSeconds = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, lifetimeSeconds, 0))
+	select {
+	case e := <-ndpDisp.prefixC:
+		if diff := checkPrefixEvent(e, subnet, true); diff != "" {
+			t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected prefix discovery event")
+	}
+
+	// Wait for the invalidation time plus some buffer to make sure we do
+	// not actually receive any invalidation events as we should not have
+	// remembered the prefix in the first place.
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("should not have received any prefix events")
+	case <-time.After(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout):
+	}
+}
+
+func TestPrefixDiscovery(t *testing.T) {
+	prefix1, subnet1, _ := prefixSubnetAddr(0, "")
+	prefix2, subnet2, _ := prefixSubnetAddr(1, "")
+	prefix3, subnet3, _ := prefixSubnetAddr(2, "")
+
+	ndpDisp := ndpDispatcher{
+		prefixC:        make(chan ndpPrefixEvent, 1),
+		rememberPrefix: true,
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.prefixC:
+			if diff := checkPrefixEvent(e, prefix, discovered); diff != "" {
+				t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected prefix discovery event")
+		}
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with zero valid lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly discovered a prefix with 0 lifetime")
+	default:
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with non-zero lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 100, 0))
+	expectPrefixEvent(subnet1, true)
+
+	// Receive an RA with prefix2 in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, 100, 0))
+	expectPrefixEvent(subnet2, true)
+
+	// Receive an RA with prefix3 in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 100, 0))
+	expectPrefixEvent(subnet3, true)
+
+	// Receive an RA with prefix1 in a PI with lifetime = 0.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, false, 0, 0))
+	expectPrefixEvent(subnet1, false)
+
+	// Receive an RA with prefix2 in a PI with lesser lifetime.
+	lifetime := uint32(2)
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, false, lifetime, 0))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly received prefix event when updating lifetime")
+	default:
+	}
+
+	// Wait for prefix2's most recent invalidation timer plus some buffer to
+	// expire.
+	select {
+	case e := <-ndpDisp.prefixC:
+		if diff := checkPrefixEvent(e, subnet2, false); diff != "" {
+			t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(time.Duration(lifetime)*time.Second + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for prefix discovery event")
+	}
+
+	// Receive RA to invalidate prefix3.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix3, true, false, 0, 0))
+	expectPrefixEvent(subnet3, false)
+}
+
+func TestPrefixDiscoveryWithInfiniteLifetime(t *testing.T) {
+	// Update the infinite lifetime value to a smaller value so we can test
+	// that when we receive a PI with such a lifetime value, we do not
+	// invalidate the prefix.
+	const testInfiniteLifetimeSeconds = 2
+	const testInfiniteLifetime = testInfiniteLifetimeSeconds * time.Second
+	saved := header.NDPInfiniteLifetime
+	header.NDPInfiniteLifetime = testInfiniteLifetime
+	defer func() {
+		header.NDPInfiniteLifetime = saved
+	}()
+
+	prefix := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address("\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x00"),
+		PrefixLen: 64,
+	}
+	subnet := prefix.Subnet()
+
+	ndpDisp := ndpDispatcher{
+		prefixC:        make(chan ndpPrefixEvent, 1),
+		rememberPrefix: true,
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	expectPrefixEvent := func(prefix tcpip.Subnet, discovered bool) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.prefixC:
+			if diff := checkPrefixEvent(e, prefix, discovered); diff != "" {
+				t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected prefix discovery event")
+		}
+	}
+
+	// Receive an RA with prefix in an NDP Prefix Information option (PI)
+	// with infinite valid lifetime which should not get invalidated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
+	expectPrefixEvent(subnet, true)
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
+	case <-time.After(testInfiniteLifetime + defaultAsyncNegativeEventTimeout):
+	}
+
+	// Receive an RA with finite lifetime.
+	// The prefix should get invalidated after 1s.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
+	select {
+	case e := <-ndpDisp.prefixC:
+		if diff := checkPrefixEvent(e, subnet, false); diff != "" {
+			t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(testInfiniteLifetime):
+		t.Fatal("timed out waiting for prefix discovery event")
+	}
+
+	// Receive an RA with finite lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds-1, 0))
+	expectPrefixEvent(subnet, true)
+
+	// Receive an RA with prefix with an infinite lifetime.
+	// The prefix should not be invalidated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds, 0))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
+	case <-time.After(testInfiniteLifetime + defaultAsyncNegativeEventTimeout):
+	}
+
+	// Receive an RA with a prefix with a lifetime value greater than the
+	// set infinite lifetime value.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, testInfiniteLifetimeSeconds+1, 0))
+	select {
+	case <-ndpDisp.prefixC:
+		t.Fatal("unexpectedly invalidated a prefix with infinite lifetime")
+	case <-time.After((testInfiniteLifetimeSeconds+1)*time.Second + defaultAsyncNegativeEventTimeout):
+	}
+
+	// Receive an RA with 0 lifetime.
+	// The prefix should get invalidated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, false, 0, 0))
+	expectPrefixEvent(subnet, false)
+}
+
+// TestPrefixDiscoveryMaxOnLinkPrefixes tests that only
+// stack.MaxDiscoveredOnLinkPrefixes discovered on-link prefixes are remembered.
+func TestPrefixDiscoveryMaxOnLinkPrefixes(t *testing.T) {
+	ndpDisp := ndpDispatcher{
+		prefixC:        make(chan ndpPrefixEvent, stack.MaxDiscoveredOnLinkPrefixes+3),
+		rememberPrefix: true,
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			DiscoverDefaultRouters: false,
+			DiscoverOnLinkPrefixes: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	optSer := make(header.NDPOptionsSerializer, stack.MaxDiscoveredOnLinkPrefixes+2)
+	prefixes := [stack.MaxDiscoveredOnLinkPrefixes + 2]tcpip.Subnet{}
+
+	// Receive an RA with 2 more than the max number of discovered on-link
+	// prefixes.
+	for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
+		prefixAddr := [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0}
+		prefixAddr[7] = byte(i)
+		prefix := tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(prefixAddr[:]),
+			PrefixLen: 64,
+		}
+		prefixes[i] = prefix.Subnet()
+		buf := [30]byte{}
+		buf[0] = uint8(prefix.PrefixLen)
+		buf[1] = 128
+		binary.BigEndian.PutUint32(buf[2:], 10)
+		copy(buf[14:], prefix.Address)
+
+		optSer[i] = header.NDPPrefixInformation(buf[:])
+	}
+
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
+	for i := 0; i < stack.MaxDiscoveredOnLinkPrefixes+2; i++ {
+		if i < stack.MaxDiscoveredOnLinkPrefixes {
+			select {
+			case e := <-ndpDisp.prefixC:
+				if diff := checkPrefixEvent(e, prefixes[i], true); diff != "" {
+					t.Errorf("prefix event mismatch (-want +got):\n%s", diff)
+				}
+			default:
+				t.Fatal("expected prefix discovery event")
+			}
+		} else {
+			select {
+			case <-ndpDisp.prefixC:
+				t.Fatal("should not have discovered a new prefix after we already discovered the max number of prefixes")
+			default:
+			}
+		}
+	}
+}
+
+// containsV6Addr reports whether list contains the IPv6 address item.
+func containsV6Addr(list []tcpip.ProtocolAddress, item tcpip.AddressWithPrefix) bool {
+	protocolAddress := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: item,
+	}
+
+	for _, i := range list {
+		if i == protocolAddress {
+			return true
+		}
+	}
+
+	return false
+}
+
+// TestNoAutoGenAddr tests that SLAAC is not performed when configured not to.
+func TestNoAutoGenAddr(t *testing.T) {
+	prefix, _, _ := prefixSubnetAddr(0, "")
+
+	// Being configured to auto-generate addresses means handle and
+	// autogen are set to true and forwarding is set to false.
+	// This tests all possible combinations of the configurations,
+	// except for the configuration where handle = true, autogen =
+	// true and forwarding = false (the required configuration to do
+	// SLAAC) - that will be done in other tests.
+	for i := 0; i < 7; i++ {
+		handle := i&1 != 0
+		autogen := i&2 != 0
+		forwarding := i&4 == 0
+
+		t.Run(fmt.Sprintf("HandleRAs(%t), AutoGenAddr(%t), Forwarding(%t)", handle, autogen, forwarding), func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              handle,
+					AutoGenGlobalAddresses: autogen,
+				},
+				NDPDisp: &ndpDisp,
+			})
+			s.SetForwarding(forwarding)
+
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			// Rx an RA with prefix with non-zero lifetime.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, false, true, 10, 0))
+
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly auto-generated an address when configured not to")
+			default:
+			}
+		})
+	}
+}
+
+// checkAutoGenAddrEvent returns a diff between e and the expected event for
+// addr on NIC 1 with the given eventType; an empty string means they match.
+func checkAutoGenAddrEvent(e ndpAutoGenAddrEvent, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) string {
+	return cmp.Diff(ndpAutoGenAddrEvent{nicID: 1, addr: addr, eventType: eventType}, e, cmp.AllowUnexported(e))
+}
+
+// TestAutoGenAddr tests that an address is properly generated and invalidated
+// when configured to do so.
+func TestAutoGenAddr(t *testing.T) {
+	const newMinVL = 2
+	newMinVLDuration := newMinVL * time.Second
+	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with zero valid lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address with 0 lifetime")
+	default:
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with non-zero lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+
+	// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+	// with preferred lifetime > valid lifetime
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address with preferred lifetime > valid lifetime")
+	default:
+	}
+
+	// Receive an RA with prefix2 in a PI.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should have %s in the list of addresses", addr1)
+	}
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) {
+		t.Fatalf("Should have %s in the list of addresses", addr2)
+	}
+
+	// Refresh valid lifetime for addr of prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly auto-generated an address when we already have an address for a prefix")
+	default:
+	}
+
+	// Wait for addr of prefix1 to be invalidated.
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
+	if containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr1) {
+		t.Fatalf("Should not have %s in the list of addresses", addr1)
+	}
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr2) {
+		t.Fatalf("Should have %s in the list of addresses", addr2)
+	}
+}
+
+func addressCheck(addrs []tcpip.ProtocolAddress, containList, notContainList []tcpip.AddressWithPrefix) string {
+	ret := ""
+	for _, c := range containList {
+		if !containsV6Addr(addrs, c) {
+			ret += fmt.Sprintf("should have %s in the list of addresses\n", c)
+		}
+	}
+	for _, c := range notContainList {
+		if containsV6Addr(addrs, c) {
+			ret += fmt.Sprintf("should not have %s in the list of addresses\n", c)
+		}
+	}
+	return ret
+}
+
+// TestAutoGenTempAddr tests that temporary SLAAC addresses are generated when
+// configured to do so as part of IPv6 Privacy Extensions.
+func TestAutoGenTempAddr(t *testing.T) {
+	const (
+		nicID            = 1
+		newMinVL         = 5
+		newMinVLDuration = newMinVL * time.Second
+	)
+
+	savedMinPrefixInformationValidLifetimeForUpdate := stack.MinPrefixInformationValidLifetimeForUpdate
+	savedMaxDesync := stack.MaxDesyncFactor
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = savedMinPrefixInformationValidLifetimeForUpdate
+		stack.MaxDesyncFactor = savedMaxDesync
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+	stack.MaxDesyncFactor = time.Nanosecond
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	tests := []struct {
+		name             string
+		dupAddrTransmits uint8
+		retransmitTimer  time.Duration
+	}{
+		{
+			name: "DAD disabled",
+		},
+		{
+			name:             "DAD enabled",
+			dupAddrTransmits: 1,
+			retransmitTimer:  time.Second,
+		},
+	}
+
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for i, test := range tests {
+			i := i
+			test := test
+
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
+
+				seed := []byte{uint8(i)}
+				var tempIIDHistory [header.IIDSize]byte
+				header.InitialTempIID(tempIIDHistory[:], seed, nicID)
+				newTempAddr := func(stableAddr tcpip.Address) tcpip.AddressWithPrefix {
+					return header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableAddr)
+				}
+
+				ndpDisp := ndpDispatcher{
+					dadC:         make(chan ndpDADEvent, 2),
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+				}
+				e := channel.New(0, 1280, linkAddr1)
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						DupAddrDetectTransmits:     test.dupAddrTransmits,
+						RetransmitTimer:            test.retransmitTimer,
+						HandleRAs:                  true,
+						AutoGenGlobalAddresses:     true,
+						AutoGenTempGlobalAddresses: true,
+					},
+					NDPDisp:     &ndpDisp,
+					TempIIDSeed: seed,
+				})
+
+				if err := s.CreateNIC(nicID, e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+				}
+
+				expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+					t.Helper()
+
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					default:
+						t.Fatal("expected addr auto gen event")
+					}
+				}
+
+				expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+					t.Helper()
+
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(defaultAsyncPositiveEventTimeout):
+						t.Fatal("timed out waiting for addr auto gen event")
+					}
+				}
+
+				expectDADEventAsync := func(addr tcpip.Address) {
+					t.Helper()
+
+					select {
+					case e := <-ndpDisp.dadC:
+						if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" {
+							t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncPositiveEventTimeout):
+						t.Fatal("timed out waiting for DAD event")
+					}
+				}
+
+				// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+				// with zero valid lifetime.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 0, 0))
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly auto-generated an address with 0 lifetime; event = %+v", e)
+				default:
+				}
+
+				// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+				// with non-zero valid lifetime.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+				expectAutoGenAddrEvent(addr1, newAddr)
+				expectDADEventAsync(addr1.Address)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly got an auto gen addr event = %+v", e)
+				default:
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+				// with non-zero valid & preferred lifetimes.
+				tempAddr1 := newTempAddr(addr1.Address)
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+				expectAutoGenAddrEvent(tempAddr1, newAddr)
+				expectDADEventAsync(tempAddr1.Address)
+				if mismatch := addressCheck(s.NICInfo()[1].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix1 in an NDP Prefix Information option (PI)
+				// with preferred lifetime > valid lifetime
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 5, 6))
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly auto-generated an address with preferred lifetime > valid lifetime; event = %+v", e)
+				default:
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix2 in a PI w/ non-zero valid and preferred
+				// lifetimes.
+				tempAddr2 := newTempAddr(addr2.Address)
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+				expectAutoGenAddrEvent(addr2, newAddr)
+				expectDADEventAsync(addr2.Address)
+				expectAutoGenAddrEventAsync(tempAddr2, newAddr)
+				expectDADEventAsync(tempAddr2.Address)
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Deprecate prefix1.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+				expectAutoGenAddrEvent(addr1, deprecatedAddr)
+				expectAutoGenAddrEvent(tempAddr1, deprecatedAddr)
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Refresh lifetimes for prefix1.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Reduce valid lifetime and deprecate addresses of prefix1.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+				expectAutoGenAddrEvent(addr1, deprecatedAddr)
+				expectAutoGenAddrEvent(tempAddr1, deprecatedAddr)
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr1, tempAddr1, addr2, tempAddr2}, nil); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Wait for addrs of prefix1 to be invalidated. They should be
+				// invalidated at the same time.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					var nextAddr tcpip.AddressWithPrefix
+					if e.addr == addr1 {
+						if diff := checkAutoGenAddrEvent(e, addr1, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+						nextAddr = tempAddr1
+					} else {
+						if diff := checkAutoGenAddrEvent(e, tempAddr1, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+						nextAddr = addr1
+					}
+
+					select {
+					case e := <-ndpDisp.autoGenAddrC:
+						if diff := checkAutoGenAddrEvent(e, nextAddr, invalidatedAddr); diff != "" {
+							t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+						}
+					case <-time.After(defaultAsyncPositiveEventTimeout):
+						t.Fatal("timed out waiting for addr auto gen event")
+					}
+				case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+
+				// Receive an RA with prefix2 in a PI w/ 0 lifetimes.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 0, 0))
+				expectAutoGenAddrEvent(addr2, deprecatedAddr)
+				expectAutoGenAddrEvent(tempAddr2, deprecatedAddr)
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Errorf("got unexpected auto gen addr event = %+v", e)
+				default:
+				}
+				if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr2, tempAddr2}, []tcpip.AddressWithPrefix{addr1, tempAddr1}); mismatch != "" {
+					t.Fatal(mismatch)
+				}
+			})
+		}
+	})
+}
+
+// TestNoAutoGenTempAddrForLinkLocal tests that temporary SLAAC addresses are not
+// generated for auto generated link-local addresses.
+func TestNoAutoGenTempAddrForLinkLocal(t *testing.T) {
+	const nicID = 1
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+	}()
+	stack.MaxDesyncFactor = time.Nanosecond
+
+	tests := []struct {
+		name             string
+		dupAddrTransmits uint8
+		retransmitTimer  time.Duration
+	}{
+		{
+			name: "DAD disabled",
+		},
+		{
+			name:             "DAD enabled",
+			dupAddrTransmits: 1,
+			retransmitTimer:  time.Second,
+		},
+	}
+
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, test := range tests {
+			test := test
+
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
+
+				ndpDisp := ndpDispatcher{
+					dadC:         make(chan ndpDADEvent, 1),
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+				}
+				e := channel.New(0, 1280, linkAddr1)
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						AutoGenTempGlobalAddresses: true,
+					},
+					NDPDisp:              &ndpDisp,
+					AutoGenIPv6LinkLocal: true,
+				})
+
+				if err := s.CreateNIC(nicID, e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+				}
+
+				// The stable link-local address should auto-generate and resolve DAD.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, tcpip.AddressWithPrefix{Address: llAddr1, PrefixLen: header.IIDOffsetInIPv6Address * 8}, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+				select {
+				case e := <-ndpDisp.dadC:
+					if diff := checkDADEvent(e, nicID, llAddr1, true, nil); diff != "" {
+						t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(time.Duration(test.dupAddrTransmits)*test.retransmitTimer + defaultAsyncPositiveEventTimeout):
+					t.Fatal("timed out waiting for DAD event")
+				}
+
+				// No new addresses should be generated.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Errorf("got unxpected auto gen addr event = %+v", e)
+				case <-time.After(defaultAsyncNegativeEventTimeout):
+				}
+			})
+		}
+	})
+}
+
+// TestNoAutoGenTempAddrWithoutStableAddr tests that a temporary SLAAC address
+// will not be generated until after DAD completes, even if a new Router
+// Advertisement is received to refresh lifetimes.
+func TestNoAutoGenTempAddrWithoutStableAddr(t *testing.T) {
+	const (
+		nicID           = 1
+		dadTransmits    = 1
+		retransmitTimer = 2 * time.Second
+	)
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+	}()
+	stack.MaxDesyncFactor = 0
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempAddr := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+
+	ndpDisp := ndpDispatcher{
+		dadC:         make(chan ndpDADEvent, 1),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits:     dadTransmits,
+			RetransmitTimer:            retransmitTimer,
+			HandleRAs:                  true,
+			AutoGenGlobalAddresses:     true,
+			AutoGenTempGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	// Receive an RA to trigger SLAAC for prefix.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, addr, newAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected addr auto gen event")
+	}
+
+	// DAD on the stable address for prefix has not yet completed. Receiving a new
+	// RA that would refresh lifetimes should not generate a temporary SLAAC
+	// address for the prefix.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpected auto gen addr event = %+v", e)
+	default:
+	}
+
+	// Wait for DAD to complete for the stable address then expect the temporary
+	// address to be generated.
+	select {
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, addr.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for DAD event")
+	}
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, tempAddr, newAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
+}
+
+// TestAutoGenTempAddrRegen tests that temporary SLAAC addresses are
+// regenerated.
+func TestAutoGenTempAddrRegen(t *testing.T) {
+	const (
+		nicID            = 1
+		regenAfter       = 2 * time.Second
+		newMinVL         = 10
+		newMinVLDuration = newMinVL * time.Second
+	)
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime
+	savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+		stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
+		stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
+	}()
+	stack.MaxDesyncFactor = 0
+	stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration
+	stack.MinMaxTempAddrValidLifetime = newMinVLDuration
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr3 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	ndpConfigs := stack.NDPConfigurations{
+		HandleRAs:                  true,
+		AutoGenGlobalAddresses:     true,
+		AutoGenTempGlobalAddresses: true,
+		RegenAdvanceDuration:       newMinVLDuration - regenAfter,
+	}
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:       ndpConfigs,
+		NDPDisp:          &ndpDisp,
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix in an NDP Prefix Information option (PI)
+	// with non-zero valid & preferred lifetimes.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr, newAddr)
+	expectAutoGenAddrEvent(tempAddr1, newAddr)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Wait for regeneration
+	expectAutoGenAddrEventAsync(tempAddr2, newAddr, regenAfter+defaultAsyncPositiveEventTimeout)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Wait for regeneration
+	expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1, tempAddr2, tempAddr3}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Stop generating temporary addresses
+	ndpConfigs.AutoGenTempGlobalAddresses = false
+	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	}
+
+	// Wait for all the temporary addresses to get invalidated.
+	tempAddrs := []tcpip.AddressWithPrefix{tempAddr1, tempAddr2, tempAddr3}
+	invalidateAfter := newMinVLDuration - 2*regenAfter
+	for _, addr := range tempAddrs {
+		// Wait for a deprecation then invalidation event, or just an invalidation
+		// event. We need to cover both cases but cannot deterministically hit both
+		// cases because the deprecation and invalidation timers could fire in any
+		// order.
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, deprecatedAddr); diff == "" {
+				// If we get a deprecation event first, we should get an invalidation
+				// event almost immediately after.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(defaultAsyncPositiveEventTimeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
+			} else if diff := checkAutoGenAddrEvent(e, addr, invalidatedAddr); diff == "" {
+				// If we get an invalidation event first, we shouldn't get a deprecation
+				// event after.
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					t.Fatalf("unexpectedly got an auto-generated event = %+v", e)
+				case <-time.After(defaultAsyncNegativeEventTimeout):
+				}
+			} else {
+				t.Fatalf("got unexpected auto-generated event = %+v", e)
+			}
+		case <-time.After(invalidateAfter + defaultAsyncPositiveEventTimeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+
+		invalidateAfter = regenAfter
+	}
+	if mismatch := addressCheck(s.NICInfo()[1].ProtocolAddresses, []tcpip.AddressWithPrefix{addr}, tempAddrs); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+}
+
+// TestAutoGenTempAddrRegenTimerUpdates tests that a temporary address's
+// regeneration timer gets updated when refreshing the address's lifetimes.
+func TestAutoGenTempAddrRegenTimerUpdates(t *testing.T) {
+	const (
+		nicID            = 1
+		regenAfter       = 2 * time.Second
+		newMinVL         = 10
+		newMinVLDuration = newMinVL * time.Second
+	)
+
+	savedMaxDesyncFactor := stack.MaxDesyncFactor
+	savedMinMaxTempAddrPreferredLifetime := stack.MinMaxTempAddrPreferredLifetime
+	savedMinMaxTempAddrValidLifetime := stack.MinMaxTempAddrValidLifetime
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesyncFactor
+		stack.MinMaxTempAddrPreferredLifetime = savedMinMaxTempAddrPreferredLifetime
+		stack.MinMaxTempAddrValidLifetime = savedMinMaxTempAddrValidLifetime
+	}()
+	stack.MaxDesyncFactor = 0
+	stack.MinMaxTempAddrPreferredLifetime = newMinVLDuration
+	stack.MinMaxTempAddrValidLifetime = newMinVLDuration
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+	tempAddr3 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], addr.Address)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	ndpConfigs := stack.NDPConfigurations{
+		HandleRAs:                  true,
+		AutoGenGlobalAddresses:     true,
+		AutoGenTempGlobalAddresses: true,
+		RegenAdvanceDuration:       newMinVLDuration - regenAfter,
+	}
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:       ndpConfigs,
+		NDPDisp:          &ndpDisp,
+	})
+
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAsync := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	// Receive an RA with prefix in an NDP Prefix Information option (PI)
+	// with non-zero valid & preferred lifetimes.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr, newAddr)
+	expectAutoGenAddrEvent(tempAddr1, newAddr)
+	if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, []tcpip.AddressWithPrefix{addr, tempAddr1}, nil); mismatch != "" {
+		t.Fatal(mismatch)
+	}
+
+	// Deprecate the prefix.
+	//
+	// A new temporary address should be generated after the regeneration
+	// time has passed since the prefix is deprecated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr, deprecatedAddr)
+	expectAutoGenAddrEvent(tempAddr1, deprecatedAddr)
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpected auto gen addr event = %+v", e)
+	case <-time.After(regenAfter + defaultAsyncNegativeEventTimeout):
+	}
+
+	// Prefer the prefix again.
+	//
+	// A new temporary address should immediately be generated since the
+	// regeneration time has already passed since the last address was generated
+	// - this regeneration does not depend on a timer.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEvent(tempAddr2, newAddr)
+
+	// Increase the maximum lifetimes for temporary addresses to large values
+	// then refresh the lifetimes of the prefix.
+	//
+	// A new address should not be generated after the regeneration time that was
+	// expected for the previous check. This is because the preferred lifetime for
+	// the temporary addresses has increased, so it will take more time to
+	// regenerate a new temporary address. Note, new addresses are only
+	// regenerated after the preferred lifetime - the regenerate advance duration
+	// has passed.
+	ndpConfigs.MaxTempAddrValidLifetime = 100 * time.Second
+	ndpConfigs.MaxTempAddrPreferredLifetime = 100 * time.Second
+	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	}
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		t.Fatalf("unexpected auto gen addr event = %+v", e)
+	case <-time.After(regenAfter + defaultAsyncNegativeEventTimeout):
+	}
+
+	// Set the maximum lifetimes for temporary addresses such that on the next
+	// RA, the regeneration timer gets reset.
+	//
+	// The maximum lifetime is the sum of the minimum lifetimes for temporary
+	// addresses + the time that has already passed since the last address was
+	// generated so that the regeneration timer is needed to generate the next
+	// address.
+	newLifetimes := newMinVLDuration + regenAfter + defaultAsyncNegativeEventTimeout
+	ndpConfigs.MaxTempAddrValidLifetime = newLifetimes
+	ndpConfigs.MaxTempAddrPreferredLifetime = newLifetimes
+	if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+		t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+	}
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+	expectAutoGenAddrEventAsync(tempAddr3, newAddr, regenAfter+defaultAsyncPositiveEventTimeout)
+}
+
+// TestMixedSLAACAddrConflictRegen tests SLAAC address regeneration in response
+// to a mix of DAD conflicts and NIC-local conflicts.
+func TestMixedSLAACAddrConflictRegen(t *testing.T) {
+	const (
+		nicID           = 1
+		nicName         = "nic"
+		lifetimeSeconds = 9999
+		// From stack.maxSLAACAddrLocalRegenAttempts
+		maxSLAACAddrLocalRegenAttempts = 10
+		// We use 2 more addresses than the maximum local regeneration attempts
+		// because we want to also trigger regeneration in response to a DAD
+		// conflict for this test.
+		maxAddrs         = maxSLAACAddrLocalRegenAttempts + 2
+		dupAddrTransmits = 1
+		retransmitTimer  = time.Second
+	)
+
+	var tempIIDHistoryWithModifiedEUI64 [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistoryWithModifiedEUI64[:], nil, nicID)
+
+	var tempIIDHistoryWithOpaqueIID [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistoryWithOpaqueIID[:], nil, nicID)
+
+	prefix, subnet, stableAddrWithModifiedEUI64 := prefixSubnetAddr(0, linkAddr1)
+	var stableAddrsWithOpaqueIID [maxAddrs]tcpip.AddressWithPrefix
+	var tempAddrsWithOpaqueIID [maxAddrs]tcpip.AddressWithPrefix
+	var tempAddrsWithModifiedEUI64 [maxAddrs]tcpip.AddressWithPrefix
+	addrBytes := []byte(subnet.ID())
+	for i := 0; i < maxAddrs; i++ {
+		stableAddrsWithOpaqueIID[i] = tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, uint8(i), nil)),
+			PrefixLen: header.IIDOffsetInIPv6Address * 8,
+		}
+		// When generating temporary addresses, the resolved stable address for the
+		// SLAAC prefix will be the first stable address generated for the
+		// prefix as we will not simulate address conflicts for the stable addresses
+		// in tests involving temporary addresses. Address conflicts for stable
+		// addresses will be done in their own tests.
+		tempAddrsWithOpaqueIID[i] = header.GenerateTempIPv6SLAACAddr(tempIIDHistoryWithOpaqueIID[:], stableAddrsWithOpaqueIID[0].Address)
+		tempAddrsWithModifiedEUI64[i] = header.GenerateTempIPv6SLAACAddr(tempIIDHistoryWithModifiedEUI64[:], stableAddrWithModifiedEUI64.Address)
+	}
+
+	tests := []struct {
+		name          string
+		addrs         []tcpip.AddressWithPrefix
+		tempAddrs     bool
+		initialExpect tcpip.AddressWithPrefix
+		nicNameFromID func(tcpip.NICID, string) string
+	}{
+		{
+			name:  "Stable addresses with opaque IIDs",
+			addrs: stableAddrsWithOpaqueIID[:],
+			nicNameFromID: func(tcpip.NICID, string) string {
+				return nicName
+			},
+		},
+		{
+			name:          "Temporary addresses with opaque IIDs",
+			addrs:         tempAddrsWithOpaqueIID[:],
+			tempAddrs:     true,
+			initialExpect: stableAddrsWithOpaqueIID[0],
+			nicNameFromID: func(tcpip.NICID, string) string {
+				return nicName
+			},
+		},
+		{
+			name:          "Temporary addresses with modified EUI64",
+			addrs:         tempAddrsWithModifiedEUI64[:],
+			tempAddrs:     true,
+			initialExpect: stableAddrWithModifiedEUI64,
+		},
+	}
+
+	for _, test := range tests {
+		test := test
+
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			ndpConfigs := stack.NDPConfigurations{
+				HandleRAs:                     true,
+				AutoGenGlobalAddresses:        true,
+				AutoGenTempGlobalAddresses:    test.tempAddrs,
+				AutoGenAddressConflictRetries: 1,
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NDPConfigs:         ndpConfigs,
+				NDPDisp:            &ndpDisp,
+				OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+					NICNameFromID: test.nicNameFromID,
+				},
+			})
+
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv6EmptySubnet,
+				Gateway:     llAddr2,
+				NIC:         nicID,
+			}})
+
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			for j := 0; j < len(test.addrs)-1; j++ {
+				// The NIC will not attempt to generate an address in response to a
+				// NIC-local conflict after some maximum number of attempts. We skip
+				// creating a conflict for the address that would be generated as part
+				// of the last attempt so we can simulate a DAD conflict for this
+				// address and restart the NIC-local generation process.
+				if j == maxSLAACAddrLocalRegenAttempts-1 {
+					continue
+				}
+
+				if err := s.AddAddress(nicID, ipv6.ProtocolNumber, test.addrs[j].Address); err != nil {
+					t.Fatalf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, test.addrs[j].Address, err)
+				}
+			}
+
+			expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			}
+
+			expectAutoGenAddrAsyncEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(defaultAsyncPositiveEventTimeout):
+					t.Fatal("timed out waiting for addr auto gen event")
+				}
+			}
+
+			expectDADEventAsync := func(addr tcpip.Address) {
+				t.Helper()
+
+				select {
+				case e := <-ndpDisp.dadC:
+					if diff := checkDADEvent(e, nicID, addr, true, nil); diff != "" {
+						t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(dupAddrTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout):
+					t.Fatal("timed out waiting for DAD event")
+				}
+			}
+
+			// Enable DAD.
+			ndpDisp.dadC = make(chan ndpDADEvent, 2)
+			ndpConfigs.DupAddrDetectTransmits = dupAddrTransmits
+			ndpConfigs.RetransmitTimer = retransmitTimer
+			if err := s.SetNDPConfigurations(nicID, ndpConfigs); err != nil {
+				t.Fatalf("s.SetNDPConfigurations(%d, _): %s", nicID, err)
+			}
+
+			// Do SLAAC for prefix.
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+			if test.initialExpect != (tcpip.AddressWithPrefix{}) {
+				expectAutoGenAddrEvent(test.initialExpect, newAddr)
+				expectDADEventAsync(test.initialExpect.Address)
+			}
+
+			// The last local generation attempt should succeed, but we introduce a
+			// DAD failure to restart the local generation process.
+			addr := test.addrs[maxSLAACAddrLocalRegenAttempts-1]
+			expectAutoGenAddrAsyncEvent(addr, newAddr)
+			if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+				t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+			}
+			select {
+			case e := <-ndpDisp.dadC:
+				if diff := checkDADEvent(e, nicID, addr.Address, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			default:
+				t.Fatal("expected DAD event")
+			}
+			expectAutoGenAddrEvent(addr, invalidatedAddr)
+
+			// The last address generated should resolve DAD.
+			addr = test.addrs[len(test.addrs)-1]
+			expectAutoGenAddrAsyncEvent(addr, newAddr)
+			expectDADEventAsync(addr.Address)
+
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				t.Fatalf("unexpected auto gen addr event = %+v", e)
+			default:
+			}
+		})
+	}
+}
+
+// stackAndNdpDispatcherWithDefaultRoute returns an ndpDispatcher, a
+// channel.Endpoint and a stack.Stack with a NIC (nicID) created on the endpoint.
+//
+// The stack handles RAs and auto-generates global addresses. It has a default
+// route through the router (llAddr3) installed and a static link-address
+// (linkAddr3) added to the link address cache for the router.
+func stackAndNdpDispatcherWithDefaultRoute(t *testing.T, nicID tcpip.NICID) (*ndpDispatcher, *channel.Endpoint, *stack.Stack) {
+	t.Helper()
+	ndpDisp := &ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: ndpDisp,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+	s.SetRouteTable([]tcpip.Route{{
+		Destination: header.IPv6EmptySubnet,
+		Gateway:     llAddr3,
+		NIC:         nicID,
+	}})
+	s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+	return ndpDisp, e, s
+}
+
+// addrForNewConnectionTo returns the local address the stack selects when a
+// new IPv6-only UDP endpoint is connected to addr.
+func addrForNewConnectionTo(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address {
+	t.Helper()
+
+	wq := waiter.Queue{}
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+	defer close(ch)
+	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+	}
+	defer ep.Close()
+	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
+	}
+	if err := ep.Connect(addr); err != nil {
+		t.Fatalf("ep.Connect(%+v): %s", addr, err)
+	}
+	got, err := ep.GetLocalAddress()
+	if err != nil {
+		t.Fatalf("ep.GetLocalAddress(): %s", err)
+	}
+	return got.Addr
+}
+
+// addrForNewConnection returns the local address used when creating a new
+// connection to dstAddr.
+func addrForNewConnection(t *testing.T, s *stack.Stack) tcpip.Address {
+	t.Helper()
+
+	return addrForNewConnectionTo(t, s, dstAddr)
+}
+
+// addrForNewConnectionWithAddr returns the local address reported by a new
+// IPv6-only UDP endpoint that is bound to addr and then connected to dstAddr.
+func addrForNewConnectionWithAddr(t *testing.T, s *stack.Stack, addr tcpip.FullAddress) tcpip.Address {
+	t.Helper()
+
+	wq := waiter.Queue{}
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+	defer close(ch)
+	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+	}
+	defer ep.Close()
+	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
+	}
+	if err := ep.Bind(addr); err != nil {
+		t.Fatalf("ep.Bind(%+v): %s", addr, err)
+	}
+	if err := ep.Connect(dstAddr); err != nil {
+		t.Fatalf("ep.Connect(%+v): %s", dstAddr, err)
+	}
+	got, err := ep.GetLocalAddress()
+	if err != nil {
+		t.Fatalf("ep.GetLocalAddress(): %s", err)
+	}
+	return got.Addr
+}
+
+// TestAutoGenAddrDeprecateFromPI tests deprecating a SLAAC address when
+// receiving a PI with 0 preferred lifetime.
+func TestAutoGenAddrDeprecateFromPI(t *testing.T) {
+	const nicID = 1
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+		t.Helper()
+
+		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+		} else if got != addr {
+			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+		}
+
+		if got := addrForNewConnection(t, s); got != addr.Address {
+			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+		}
+	}
+
+	// Receive PI for prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	expectPrimaryAddr(addr1)
+
+	// Deprecate addr for prefix1 immediately.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr1, deprecatedAddr)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	// addr should still be the primary endpoint as there are no other addresses.
+	expectPrimaryAddr(addr1)
+
+	// Refresh lifetimes of addr generated from prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 100))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr1)
+
+	// Receive PI for prefix2.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr2)
+
+	// Deprecate addr for prefix2 immediately.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	expectAutoGenAddrEvent(addr2, deprecatedAddr)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	// addr1 should be the primary endpoint now since addr2 is deprecated but
+	// addr1 is not.
+	expectPrimaryAddr(addr1)
+	// addr2 is deprecated but if explicitly requested, it should be used.
+	fullAddr2 := tcpip.FullAddress{Addr: addr2.Address, NIC: nicID}
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+	}
+
+	// Another PI w/ 0 preferred lifetime should not result in a deprecation
+	// event.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr1)
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr2); got != addr2.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr2, got, addr2.Address)
+	}
+
+	// Refresh lifetimes of addr generated from prefix2.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr2)
+}
+
+// TestAutoGenAddrTimerDeprecation tests that an address is properly deprecated
+// when its preferred lifetime expires.
+func TestAutoGenAddrTimerDeprecation(t *testing.T) {
+	const nicID = 1
+	const newMinVL = 2 // In seconds; see newMinVLDuration.
+	newMinVLDuration := newMinVL * time.Second
+	saved := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = saved
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVLDuration
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+
+	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+		// Non-blocking: the event must already be pending on the channel.
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAfter := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType, timeout time.Duration) {
+		t.Helper()
+		// Blocking: wait up to timeout for the event to arrive.
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(timeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+		t.Helper()
+		// addr must be both the NIC's main address and the one new connections use.
+		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+		} else if got != addr {
+			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+		}
+
+		if got := addrForNewConnection(t, s); got != addr.Address {
+			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+		}
+	}
+
+	// Receive PI for prefix2.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 100))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr2)
+
+	// Receive a PI for prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, 100, 90))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr1)
+
+	// Refresh lifetime for addr of prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr1)
+
+	// Wait for addr of prefix1 to be deprecated.
+	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { // Deprecated, but must still be assigned.
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	// addr2 should be the primary endpoint now since addr1 is deprecated but
+	// addr2 is not.
+	expectPrimaryAddr(addr2)
+	// addr1 is deprecated but if explicitly requested, it should be used.
+	fullAddr1 := tcpip.FullAddress{Addr: addr1.Address, NIC: nicID}
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+	}
+
+	// Refresh valid lifetime for addr of prefix1, w/ 0 preferred lifetime to make
+	// sure we do not get a deprecation event again.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	expectPrimaryAddr(addr2)
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+	}
+
+	// Refresh lifetimes for addr of prefix1.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, newMinVL, newMinVL-1))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+	// addr1 is the primary endpoint again since it is non-deprecated now.
+	expectPrimaryAddr(addr1)
+
+	// Wait for addr of prefix1 to be deprecated.
+	expectAutoGenAddrEventAfter(addr1, deprecatedAddr, newMinVLDuration-time.Second+defaultAsyncPositiveEventTimeout)
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) { // Deprecated, but must still be assigned.
+		t.Fatalf("should have %s in the list of addresses", addr1)
+	}
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	// addr2 should be the primary endpoint now since it is not deprecated.
+	expectPrimaryAddr(addr2)
+	if got := addrForNewConnectionWithAddr(t, s, fullAddr1); got != addr1.Address {
+		t.Errorf("got addrForNewConnectionWithAddr(_, _, %+v) = %s, want = %s", fullAddr1, got, addr1.Address)
+	}
+
+	// Wait for addr of prefix1 to be invalidated.
+	expectAutoGenAddrEventAfter(addr1, invalidatedAddr, time.Second+defaultAsyncPositiveEventTimeout)
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should not have %s in the list of addresses", addr1)
+	}
+	if !containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should have %s in the list of addresses", addr2)
+	}
+	expectPrimaryAddr(addr2)
+
+	// Refresh both lifetimes for addr of prefix2 to the same value.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, newMinVL, newMinVL))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto-generated event")
+	default:
+	}
+
+	// Wait for a deprecation then invalidation events, or just an invalidation
+	// event. We need to cover both cases but cannot deterministically hit both
+	// cases because the deprecation and invalidation handlers could be handled in
+	// either deprecation then invalidation, or invalidation then deprecation
+	// (which should be cancelled by the invalidation handler).
+	select {
+	case e := <-ndpDisp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(e, addr2, deprecatedAddr); diff == "" {
+			// If we get a deprecation event first, we should get an invalidation
+			// event almost immediately after.
+			select {
+			case e := <-ndpDisp.autoGenAddrC:
+				if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff != "" {
+					t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+				}
+			case <-time.After(defaultAsyncPositiveEventTimeout):
+				t.Fatal("timed out waiting for addr auto gen event")
+			}
+		} else if diff := checkAutoGenAddrEvent(e, addr2, invalidatedAddr); diff == "" {
+			// If we get an invalidation event first, we should not get a deprecation
+			// event after.
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Fatal("unexpectedly got an auto-generated event")
+			case <-time.After(defaultAsyncNegativeEventTimeout):
+			}
+		} else {
+			t.Fatalf("got unexpected auto-generated event")
+		}
+	case <-time.After(newMinVLDuration + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr1) {
+		t.Fatalf("should not have %s in the list of addresses", addr1)
+	}
+	if containsV6Addr(s.NICInfo()[nicID].ProtocolAddresses, addr2) {
+		t.Fatalf("should not have %s in the list of addresses", addr2)
+	}
+	// Should not have any primary endpoints.
+	if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+		t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+	} else if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+	wq := waiter.Queue{}
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+	defer close(ch)
+	ep, err := s.NewEndpoint(header.UDPProtocolNumber, header.IPv6ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("s.NewEndpoint(%d, %d, _): %s", header.UDPProtocolNumber, header.IPv6ProtocolNumber, err)
+	}
+	defer ep.Close()
+	if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+		t.Fatalf("SetSockOpt(tcpip.V6OnlyOption, true): %s", err)
+	}
+	// With no addresses left on the NIC, connecting must fail with ErrNoRoute.
+	if err := ep.Connect(dstAddr); err != tcpip.ErrNoRoute {
+		t.Errorf("got ep.Connect(%+v) = %v, want = %s", dstAddr, err, tcpip.ErrNoRoute)
+	}
+}
+
+// TestAutoGenAddrFiniteToInfiniteToFiniteVL checks that a SLAAC address whose
+// valid lifetime goes finite -> infinite -> finite is still invalidated.
+func TestAutoGenAddrFiniteToInfiniteToFiniteVL(t *testing.T) {
+	const infiniteVLSeconds = 2
+	const minVLSeconds = 1
+	origInfiniteLifetime := header.NDPInfiniteLifetime
+	origMinVL := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = origMinVL
+		header.NDPInfiniteLifetime = origInfiniteLifetime
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = minVLSeconds * time.Second
+	header.NDPInfiniteLifetime = infiniteVLSeconds * time.Second
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	testCases := []struct {
+		name       string
+		infiniteVL uint32
+	}{
+		{
+			name:       "EqualToInfiniteVL",
+			infiniteVL: infiniteVLSeconds,
+		},
+		// header.NDPInfiniteLifetime is overridden above, so a packet may carry
+		// a lifetime field larger than the configured "infinite" value. Such a
+		// value is expected to be handled exactly like a lifetime equal to
+		// header.NDPInfiniteLifetime, which is what this case exercises.
+		//
+		// (This situation can only arise because tests are able to change it.)
+		{
+			name:       "MoreThanInfiniteVL",
+			infiniteVL: infiniteVLSeconds + 1,
+		},
+	}
+
+	// The enclosing Run blocks until every parallel subtest below is done.
+	//
+	// That matters because the deferred teardown above must only run once
+	// the parallel subtests have completed.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, tc := range testCases {
+			tc := tc
+
+			t.Run(tc.name, func(t *testing.T) {
+				t.Parallel()
+
+				disp := &ndpDispatcher{
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+				}
+				linkEP := channel.New(0, 1280, linkAddr1)
+				st := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						HandleRAs:              true,
+						AutoGenGlobalAddresses: true,
+					},
+					NDPDisp: disp,
+				})
+
+				if err := st.CreateNIC(1, linkEP); err != nil {
+					t.Fatalf("CreateNIC(1) = %s", err)
+				}
+
+				// First RA: prefix with a finite valid lifetime.
+				linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0))
+				select {
+				case ev := <-disp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(ev, addr, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+
+				// Second RA: same prefix, now with an infinite valid lifetime.
+				linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, tc.infiniteVL, 0))
+
+				// Third RA: same prefix, back to a finite valid lifetime.
+				linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, minVLSeconds, 0))
+
+				select {
+				case ev := <-disp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(ev, addr, invalidatedAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+
+				case <-time.After(minVLSeconds*time.Second + defaultAsyncPositiveEventTimeout):
+					t.Fatal("timeout waiting for addr auto gen event")
+				}
+			})
+		}
+	})
+}
+
+// TestAutoGenAddrValidLifetimeUpdates verifies that an auto-generated
+// address's valid lifetime is updated only in the cases required by
+// RFC 4862 section 5.5.3.e.
+func TestAutoGenAddrValidLifetimeUpdates(t *testing.T) {
+	const infiniteVL = 4294967295
+	const newMinVL = 4
+	origMinVL := stack.MinPrefixInformationValidLifetimeForUpdate
+	defer func() {
+		stack.MinPrefixInformationValidLifetimeForUpdate = origMinVL
+	}()
+	stack.MinPrefixInformationValidLifetimeForUpdate = newMinVL * time.Second
+
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	cases := []struct {
+		name string
+		ovl  uint32 // Valid lifetime carried by the first RA.
+		nvl  uint32 // Valid lifetime carried by the second RA.
+		evl  uint32 // Expected effective valid lifetime, in seconds.
+	}{
+		// When the new VL is below newMinVL but the original VL was
+		// above it, the VL is expected to be clamped to the minimum
+		// VL for updating.
+		{
+			name: "LargeVLToVLLessThanMinVLForUpdate",
+			ovl:  9999,
+			nvl:  1,
+			evl:  newMinVL,
+		},
+		{
+			name: "LargeVLTo0",
+			ovl:  9999,
+			nvl:  0,
+			evl:  newMinVL,
+		},
+		{
+			name: "InfiniteVLToVLLessThanMinVLForUpdate",
+			ovl:  infiniteVL,
+			nvl:  1,
+			evl:  newMinVL,
+		},
+		{
+			name: "InfiniteVLTo0",
+			ovl:  infiniteVL,
+			nvl:  0,
+			evl:  newMinVL,
+		},
+
+		// When both the original and the new VL are below newMinVL,
+		// the VL is expected to stay untouched.
+		{
+			name: "ShouldNotUpdateWhenBothOldAndNewAreLessThanMinVLForUpdate",
+			ovl:  newMinVL - 1,
+			nvl:  newMinVL - 3,
+			evl:  newMinVL - 1,
+		},
+
+		// The new VL is expected to be adopted when it exceeds the
+		// remaining time or exceeds newMinVL.
+		{
+			name: "MorethanMinVLToLesserButStillMoreThanMinVLForUpdate",
+			ovl:  newMinVL + 5,
+			nvl:  newMinVL + 3,
+			evl:  newMinVL + 3,
+		},
+		{
+			name: "SmallVLToGreaterVLButStillLessThanMinVLForUpdate",
+			ovl:  newMinVL - 3,
+			nvl:  newMinVL - 1,
+			evl:  newMinVL - 1,
+		},
+		{
+			name: "SmallVLToGreaterVLThatIsMoreThaMinVLForUpdate",
+			ovl:  newMinVL - 3,
+			nvl:  newMinVL + 1,
+			evl:  newMinVL + 1,
+		},
+	}
+
+	// The enclosing Run blocks until every parallel subtest below is done.
+	//
+	// That matters because the deferred teardown above must only run once
+	// the parallel subtests have completed.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, tc := range cases {
+			tc := tc
+
+			t.Run(tc.name, func(t *testing.T) {
+				t.Parallel()
+
+				disp := &ndpDispatcher{
+					autoGenAddrC: make(chan ndpAutoGenAddrEvent, 10),
+				}
+				linkEP := channel.New(10, 1280, linkAddr1)
+				st := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						HandleRAs:              true,
+						AutoGenGlobalAddresses: true,
+					},
+					NDPDisp: disp,
+				})
+
+				if err := st.CreateNIC(1, linkEP); err != nil {
+					t.Fatalf("CreateNIC(1) = %s", err)
+				}
+
+				// First RA: prefix with the initial valid lifetime,
+				// tc.ovl.
+				linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, tc.ovl, 0))
+				select {
+				case ev := <-disp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(ev, addr, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+
+				// Second RA: same prefix with the new valid lifetime,
+				// tc.nvl.
+				linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, tc.nvl, 0))
+
+				// The effective VL is validated through timing: no event may
+				// arrive before tc.evl seconds (less the negative-event
+				// timeout), and the invalidation event must then arrive
+				// within the positive-event timeout.
+
+				// No invalidation is allowed before the effective valid
+				// lifetime has (almost) elapsed.
+				select {
+				case <-disp.autoGenAddrC:
+					t.Fatal("unexpectedly received an auto gen addr event")
+				case <-time.After(time.Duration(tc.evl)*time.Second - defaultAsyncNegativeEventTimeout):
+				}
+
+				// Now the invalidation event is due.
+				select {
+				case ev := <-disp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(ev, addr, invalidatedAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				case <-time.After(defaultAsyncPositiveEventTimeout):
+					t.Fatal("timeout waiting for addr auto gen event")
+				}
+			})
+		}
+	})
+}
+
+// TestAutoGenAddrRemoval verifies that removing an auto-generated address by
+// hand emits an invalidation event right away and that no further event
+// shows up once the address's original valid lifetime has elapsed.
+func TestAutoGenAddrRemoval(t *testing.T) {
+	slaacPrefix, _, slaacAddr := prefixSubnetAddr(0, linkAddr1)
+
+	disp := &ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	linkEP := channel.New(0, 1280, linkAddr1)
+	st := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: disp,
+	})
+
+	if err := st.CreateNIC(1, linkEP); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	wantEvent := func(want tcpip.AddressWithPrefix, kind ndpAutoGenAddrEventType) {
+		t.Helper()
+		// Non-blocking: the event must already be pending on the channel.
+		select {
+		case ev := <-disp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(ev, want, kind); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	// A PI for the prefix makes the stack auto-generate the address.
+	const lifetimeSeconds = 1
+	linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, slaacPrefix, true, true, lifetimeSeconds, 0))
+	wantEvent(slaacAddr, newAddr)
+
+	// Manual removal is expected to produce the invalidation event
+	// synchronously.
+	if err := st.RemoveAddress(1, slaacAddr.Address); err != nil {
+		t.Fatalf("RemoveAddress(_, %s) = %s", slaacAddr.Address, err)
+	}
+	wantEvent(slaacAddr, invalidatedAddr)
+
+	// Sit out the original valid lifetime: a second event here would mean
+	// the original timer was not stopped/cleaned up.
+	select {
+	case <-disp.autoGenAddrC:
+		t.Fatal("unexpectedly received an auto gen addr event")
+	case <-time.After(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout):
+	}
+}
+
+// TestAutoGenAddrAfterRemoval tests adding a SLAAC address that was previously
+// assigned to the NIC but is in the permanentExpired state.
+func TestAutoGenAddrAfterRemoval(t *testing.T) {
+	const nicID = 1
+
+	prefix1, _, addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, addr2 := prefixSubnetAddr(1, linkAddr1)
+	ndpDisp, e, s := stackAndNdpDispatcherWithDefaultRoute(t, nicID)
+
+	expectAutoGenAddrEvent := func(addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+		// Non-blocking: the event must already be pending on the channel.
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectPrimaryAddr := func(addr tcpip.AddressWithPrefix) {
+		t.Helper()
+		// addr must be both the NIC's main address and the one new connections use.
+		if got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber); err != nil {
+			t.Fatalf("s.GetMainNICAddress(%d, %d): %s", nicID, header.IPv6ProtocolNumber, err)
+		} else if got != addr {
+			t.Errorf("got s.GetMainNICAddress(%d, %d) = %s, want = %s", nicID, header.IPv6ProtocolNumber, got, addr)
+		}
+
+		if got := addrForNewConnection(t, s); got != addr.Address {
+			t.Errorf("got addrForNewConnection = %s, want = %s", got, addr.Address)
+		}
+	}
+
+	// Receive a PI to auto-generate addr1 with a large valid and preferred
+	// lifetime.
+	const largeLifetimeSeconds = 999
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix1, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+	expectAutoGenAddrEvent(addr1, newAddr)
+	expectPrimaryAddr(addr1)
+
+	// Add addr2 as a static address.
+	protoAddr2 := tcpip.ProtocolAddress{
+		Protocol:          header.IPv6ProtocolNumber,
+		AddressWithPrefix: addr2,
+	}
+	if err := s.AddProtocolAddressWithOptions(nicID, protoAddr2, stack.FirstPrimaryEndpoint); err != nil {
+		t.Fatalf("AddProtocolAddressWithOptions(%d, %+v, %d) = %s", nicID, protoAddr2, stack.FirstPrimaryEndpoint, err)
+	}
+	// addr2 should be more preferred now since it is at the front of the primary
+	// list.
+	expectPrimaryAddr(addr2)
+
+	// Get a route using addr2 to increment its reference count then remove it
+	// to leave it in the permanentExpired state.
+	r, err := s.FindRoute(nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, false)
+	if err != nil {
+		t.Fatalf("FindRoute(%d, %s, %s, %d, false): %s", nicID, addr2.Address, addr3, header.IPv6ProtocolNumber, err)
+	}
+	defer r.Release()
+	if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
+		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
+	}
+	// addr1 should be preferred again since addr2 is in the expired state.
+	expectPrimaryAddr(addr1)
+
+	// Receive a PI to auto-generate addr2 as valid and preferred.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	// addr2 should be more preferred now that it is closer to the front of the
+	// primary list and not deprecated.
+	expectPrimaryAddr(addr2)
+
+	// Removing the address should result in an invalidation event immediately.
+	// It should still be in the permanentExpired state because r is still held.
+	//
+	// We remove addr2 here to make sure addr2 was marked as a SLAAC address
+	// (it was previously marked as a static address).
+	if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
+		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
+	}
+	expectAutoGenAddrEvent(addr2, invalidatedAddr)
+	// addr1 should be more preferred since addr2 is in the expired state.
+	expectPrimaryAddr(addr1)
+
+	// Receive a PI to auto-generate addr2 as valid and deprecated.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, 0))
+	expectAutoGenAddrEvent(addr2, newAddr)
+	// addr1 should still be more preferred since addr2 is deprecated, even though
+	// it is closer to the front of the primary list.
+	expectPrimaryAddr(addr1)
+
+	// Receive a PI to refresh addr2's preferred lifetime.
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, prefix2, true, true, largeLifetimeSeconds, largeLifetimeSeconds))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly got an auto gen addr event")
+	default:
+	}
+	// addr2 should be more preferred now that it is not deprecated.
+	expectPrimaryAddr(addr2)
+	// Remove addr2 once more; addr1 should become the primary address again.
+	if err := s.RemoveAddress(nicID, addr2.Address); err != nil {
+		t.Fatalf("s.RemoveAddress(%d, %s): %s", nicID, addr2.Address, err)
+	}
+	expectAutoGenAddrEvent(addr2, invalidatedAddr)
+	expectPrimaryAddr(addr1)
+}
+
+// TestAutoGenAddrStaticConflict tests that if SLAAC generates an address that
+// is already assigned to the NIC, the static address remains.
+func TestAutoGenAddrStaticConflict(t *testing.T) {
+	prefix, _, addr := prefixSubnetAddr(0, linkAddr1)
+
+	ndpDisp := ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: &ndpDisp,
+	})
+
+	if err := s.CreateNIC(1, e); err != nil {
+		t.Fatalf("CreateNIC(1) = %s", err)
+	}
+
+	// Add the address as a static address before SLAAC tries to add it.
+	if err := s.AddProtocolAddress(1, tcpip.ProtocolAddress{Protocol: header.IPv6ProtocolNumber, AddressWithPrefix: addr}); err != nil {
+		t.Fatalf("AddProtocolAddress(_, %d, %s) = %s", header.IPv6ProtocolNumber, addr.Address, err)
+	}
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr)
+	}
+
+	// Receive a PI where the generated address will be the same as the one
+	// that we already have assigned statically.
+	const lifetimeSeconds = 1
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, 0))
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly received an auto gen addr event for an address we already have statically")
+	default:
+	}
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) {
+		t.Fatalf("Should have %s in the list of addresses", addr)
+	}
+
+	// Should not get an invalidation event after the PI's invalidation
+	// time.
+	select {
+	case <-ndpDisp.autoGenAddrC:
+		t.Fatal("unexpectedly received an auto gen addr event")
+	case <-time.After(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout):
+	}
+	if !containsV6Addr(s.NICInfo()[1].ProtocolAddresses, addr) { // Static address must outlive the PI's lifetime.
+		t.Fatalf("Should have %s in the list of addresses", addr)
+	}
+}
+
+// TestAutoGenAddrWithOpaqueIID verifies that, when the stack is configured
+// with opaque IID options, SLAAC addresses carry opaque interface identifiers.
+func TestAutoGenAddrWithOpaqueIID(t *testing.T) {
+	const nicID = 1
+	const nicName = "nic1"
+	var keyStorage [header.OpaqueIIDSecretKeyMinBytes]byte
+	secretKey := keyStorage[:]
+	nRead, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if nRead != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", nRead, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix1, subnet1, _ := prefixSubnetAddr(0, linkAddr1)
+	prefix2, subnet2, _ := prefixSubnetAddr(1, linkAddr1)
+	// wantAddr1 and wantAddr2 are the addresses the stack is expected to
+	// generate when it is configured to use opaque interface identifiers as
+	// defined by RFC 7217.
+	subnet1Bytes := []byte(subnet1.ID())
+	wantAddr1 := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(subnet1Bytes[:header.IIDOffsetInIPv6Address], subnet1, nicName, 0, secretKey)),
+		PrefixLen: 64,
+	}
+	subnet2Bytes := []byte(subnet2.ID())
+	wantAddr2 := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(subnet2Bytes[:header.IIDOffsetInIPv6Address], subnet2, nicName, 0, secretKey)),
+		PrefixLen: 64,
+	}
+
+	disp := &ndpDispatcher{
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+	}
+	linkEP := channel.New(0, 1280, linkAddr1)
+	st := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			HandleRAs:              true,
+			AutoGenGlobalAddresses: true,
+		},
+		NDPDisp: disp,
+		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+			NICNameFromID: func(_ tcpip.NICID, name string) string {
+				return name
+			},
+			SecretKey: secretKey,
+		},
+	})
+	nicOpts := stack.NICOptions{Name: nicName}
+	if err := st.CreateNICWithOptions(nicID, linkEP, nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v, _) = %s", nicID, nicOpts, err)
+	}
+
+	wantEvent := func(want tcpip.AddressWithPrefix, kind ndpAutoGenAddrEventType) {
+		t.Helper()
+		// Non-blocking: the event must already be pending on the channel.
+		select {
+		case ev := <-disp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(ev, want, kind); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	// First RA: a PI for prefix1 with a short valid lifetime.
+	const validLifetimeSecondPrefix1 = 1
+	linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix1, true, true, validLifetimeSecondPrefix1, 0))
+	wantEvent(wantAddr1, newAddr)
+	if !containsV6Addr(st.NICInfo()[nicID].ProtocolAddresses, wantAddr1) {
+		t.Fatalf("should have %s in the list of addresses", wantAddr1)
+	}
+
+	// Second RA: a PI for prefix2 with a large valid lifetime.
+	linkEP.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix2, true, true, 100, 0))
+	wantEvent(wantAddr2, newAddr)
+	if !containsV6Addr(st.NICInfo()[nicID].ProtocolAddresses, wantAddr1) {
+		t.Fatalf("should have %s in the list of addresses", wantAddr1)
+	}
+	if !containsV6Addr(st.NICInfo()[nicID].ProtocolAddresses, wantAddr2) {
+		t.Fatalf("should have %s in the list of addresses", wantAddr2)
+	}
+
+	// The prefix1 address should be invalidated once its lifetime is up.
+	select {
+	case ev := <-disp.autoGenAddrC:
+		if diff := checkAutoGenAddrEvent(ev, wantAddr1, invalidatedAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(validLifetimeSecondPrefix1*time.Second + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for addr auto gen event")
+	}
+	if containsV6Addr(st.NICInfo()[nicID].ProtocolAddresses, wantAddr1) {
+		t.Fatalf("should not have %s in the list of addresses", wantAddr1)
+	}
+	if !containsV6Addr(st.NICInfo()[nicID].ProtocolAddresses, wantAddr2) {
+		t.Fatalf("should have %s in the list of addresses", wantAddr2)
+	}
+}
+
+func TestAutoGenAddrInResponseToDADConflicts(t *testing.T) {
+	const nicID = 1
+	const nicName = "nic"
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const maxMaxRetries = 3
+	const lifetimeSeconds = 10
+
+	// Needed for the temporary address sub test.
+	savedMaxDesync := stack.MaxDesyncFactor
+	defer func() {
+		stack.MaxDesyncFactor = savedMaxDesync
+	}()
+	stack.MaxDesyncFactor = time.Nanosecond
+
+	var secretKeyBuf [header.OpaqueIIDSecretKeyMinBytes]byte
+	secretKey := secretKeyBuf[:]
+	n, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	addrForSubnet := func(subnet tcpip.Subnet, dadCounter uint8) tcpip.AddressWithPrefix {
+		addrBytes := []byte(subnet.ID())
+		return tcpip.AddressWithPrefix{
+			Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, dadCounter, secretKey)),
+			PrefixLen: 64,
+		}
+	}
+
+	expectAutoGenAddrEvent := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	expectAutoGenAddrEventAsync := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.AddressWithPrefix, eventType ndpAutoGenAddrEventType) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(e, addr, eventType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(defaultAsyncPositiveEventTimeout):
+			t.Fatal("timed out waiting for addr auto gen event")
+		}
+	}
+
+	expectDADEvent := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.Address, resolved bool) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.dadC:
+			if diff := checkDADEvent(e, nicID, addr, resolved, nil); diff != "" {
+				t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected DAD event")
+		}
+	}
+
+	expectDADEventAsync := func(t *testing.T, ndpDisp *ndpDispatcher, addr tcpip.Address, resolved bool) {
+		t.Helper()
+
+		select {
+		case e := <-ndpDisp.dadC:
+			if diff := checkDADEvent(e, nicID, addr, resolved, nil); diff != "" {
+				t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout):
+			t.Fatal("timed out waiting for DAD event")
+		}
+	}
+
+	stableAddrForTempAddrTest := addrForSubnet(subnet, 0)
+
+	addrTypes := []struct {
+		name             string
+		ndpConfigs       stack.NDPConfigurations
+		autoGenLinkLocal bool
+		prepareFn        func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix
+		addrGenFn        func(dadCounter uint8, tempIIDHistory []byte) tcpip.AddressWithPrefix
+	}{
+		{
+			name: "Global address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits: dadTransmits,
+				RetransmitTimer:        retransmitTimer,
+				HandleRAs:              true,
+				AutoGenGlobalAddresses: true,
+			},
+			prepareFn: func(_ *testing.T, _ *ndpDispatcher, e *channel.Endpoint, _ []byte) []tcpip.AddressWithPrefix {
+				// Receive an RA with prefix1 in a PI.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+				return nil
+
+			},
+			addrGenFn: func(dadCounter uint8, _ []byte) tcpip.AddressWithPrefix {
+				return addrForSubnet(subnet, dadCounter)
+			},
+		},
+		{
+			name: "LinkLocal address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits: dadTransmits,
+				RetransmitTimer:        retransmitTimer,
+			},
+			autoGenLinkLocal: true,
+			prepareFn: func(*testing.T, *ndpDispatcher, *channel.Endpoint, []byte) []tcpip.AddressWithPrefix {
+				return nil
+			},
+			addrGenFn: func(dadCounter uint8, _ []byte) tcpip.AddressWithPrefix {
+				return addrForSubnet(header.IPv6LinkLocalPrefix.Subnet(), dadCounter)
+			},
+		},
+		{
+			name: "Temporary address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:     dadTransmits,
+				RetransmitTimer:            retransmitTimer,
+				HandleRAs:                  true,
+				AutoGenGlobalAddresses:     true,
+				AutoGenTempGlobalAddresses: true,
+			},
+			prepareFn: func(t *testing.T, ndpDisp *ndpDispatcher, e *channel.Endpoint, tempIIDHistory []byte) []tcpip.AddressWithPrefix {
+				header.InitialTempIID(tempIIDHistory, nil, nicID)
+
+				// Generate a stable SLAAC address so temporary addresses will be
+				// generated.
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, 100, 100))
+				expectAutoGenAddrEvent(t, ndpDisp, stableAddrForTempAddrTest, newAddr)
+				expectDADEventAsync(t, ndpDisp, stableAddrForTempAddrTest.Address, true)
+
+				// The stable address will be assigned throughout the test.
+				return []tcpip.AddressWithPrefix{stableAddrForTempAddrTest}
+			},
+			addrGenFn: func(_ uint8, tempIIDHistory []byte) tcpip.AddressWithPrefix {
+				return header.GenerateTempIPv6SLAACAddr(tempIIDHistory, stableAddrForTempAddrTest.Address)
+			},
+		},
+	}
+
+	for _, addrType := range addrTypes {
+		// This Run will not return until the parallel tests finish.
+		//
+		// We need this because we need to do some teardown work after the parallel
+		// tests complete and limit the number of parallel tests running at the same
+		// time to reduce flakes.
+		//
+		// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+		// more details.
+		t.Run(addrType.name, func(t *testing.T) {
+			for maxRetries := uint8(0); maxRetries <= maxMaxRetries; maxRetries++ {
+				for numFailures := uint8(0); numFailures <= maxRetries+1; numFailures++ {
+					maxRetries := maxRetries
+					numFailures := numFailures
+					addrType := addrType
+
+					t.Run(fmt.Sprintf("%d max retries and %d failures", maxRetries, numFailures), func(t *testing.T) {
+						t.Parallel()
+
+						ndpDisp := ndpDispatcher{
+							dadC:         make(chan ndpDADEvent, 1),
+							autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+						}
+						e := channel.New(0, 1280, linkAddr1)
+						ndpConfigs := addrType.ndpConfigs
+						ndpConfigs.AutoGenAddressConflictRetries = maxRetries
+						s := stack.New(stack.Options{
+							NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+							AutoGenIPv6LinkLocal: addrType.autoGenLinkLocal,
+							NDPConfigs:           ndpConfigs,
+							NDPDisp:              &ndpDisp,
+							OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+								NICNameFromID: func(_ tcpip.NICID, nicName string) string {
+									return nicName
+								},
+								SecretKey: secretKey,
+							},
+						})
+						opts := stack.NICOptions{Name: nicName}
+						if err := s.CreateNICWithOptions(nicID, e, opts); err != nil {
+							t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
+						}
+
+						var tempIIDHistory [header.IIDSize]byte
+						stableAddrs := addrType.prepareFn(t, &ndpDisp, e, tempIIDHistory[:])
+
+						// Simulate DAD conflicts so the address is regenerated.
+						for i := uint8(0); i < numFailures; i++ {
+							addr := addrType.addrGenFn(i, tempIIDHistory[:])
+							expectAutoGenAddrEventAsync(t, &ndpDisp, addr, newAddr)
+
+							// Should not have any new addresses assigned to the NIC.
+							if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, stableAddrs, nil); mismatch != "" {
+								t.Fatal(mismatch)
+							}
+
+							// Simulate a DAD conflict.
+							if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+								t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+							}
+							expectAutoGenAddrEvent(t, &ndpDisp, addr, invalidatedAddr)
+							expectDADEvent(t, &ndpDisp, addr.Address, false)
+
+							// Attempting to add the address manually should not fail if the
+							// address's state was cleaned up when DAD failed.
+							if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr.Address); err != nil {
+								t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr.Address, err)
+							}
+							if err := s.RemoveAddress(nicID, addr.Address); err != nil {
+								t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr.Address, err)
+							}
+							expectDADEvent(t, &ndpDisp, addr.Address, false)
+						}
+
+						// Should not have any new addresses assigned to the NIC.
+						if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, stableAddrs, nil); mismatch != "" {
+							t.Fatal(mismatch)
+						}
+
+						// If we had less failures than generation attempts, we should have
+						// an address after DAD resolves.
+						if maxRetries+1 > numFailures {
+							addr := addrType.addrGenFn(numFailures, tempIIDHistory[:])
+							expectAutoGenAddrEventAsync(t, &ndpDisp, addr, newAddr)
+							expectDADEventAsync(t, &ndpDisp, addr.Address, true)
+							if mismatch := addressCheck(s.NICInfo()[nicID].ProtocolAddresses, append(stableAddrs, addr), nil); mismatch != "" {
+								t.Fatal(mismatch)
+							}
+						}
+
+						// Should not attempt address generation again.
+						select {
+						case e := <-ndpDisp.autoGenAddrC:
+							t.Fatalf("unexpectedly got an auto-generated address event = %+v", e)
+						case <-time.After(defaultAsyncNegativeEventTimeout):
+						}
+					})
+				}
+			}
+		})
+	}
+}
+
+// TestAutoGenAddrWithEUI64IIDNoDADRetries verifies that a SLAAC address whose
+// IID is derived from the NIC's link address is not regenerated after a DAD
+// conflict, even when AutoGenAddressConflictRetries is non-zero.
+func TestAutoGenAddrWithEUI64IIDNoDADRetries(t *testing.T) {
+	const (
+		nicID           = 1
+		dadTransmits    = 1
+		retransmitTimer = time.Second
+		maxRetries      = 3
+		lifetimeSeconds = 10
+	)
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	testCases := []struct {
+		name             string
+		ndpConfigs       stack.NDPConfigurations
+		autoGenLinkLocal bool
+		subnet           tcpip.Subnet
+		triggerSLAACFn   func(e *channel.Endpoint)
+	}{
+		{
+			name: "Global address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:        dadTransmits,
+				RetransmitTimer:               retransmitTimer,
+				HandleRAs:                     true,
+				AutoGenGlobalAddresses:        true,
+				AutoGenAddressConflictRetries: maxRetries,
+			},
+			subnet: subnet,
+			triggerSLAACFn: func(ep *channel.Endpoint) {
+				// An RA carrying prefix in a PI triggers SLAAC.
+				ep.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+			},
+		},
+		{
+			name: "LinkLocal address",
+			ndpConfigs: stack.NDPConfigurations{
+				DupAddrDetectTransmits:        dadTransmits,
+				RetransmitTimer:               retransmitTimer,
+				AutoGenAddressConflictRetries: maxRetries,
+			},
+			autoGenLinkLocal: true,
+			subnet:           header.IPv6LinkLocalPrefix.Subnet(),
+			triggerSLAACFn:   func(*channel.Endpoint) {},
+		},
+	}
+
+	for _, tc := range testCases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			disp := ndpDispatcher{
+				dadC:         make(chan ndpDADEvent, 1),
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+			}
+			ep := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: tc.autoGenLinkLocal,
+				NDPConfigs:           tc.ndpConfigs,
+				NDPDisp:              &disp,
+			})
+			if err := s.CreateNIC(nicID, ep); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			expectAutoGenAddrEvent := func(want tcpip.AddressWithPrefix, wantType ndpAutoGenAddrEventType) {
+				t.Helper()
+				select {
+				case got := <-disp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(got, want, wantType); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			}
+
+			tc.triggerSLAACFn(ep)
+
+			// The expected address is the subnet with linkAddr1's modified EUI-64 IID.
+			addrBytes := []byte(tc.subnet.ID())
+			header.EthernetAdddressToModifiedEUI64IntoBuf(linkAddr1, addrBytes[header.IIDOffsetInIPv6Address:])
+			addr := tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(addrBytes),
+				PrefixLen: 64,
+			}
+			expectAutoGenAddrEvent(addr, newAddr)
+
+			// Report a duplicate for the tentative address to fail DAD.
+			if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+				t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+			}
+			expectAutoGenAddrEvent(addr, invalidatedAddr)
+			select {
+			case got := <-disp.dadC:
+				if diff := checkDADEvent(got, nicID, addr.Address, false, nil); diff != "" {
+					t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+				}
+			default:
+				t.Fatal("expected DAD event")
+			}
+
+			// No regeneration attempt may follow the conflict.
+			select {
+			case got := <-disp.autoGenAddrC:
+				t.Fatalf("unexpectedly got an auto-generated address event = %+v", got)
+			case <-time.After(defaultAsyncNegativeEventTimeout):
+			}
+		})
+	}
+}
+
+// TestAutoGenAddrContinuesLifetimesAfterRetry verifies that regenerating an
+// address after a DAD conflict keeps counting down the original lifetimes.
+func TestAutoGenAddrContinuesLifetimesAfterRetry(t *testing.T) {
+	const (
+		nicID           = 1
+		nicName         = "nic"
+		dadTransmits    = 1
+		retransmitTimer = 2 * time.Second
+		failureTimer    = time.Second
+		maxRetries      = 1
+		lifetimeSeconds = 5
+	)
+
+	secretKey := make([]byte, header.OpaqueIIDSecretKeyMinBytes)
+	n, err := rand.Read(secretKey)
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("got rand.Read(_) = (%d, _), want = (%d, _)", n, header.OpaqueIIDSecretKeyMinBytes)
+	}
+
+	prefix, subnet, _ := prefixSubnetAddr(0, linkAddr1)
+
+	disp := ndpDispatcher{
+		dadC:         make(chan ndpDADEvent, 1),
+		autoGenAddrC: make(chan ndpAutoGenAddrEvent, 2),
+	}
+	ep := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits:        dadTransmits,
+			RetransmitTimer:               retransmitTimer,
+			HandleRAs:                     true,
+			AutoGenGlobalAddresses:        true,
+			AutoGenAddressConflictRetries: maxRetries,
+		},
+		NDPDisp: &disp,
+		OpaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{
+			NICNameFromID: func(_ tcpip.NICID, name string) string {
+				return name
+			},
+			SecretKey: secretKey,
+		},
+	})
+	opts := stack.NICOptions{Name: nicName}
+	if err := s.CreateNICWithOptions(nicID, ep, opts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, opts, err)
+	}
+
+	expectAutoGenAddrEvent := func(want tcpip.AddressWithPrefix, wantType ndpAutoGenAddrEventType) {
+		t.Helper()
+		select {
+		case got := <-disp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(got, want, wantType); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected addr auto gen event")
+		}
+	}
+
+	// Deliver an RA carrying prefix in a PI.
+	ep.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr2, 0, prefix, true, true, lifetimeSeconds, lifetimeSeconds))
+
+	addrBytes := []byte(subnet.ID())
+	addr := tcpip.AddressWithPrefix{
+		Address:   tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, 0, secretKey)),
+		PrefixLen: 64,
+	}
+	expectAutoGenAddrEvent(addr, newAddr)
+
+	// Let some of the lifetime elapse before reporting the DAD conflict.
+	time.Sleep(failureTimer)
+	if err := s.DupTentativeAddrDetected(nicID, addr.Address); err != nil {
+		t.Fatalf("s.DupTentativeAddrDetected(%d, %s): %s", nicID, addr.Address, err)
+	}
+	expectAutoGenAddrEvent(addr, invalidatedAddr)
+	select {
+	case got := <-disp.dadC:
+		if diff := checkDADEvent(got, nicID, addr.Address, false, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	default:
+		t.Fatal("expected DAD event")
+	}
+
+	// The retry uses DAD counter 1; wait for that address to resolve.
+	addr.Address = tcpip.Address(header.AppendOpaqueInterfaceIdentifier(addrBytes[:header.IIDOffsetInIPv6Address], subnet, nicName, 1, secretKey))
+	expectAutoGenAddrEvent(addr, newAddr)
+	select {
+	case got := <-disp.dadC:
+		if diff := checkDADEvent(got, nicID, addr.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for DAD event")
+	}
+
+	// Address should be deprecated/invalidated after the lifetime expires.
+	//
+	// Note, the remaining lifetime is calculated from when the PI was first
+	// processed. Since we wait for some time before simulating a DAD conflict
+	// and more time for the new address to resolve, the new address is only
+	// expected to be valid for the remaining time. The DAD conflict should
+	// not have reset the lifetimes.
+	//
+	// Accept either the invalidation event alone, or a deprecation event that
+	// is then followed by the invalidation event.
+	select {
+	case got := <-disp.autoGenAddrC:
+		if got.eventType != deprecatedAddr {
+			if diff := checkAutoGenAddrEvent(got, addr, invalidatedAddr); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+			break
+		}
+
+		if diff := checkAutoGenAddrEvent(got, addr, deprecatedAddr); diff != "" {
+			t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+		}
+		select {
+		case got := <-disp.autoGenAddrC:
+			if diff := checkAutoGenAddrEvent(got, addr, invalidatedAddr); diff != "" {
+				t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+			}
+		case <-time.After(defaultAsyncPositiveEventTimeout):
+			t.Fatal("timed out waiting for invalidated auto gen addr event after deprecation")
+		}
+	case <-time.After(lifetimeSeconds*time.Second - failureTimer - dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for auto gen addr event")
+	}
+}
+
+// TestNDPRecursiveDNSServerDispatch tests that we properly dispatch an event
+// to the integrator when an RA is received with the NDP Recursive DNS Server
+// option with at least one valid address.
+func TestNDPRecursiveDNSServerDispatch(t *testing.T) {
+	// Each option body is 2 leading bytes (reserved per RFC 8106), a 4-byte
+	// lifetime in seconds, then the 16-byte addresses. A nil expected means no event.
+	tests := []struct {
+		name     string
+		opt      header.NDPRecursiveDNSServer
+		expected *ndpRDNSS
+	}{
+		{
+			name: "Unspecified",
+			opt: header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			}),
+			expected: nil,
+		},
+		{
+			name: "Multicast",
+			opt: header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+			}),
+			expected: nil,
+		},
+		{
+			name: "OptionTooSmall",
+			opt: header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8,
+			}),
+			expected: nil,
+		},
+		{
+			name: "0Addresses",
+			opt: header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+			}),
+			expected: nil,
+		},
+		{
+			name: "Valid1Address",
+			opt: header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+			}),
+			expected: &ndpRDNSS{
+				addrs: []tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+				},
+				lifetime: 2 * time.Second,
+			},
+		},
+		{
+			name: "Valid2Addresses",
+			opt: header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+			}),
+			expected: &ndpRDNSS{
+				addrs: []tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+				},
+				lifetime: time.Second,
+			},
+		},
+		{
+			name: "Valid3Addresses",
+			opt: header.NDPRecursiveDNSServer([]byte{
+				0, 0,
+				0, 0, 0, 0,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 1,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 2,
+				1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 3,
+			}),
+			expected: &ndpRDNSS{
+				addrs: []tcpip.Address{
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x01",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x02",
+					"\x01\x02\x03\x04\x05\x06\x07\x08\x00\x00\x00\x00\x00\x00\x00\x03",
+				},
+				lifetime: 0,
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				// At most one RDNSS event is expected at any time in this test.
+				rdnssC: make(chan ndpRDNSSEvent, 1),
+			}
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs:       stack.NDPConfigurations{HandleRAs: true},
+				NDPDisp:          &ndpDisp,
+			})
+			if err := s.CreateNIC(1, e); err != nil {
+				t.Fatalf("CreateNIC(1) = %s", err)
+			}
+
+			e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, header.NDPOptionsSerializer{test.opt}))
+
+			// Non-blocking reads: any event must already be queued when InjectInbound returns.
+			if test.expected != nil {
+				select {
+				case e := <-ndpDisp.rdnssC:
+					if e.nicID != 1 {
+						t.Errorf("got rdnss nicID = %d, want = 1", e.nicID)
+					}
+					// cmp.Diff(want, got) so that the signs match the "(-want +got)" label.
+					if diff := cmp.Diff(test.expected.addrs, e.rdnss.addrs); diff != "" {
+						t.Errorf("rdnss addrs mismatch (-want +got):\n%s", diff)
+					}
+					if e.rdnss.lifetime != test.expected.lifetime {
+						t.Errorf("got rdnss lifetime = %s, want = %s", e.rdnss.lifetime, test.expected.lifetime)
+					}
+				default:
+					t.Fatal("expected an RDNSS option event")
+				}
+			}
+
+			select {
+			case e := <-ndpDisp.rdnssC:
+				t.Fatalf("unexpectedly got a new RDNSS option event: %+v", e)
+			default:
+			}
+		})
+	}
+}
+
+// TestNDPDNSSearchListDispatch tests that the integrator is informed when an
+// NDP DNS Search List option is received with at least one domain name in the
+// search list.
+func TestNDPDNSSearchListDispatch(t *testing.T) {
+	const nicID = 1
+
+	ndpDisp := ndpDispatcher{
+		dnsslC: make(chan ndpDNSSLEvent, 3),
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:       stack.NDPConfigurations{HandleRAs: true},
+		NDPDisp:          &ndpDisp,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	// Each option is 2 leading bytes (reserved per RFC 8106), a 4-byte lifetime in
+	// seconds, then domain names as length-prefixed labels, each name ending in 0.
+	optSer := header.NDPOptionsSerializer{
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 0, 0,
+			2, 'h', 'i',
+			0,
+		}),
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 0, 1,
+			1, 'i',
+			0,
+			2, 'a', 'm',
+			2, 'm', 'e',
+			0,
+		}),
+		header.NDPDNSSearchList([]byte{
+			0, 0,
+			0, 0, 1, 0,
+			3, 'x', 'y', 'z',
+			0,
+			5, 'h', 'e', 'l', 'l', 'o',
+			5, 'w', 'o', 'r', 'l', 'd',
+			0,
+			4, 't', 'h', 'i', 's',
+			2, 'i', 's',
+			1, 'a',
+			4, 't', 'e', 's', 't',
+			0,
+		}),
+	}
+	expected := []struct {
+		domainNames []string
+		lifetime    time.Duration
+	}{
+		{
+			domainNames: []string{
+				"hi",
+			},
+			lifetime: 0,
+		},
+		{
+			domainNames: []string{
+				"i",
+				"am.me",
+			},
+			lifetime: time.Second,
+		},
+		{
+			domainNames: []string{
+				"xyz",
+				"hello.world",
+				"this.is.a.test",
+			},
+			lifetime: 256 * time.Second,
+		},
+	}
+
+	e.InjectInbound(header.IPv6ProtocolNumber, raBufWithOpts(llAddr1, 0, optSer))
+
+	// Events are expected in the same order as the options in the RA.
+	for i, want := range expected {
+		select {
+		case dnssl := <-ndpDisp.dnsslC:
+			if dnssl.nicID != nicID {
+				t.Errorf("got %d-th dnssl nicID = %d, want = %d", i, dnssl.nicID, nicID)
+			}
+			// cmp.Diff(want, got) so that the signs match the "(-want +got)" label.
+			if diff := cmp.Diff(want.domainNames, dnssl.domainNames); diff != "" {
+				t.Errorf("%d-th dnssl domain names mismatch (-want +got):\n%s", i, diff)
+			}
+			if dnssl.lifetime != want.lifetime {
+				t.Errorf("got %d-th dnssl lifetime = %s, want = %s", i, dnssl.lifetime, want.lifetime)
+			}
+		default:
+			t.Fatal("expected a DNSSL event")
+		}
+	}
+
+	select {
+	case <-ndpDisp.dnsslC:
+		t.Fatal("unexpectedly got a DNSSL event")
+	default:
+	}
+}
+
+// TestCleanupNDPState tests that discovered routers and prefixes, and
+// auto-generated addresses are invalidated when forwarding is enabled or when
+// a NIC is disabled or removed.
+func TestCleanupNDPState(t *testing.T) {
+	const (
+		lifetimeSeconds          = 5
+		maxRouterAndPrefixEvents = 4
+		nicID1                   = 1
+		nicID2                   = 2
+	)
+
+	prefix1, subnet1, e1Addr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, subnet2, e1Addr2 := prefixSubnetAddr(1, linkAddr1)
+	e2Addr1 := addrForSubnet(subnet1, linkAddr2)
+	e2Addr2 := addrForSubnet(subnet2, linkAddr2)
+	llAddrWithPrefix1 := tcpip.AddressWithPrefix{
+		Address:   llAddr1,
+		PrefixLen: 64,
+	}
+	llAddrWithPrefix2 := tcpip.AddressWithPrefix{
+		Address:   llAddr2,
+		PrefixLen: 64,
+	}
+
+	tests := []struct {
+		name                 string
+		cleanupFn            func(t *testing.T, s *stack.Stack)
+		keepAutoGenLinkLocal bool
+		maxAutoGenAddrEvents int
+		skipFinalAddrCheck   bool
+	}{
+		// A NIC should still keep its auto-generated link-local address when
+		// becoming a router.
+		{
+			name: "Enable forwarding",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
+			keepAutoGenLinkLocal: true,
+			maxAutoGenAddrEvents: 4,
+		},
+
+		// A NIC should cleanup all NDP state when it is disabled.
+		{
+			name: "Disable NIC",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+
+				if err := s.DisableNIC(nicID1); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID1, err)
+				}
+				if err := s.DisableNIC(nicID2); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID2, err)
+				}
+			},
+			keepAutoGenLinkLocal: false,
+			maxAutoGenAddrEvents: 6,
+		},
+
+		// A NIC should cleanup all NDP state when it is removed.
+		{
+			name: "Remove NIC",
+			cleanupFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+
+				if err := s.RemoveNIC(nicID1); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID1, err)
+				}
+				if err := s.RemoveNIC(nicID2); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID2, err)
+				}
+			},
+			keepAutoGenLinkLocal: false,
+			maxAutoGenAddrEvents: 6,
+			// The NICs are removed so we can't check their addresses after calling
+			// cleanupFn.
+			skipFinalAddrCheck: true,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			ndpDisp := ndpDispatcher{
+				routerC:        make(chan ndpRouterEvent, maxRouterAndPrefixEvents),
+				rememberRouter: true,
+				prefixC:        make(chan ndpPrefixEvent, maxRouterAndPrefixEvents),
+				rememberPrefix: true,
+				autoGenAddrC:   make(chan ndpAutoGenAddrEvent, test.maxAutoGenAddrEvents),
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: true,
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:              true,
+					DiscoverDefaultRouters: true,
+					DiscoverOnLinkPrefixes: true,
+					AutoGenGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDisp,
+			})
+
+			// The helpers below poll the event channels without blocking.
+			expectRouterEvent := func() (bool, ndpRouterEvent) {
+				select {
+				case e := <-ndpDisp.routerC:
+					return true, e
+				default:
+					return false, ndpRouterEvent{}
+				}
+			}
+
+			expectPrefixEvent := func() (bool, ndpPrefixEvent) {
+				select {
+				case e := <-ndpDisp.prefixC:
+					return true, e
+				default:
+					return false, ndpPrefixEvent{}
+				}
+			}
+
+			expectAutoGenAddrEvent := func() (bool, ndpAutoGenAddrEvent) {
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					return true, e
+				default:
+					return false, ndpAutoGenAddrEvent{}
+				}
+			}
+
+			e1 := channel.New(0, 1280, linkAddr1)
+			if err := s.CreateNIC(nicID1, e1); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID1, err)
+			}
+			// We have other tests that make sure we receive the *correct* events
+			// on normal discovery of routers/prefixes, and auto-generated
+			// addresses. Here we just make sure we get an event and let other tests
+			// handle the correctness check.
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", llAddrWithPrefix1, nicID1)
+			}
+
+			e2 := channel.New(0, 1280, linkAddr2)
+			if err := s.CreateNIC(nicID2, e2); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID2, err)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", llAddrWithPrefix2, nicID2)
+			}
+
+			// Receive RAs on NIC(1) and NIC(2) from default routers (llAddr3 and
+			// llAddr4) w/ PI (for prefix1 in RA from llAddr3 and prefix2 in RA from
+			// llAddr4) to discover multiple routers and prefixes, and auto-gen
+			// multiple addresses.
+
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr1, nicID1)
+			}
+
+			e1.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID1)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID1)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e1Addr2, nicID1)
+			}
+
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, lifetimeSeconds, prefix1, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr3, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix1, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr1, nicID2)
+			}
+
+			e2.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr4, lifetimeSeconds, prefix2, true, true, lifetimeSeconds, lifetimeSeconds))
+			if ok, _ := expectRouterEvent(); !ok {
+				t.Errorf("expected router event for %s on NIC(%d)", llAddr4, nicID2)
+			}
+			if ok, _ := expectPrefixEvent(); !ok {
+				t.Errorf("expected prefix event for %s on NIC(%d)", prefix2, nicID2)
+			}
+			if ok, _ := expectAutoGenAddrEvent(); !ok {
+				t.Errorf("expected auto-gen addr event for %s on NIC(%d)", e2Addr2, nicID2)
+			}
+
+			// We should have the auto-generated addresses added.
+			nicinfo := s.NICInfo()
+			nic1Addrs := nicinfo[nicID1].ProtocolAddresses
+			nic2Addrs := nicinfo[nicID2].ProtocolAddresses
+			if !containsV6Addr(nic1Addrs, llAddrWithPrefix1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic1Addrs, e1Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, llAddrWithPrefix2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr1) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+			}
+			if !containsV6Addr(nic2Addrs, e2Addr2) {
+				t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+			}
+
+			// We can't proceed any further if we already failed the test (missing
+			// some discovery/auto-generated address events or addresses).
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			test.cleanupFn(t, s)
+
+			// Collect invalidation events after having NDP state cleaned up.
+			gotRouterEvents := make(map[ndpRouterEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectRouterEvent()
+				if !ok {
+					t.Errorf("expected %d router events after NDP state cleanup; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotRouterEvents[e]++
+			}
+			gotPrefixEvents := make(map[ndpPrefixEvent]int)
+			for i := 0; i < maxRouterAndPrefixEvents; i++ {
+				ok, e := expectPrefixEvent()
+				if !ok {
+					t.Errorf("expected %d prefix events after NDP state cleanup; got = %d", maxRouterAndPrefixEvents, i)
+					break
+				}
+				gotPrefixEvents[e]++
+			}
+			gotAutoGenAddrEvents := make(map[ndpAutoGenAddrEvent]int)
+			for i := 0; i < test.maxAutoGenAddrEvents; i++ {
+				ok, e := expectAutoGenAddrEvent()
+				if !ok {
+					t.Errorf("expected %d auto-generated address events after NDP state cleanup; got = %d", test.maxAutoGenAddrEvents, i)
+					break
+				}
+				gotAutoGenAddrEvents[e]++
+			}
+
+			// No need to proceed any further if we already failed the test (missing
+			// some invalidation events).
+			if t.Failed() {
+				t.FailNow()
+			}
+
+			expectedRouterEvents := map[ndpRouterEvent]int{
+				{nicID: nicID1, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID1, addr: llAddr4, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr3, discovered: false}: 1,
+				{nicID: nicID2, addr: llAddr4, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedRouterEvents, gotRouterEvents); diff != "" {
+				t.Errorf("router events mismatch (-want +got):\n%s", diff)
+			}
+			expectedPrefixEvents := map[ndpPrefixEvent]int{
+				{nicID: nicID1, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID1, prefix: subnet2, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet1, discovered: false}: 1,
+				{nicID: nicID2, prefix: subnet2, discovered: false}: 1,
+			}
+			if diff := cmp.Diff(expectedPrefixEvents, gotPrefixEvents); diff != "" {
+				t.Errorf("prefix events mismatch (-want +got):\n%s", diff)
+			}
+			expectedAutoGenAddrEvents := map[ndpAutoGenAddrEvent]int{
+				{nicID: nicID1, addr: e1Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID1, addr: e1Addr2, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr1, eventType: invalidatedAddr}: 1,
+				{nicID: nicID2, addr: e2Addr2, eventType: invalidatedAddr}: 1,
+			}
+
+			if !test.keepAutoGenLinkLocal {
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID1, addr: llAddrWithPrefix1, eventType: invalidatedAddr}] = 1
+				expectedAutoGenAddrEvents[ndpAutoGenAddrEvent{nicID: nicID2, addr: llAddrWithPrefix2, eventType: invalidatedAddr}] = 1
+			}
+
+			if diff := cmp.Diff(expectedAutoGenAddrEvents, gotAutoGenAddrEvents); diff != "" {
+				t.Errorf("auto-generated address events mismatch (-want +got):\n%s", diff)
+			}
+
+			if !test.skipFinalAddrCheck {
+				// Make sure the auto-generated addresses got removed.
+				nicinfo = s.NICInfo()
+				nic1Addrs = nicinfo[nicID1].ProtocolAddresses
+				nic2Addrs = nicinfo[nicID2].ProtocolAddresses
+				if containsV6Addr(nic1Addrs, llAddrWithPrefix1) != test.keepAutoGenLinkLocal {
+					if test.keepAutoGenLinkLocal {
+						t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+					} else {
+						t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix1, nicID1, nic1Addrs)
+					}
+				}
+				if containsV6Addr(nic1Addrs, e1Addr1) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr1, nicID1, nic1Addrs)
+				}
+				if containsV6Addr(nic1Addrs, e1Addr2) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e1Addr2, nicID1, nic1Addrs)
+				}
+				if containsV6Addr(nic2Addrs, llAddrWithPrefix2) != test.keepAutoGenLinkLocal {
+					if test.keepAutoGenLinkLocal {
+						t.Errorf("missing %s from the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+					} else {
+						t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", llAddrWithPrefix2, nicID2, nic2Addrs)
+					}
+				}
+				if containsV6Addr(nic2Addrs, e2Addr1) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr1, nicID2, nic2Addrs)
+				}
+				if containsV6Addr(nic2Addrs, e2Addr2) {
+					t.Errorf("still have %s in the list of addresses for NIC(%d): %+v", e2Addr2, nicID2, nic2Addrs)
+				}
+			}
+
+			// Should not get any more events (invalidation timers should have been
+			// cancelled when the NDP state was cleaned up).
+			time.Sleep(lifetimeSeconds*time.Second + defaultAsyncNegativeEventTimeout)
+			select {
+			case <-ndpDisp.routerC:
+				t.Error("unexpected router event")
+			default:
+			}
+			select {
+			case <-ndpDisp.prefixC:
+				t.Error("unexpected prefix event")
+			default:
+			}
+			select {
+			case <-ndpDisp.autoGenAddrC:
+				t.Error("unexpected auto-generated address event")
+			default:
+			}
+		})
+	}
+}
+
+// TestDHCPv6ConfigurationFromNDPDA verifies that the NDPDispatcher is notified
+// whenever a received Router Advertisement changes the effective DHCPv6
+// configuration, and is not notified when the configuration is unchanged.
+func TestDHCPv6ConfigurationFromNDPDA(t *testing.T) {
+	const nicID = 1
+
+	ndpDisp := &ndpDispatcher{
+		dhcpv6ConfigurationC: make(chan ndpDHCPv6Event, 1),
+		rememberRouter:       true,
+	}
+	e := channel.New(0, 1280, linkAddr1)
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:       stack.NDPConfigurations{HandleRAs: true},
+		NDPDisp:          ndpDisp,
+	})
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	// rxRA delivers an RA from llAddr2 carrying the given M (managed address)
+	// and O (other configurations) flags.
+	rxRA := func(managed, other bool) {
+		e.InjectInbound(header.IPv6ProtocolNumber, raBufWithDHCPv6(llAddr2, managed, other))
+	}
+
+	// wantEvent fails the test unless a DHCPv6 event carrying want is pending.
+	wantEvent := func(want stack.DHCPv6ConfigurationFromNDPRA) {
+		t.Helper()
+		select {
+		case got := <-ndpDisp.dhcpv6ConfigurationC:
+			if diff := cmp.Diff(ndpDHCPv6Event{nicID: nicID, configuration: want}, got, cmp.AllowUnexported(got)); diff != "" {
+				t.Errorf("dhcpv6 event mismatch (-want +got):\n%s", diff)
+			}
+		default:
+			t.Fatal("expected DHCPv6 configuration event")
+		}
+	}
+
+	// wantNoEvent fails the test if any DHCPv6 event is pending.
+	wantNoEvent := func() {
+		t.Helper()
+		select {
+		case <-ndpDisp.dhcpv6ConfigurationC:
+			t.Fatal("unexpected DHCPv6 configuration event")
+		default:
+		}
+	}
+
+	// The first RA must produce an event even though no DHCPv6 is advertised.
+	rxRA(false, false)
+	wantEvent(stack.DHCPv6NoConfiguration)
+	// A repeat of the same RA must not produce another event.
+	rxRA(false, false)
+	wantNoEvent()
+
+	// O flag only: the configuration becomes Other Configurations.
+	rxRA(false, true)
+	wantEvent(stack.DHCPv6OtherConfigurations)
+	rxRA(false, true)
+	wantNoEvent()
+
+	// M flag only: the configuration becomes Managed Address.
+	rxRA(true, false)
+	wantEvent(stack.DHCPv6ManagedAddress)
+	rxRA(true, false)
+	wantNoEvent()
+
+	// Neither flag: the configuration goes back to none.
+	rxRA(false, false)
+	wantEvent(stack.DHCPv6NoConfiguration)
+	rxRA(false, false)
+	wantNoEvent()
+
+	// Both flags: the configuration becomes Managed Address.
+	//
+	// Note, when the M flag is set, the O flag is redundant.
+	rxRA(true, true)
+	wantEvent(stack.DHCPv6ManagedAddress)
+	rxRA(true, true)
+	wantNoEvent()
+	// Even though the DHCPv6 flags are different, the effective configuration is
+	// the same so we should not receive a new event.
+	rxRA(true, false)
+	wantNoEvent()
+	rxRA(true, true)
+	wantNoEvent()
+
+	// O flag only: the configuration becomes Other Configurations.
+	rxRA(false, true)
+	wantEvent(stack.DHCPv6OtherConfigurations)
+	rxRA(false, true)
+	wantNoEvent()
+
+	// Cycling the NIC should cause the last DHCPv6 configuration to be cleared.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+
+	// The same configuration as before the cycle is reported as a new event.
+	rxRA(false, true)
+	wantEvent(stack.DHCPv6OtherConfigurations)
+	rxRA(false, true)
+	wantNoEvent()
+}
+
+// TestRouterSolicitation checks the count, timing and contents of the Router
+// Solicitations sent after a NIC is created, for several NDP configurations.
+func TestRouterSolicitation(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name                        string
+		linkHeaderLen               uint16
+		linkAddr                    tcpip.LinkAddress
+		nicAddr                     tcpip.Address
+		expectedSrcAddr             tcpip.Address
+		expectedNDPOpts             []header.NDPOption
+		maxRtrSolicit               uint8
+		rtrSolicitInt               time.Duration
+		effectiveRtrSolicitInt      time.Duration
+		maxRtrSolicitDelay          time.Duration
+		effectiveMaxRtrSolicitDelay time.Duration
+	}{
+		{
+			name:                        "Single RS with 2s delay and interval",
+			expectedSrcAddr:             header.IPv6Any,
+			maxRtrSolicit:               1,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
+			maxRtrSolicitDelay:          2 * time.Second,
+			effectiveMaxRtrSolicitDelay: 2 * time.Second,
+		},
+		{
+			name:                        "Single RS with 4s delay and interval",
+			expectedSrcAddr:             header.IPv6Any,
+			maxRtrSolicit:               1,
+			rtrSolicitInt:               4 * time.Second,
+			effectiveRtrSolicitInt:      4 * time.Second,
+			maxRtrSolicitDelay:          4 * time.Second,
+			effectiveMaxRtrSolicitDelay: 4 * time.Second,
+		},
+		{
+			name:                        "Two RS with delay",
+			linkHeaderLen:               1,
+			nicAddr:                     llAddr1,
+			expectedSrcAddr:             llAddr1,
+			maxRtrSolicit:               2,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
+			maxRtrSolicitDelay:          500 * time.Millisecond,
+			effectiveMaxRtrSolicitDelay: 500 * time.Millisecond,
+		},
+		{
+			name:            "Single RS without delay",
+			linkHeaderLen:   2,
+			linkAddr:        linkAddr1,
+			nicAddr:         llAddr1,
+			expectedSrcAddr: llAddr1,
+			expectedNDPOpts: []header.NDPOption{
+				header.NDPSourceLinkLayerAddressOption(linkAddr1),
+			},
+			maxRtrSolicit:               1,
+			rtrSolicitInt:               2 * time.Second,
+			effectiveRtrSolicitInt:      2 * time.Second,
+			maxRtrSolicitDelay:          0,
+			effectiveMaxRtrSolicitDelay: 0,
+		},
+		{
+			name:                        "Two RS without delay and invalid zero interval",
+			linkHeaderLen:               3,
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
+			maxRtrSolicit:               2,
+			rtrSolicitInt:               0,
+			effectiveRtrSolicitInt:      4 * time.Second,
+			maxRtrSolicitDelay:          0,
+			effectiveMaxRtrSolicitDelay: 0,
+		},
+		{
+			name:                        "Three RS without delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
+			maxRtrSolicit:               3,
+			rtrSolicitInt:               500 * time.Millisecond,
+			effectiveRtrSolicitInt:      500 * time.Millisecond,
+			maxRtrSolicitDelay:          0,
+			effectiveMaxRtrSolicitDelay: 0,
+		},
+		{
+			name:                        "Two RS with invalid negative delay",
+			linkAddr:                    linkAddr1,
+			expectedSrcAddr:             header.IPv6Any,
+			maxRtrSolicit:               2,
+			rtrSolicitInt:               time.Second,
+			effectiveRtrSolicitInt:      time.Second,
+			maxRtrSolicitDelay:          -3 * time.Second,
+			effectiveMaxRtrSolicitDelay: time.Second,
+		},
+	}
+
+	// This Run will not return until the parallel tests finish.
+	//
+	// We need this because we need to do some teardown work after the
+	// parallel tests complete.
+	//
+	// See https://godoc.org/testing#hdr-Subtests_and_Sub_benchmarks for
+	// more details.
+	t.Run("group", func(t *testing.T) {
+		for _, test := range tests {
+			test := test
+
+			t.Run(test.name, func(t *testing.T) {
+				t.Parallel()
+
+				e := channelLinkWithHeaderLength{
+					Endpoint:     channel.New(int(test.maxRtrSolicit), 1280, test.linkAddr),
+					headerLength: test.linkHeaderLen,
+				}
+				e.Endpoint.LinkEPCapabilities |= stack.CapabilityResolutionRequired
+				waitForPkt := func(timeout time.Duration) {
+					t.Helper()
+					ctx, cancel := context.WithTimeout(context.Background(), timeout)
+					defer cancel()
+					p, ok := e.ReadContext(ctx)
+					if !ok {
+						t.Fatal("timed out waiting for packet")
+						return
+					}
+
+					if p.Proto != header.IPv6ProtocolNumber {
+						t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+					}
+
+					// Make sure the right remote link address is used.
+					if want := header.EthernetAddressFromMulticastIPv6Address(header.IPv6AllRoutersMulticastAddress); p.Route.RemoteLinkAddress != want {
+						t.Errorf("got remote link address = %s, want = %s", p.Route.RemoteLinkAddress, want)
+					}
+
+					checker.IPv6(t,
+						p.Pkt.Header.View(),
+						checker.SrcAddr(test.expectedSrcAddr),
+						checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+						checker.TTL(header.NDPHopLimit),
+						checker.NDPRS(checker.NDPRSOptions(test.expectedNDPOpts)),
+					)
+
+					if l, want := p.Pkt.Header.AvailableLength(), int(test.linkHeaderLen); l != want {
+						t.Errorf("got p.Pkt.Header.AvailableLength() = %d; want = %d", l, want)
+					}
+				}
+				waitForNothing := func(timeout time.Duration) {
+					t.Helper()
+					ctx, cancel := context.WithTimeout(context.Background(), timeout)
+					defer cancel()
+					if _, ok := e.ReadContext(ctx); ok {
+						t.Fatal("unexpectedly got a packet")
+					}
+				}
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+					NDPConfigs: stack.NDPConfigurations{
+						MaxRtrSolicitations:     test.maxRtrSolicit,
+						RtrSolicitationInterval: test.rtrSolicitInt,
+						MaxRtrSolicitationDelay: test.maxRtrSolicitDelay,
+					},
+				})
+				if err := s.CreateNIC(nicID, &e); err != nil {
+					t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+				}
+
+				if addr := test.nicAddr; addr != "" {
+					if err := s.AddAddress(nicID, header.IPv6ProtocolNumber, addr); err != nil {
+						t.Fatalf("AddAddress(%d, %d, %s) = %s", nicID, header.IPv6ProtocolNumber, addr, err)
+					}
+				}
+
+				// Make sure each RS is sent at the right time.
+				remaining := test.maxRtrSolicit
+				if remaining > 0 {
+					waitForPkt(test.effectiveMaxRtrSolicitDelay + defaultAsyncPositiveEventTimeout)
+					remaining--
+				}
+
+				for ; remaining > 0; remaining-- {
+					if test.effectiveRtrSolicitInt > defaultAsyncPositiveEventTimeout {
+						waitForNothing(test.effectiveRtrSolicitInt - defaultAsyncNegativeEventTimeout)
+						waitForPkt(defaultAsyncPositiveEventTimeout)
+					} else {
+						waitForPkt(test.effectiveRtrSolicitInt + defaultAsyncPositiveEventTimeout)
+					}
+				}
+
+				// Make sure no more RSs are sent after the expected number went out.
+				if test.effectiveRtrSolicitInt > test.effectiveMaxRtrSolicitDelay {
+					waitForNothing(test.effectiveRtrSolicitInt + defaultAsyncNegativeEventTimeout)
+				} else {
+					waitForNothing(test.effectiveMaxRtrSolicitDelay + defaultAsyncNegativeEventTimeout)
+				}
+
+				// Make sure the sent RouterSolicit counter matches the number of RSs the
+				// test expected.
+				if got, want := s.Stats().ICMP.V6PacketsSent.RouterSolicit.Value(), uint64(test.maxRtrSolicit); got != want {
+					t.Fatalf("got sent RouterSolicit = %d, want = %d", got, want)
+				}
+			})
+		}
+	})
+}
+
+// TestStopStartSolicitingRouters tests that router solicitations stop when
+// forwarding is enabled or the NIC is disabled or removed, and that they start
+// again when forwarding is disabled or the NIC is re-enabled.
+func TestStopStartSolicitingRouters(t *testing.T) {
+	const nicID = 1
+	const delay = 0
+	const interval = 500 * time.Millisecond
+	const maxRtrSolicitations = 3
+
+	tests := []struct {
+		name    string
+		startFn func(t *testing.T, s *stack.Stack)
+		// first is used to tell stopFn that it is being called for the first time
+		// after router solicitations were last enabled.
+		stopFn func(t *testing.T, s *stack.Stack, first bool)
+	}{
+		// Tests that when forwarding is enabled or disabled, router solicitations
+		// are stopped or started, respectively.
+		{
+			name: "Enable and disable forwarding",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				s.SetForwarding(false)
+			},
+			stopFn: func(t *testing.T, s *stack.Stack, _ bool) {
+				t.Helper()
+				s.SetForwarding(true)
+			},
+		},
+
+		// Tests that when a NIC is enabled or disabled, router solicitations
+		// are started or stopped, respectively.
+		{
+			name: "Enable and disable NIC",
+			startFn: func(t *testing.T, s *stack.Stack) {
+				t.Helper()
+				if err := s.EnableNIC(nicID); err != nil {
+					t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+				}
+			},
+			stopFn: func(t *testing.T, s *stack.Stack, _ bool) {
+				t.Helper()
+				if err := s.DisableNIC(nicID); err != nil {
+					t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+				}
+			},
+		},
+
+		// Tests that when a NIC is removed, router solicitations are stopped. We
+		// cannot start router solicitations on a removed NIC.
+		{
+			name: "Remove NIC",
+			stopFn: func(t *testing.T, s *stack.Stack, first bool) {
+				t.Helper()
+				// Only try to remove the NIC the first time stopFn is called since it's
+				// impossible to remove an already removed NIC.
+				if !first {
+					return
+				}
+
+				if err := s.RemoveNIC(nicID); err != nil {
+					t.Fatalf("s.RemoveNIC(%d): %s", nicID, err)
+				}
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(maxRtrSolicitations, 1280, linkAddr1)
+			waitForPkt := func(timeout time.Duration) {
+				t.Helper()
+
+				ctx, cancel := context.WithTimeout(context.Background(), timeout)
+				defer cancel()
+				p, ok := e.ReadContext(ctx)
+				if !ok {
+					t.Fatal("timed out waiting for packet")
+				}
+
+				if p.Proto != header.IPv6ProtocolNumber {
+					t.Fatalf("got Proto = %d, want = %d", p.Proto, header.IPv6ProtocolNumber)
+				}
+				checker.IPv6(t, p.Pkt.Header.View(),
+					checker.SrcAddr(header.IPv6Any),
+					checker.DstAddr(header.IPv6AllRoutersMulticastAddress),
+					checker.TTL(header.NDPHopLimit),
+					checker.NDPRS())
+			}
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					MaxRtrSolicitations:     maxRtrSolicitations,
+					RtrSolicitationInterval: interval,
+					MaxRtrSolicitationDelay: delay,
+				},
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+
+			// Stop soliciting routers.
+			test.stopFn(t, s, true /* first */)
+			ctx, cancel := context.WithTimeout(context.Background(), delay+defaultAsyncNegativeEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				// A single RS may have been sent before solicitations were stopped.
+				ctx, cancel := context.WithTimeout(context.Background(), interval+defaultAsyncNegativeEventTimeout)
+				defer cancel()
+				if _, ok = e.ReadContext(ctx); ok {
+					t.Fatal("should not have sent more than one RS message")
+				}
+			}
+
+			// Stopping router solicitations after it has already been stopped should
+			// do nothing.
+			test.stopFn(t, s, false /* first */)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncNegativeEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after router solicitation has been stopped")
+			}
+
+			// If test.startFn is nil, there is no way to restart router solicitations.
+			if test.startFn == nil {
+				return
+			}
+
+			// Start soliciting routers.
+			test.startFn(t, s)
+			waitForPkt(delay + defaultAsyncPositiveEventTimeout)
+			waitForPkt(interval + defaultAsyncPositiveEventTimeout)
+			waitForPkt(interval + defaultAsyncPositiveEventTimeout)
+			ctx, cancel = context.WithTimeout(context.Background(), interval+defaultAsyncNegativeEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got an extra packet after sending out the expected RSs")
+			}
+
+			// Starting router solicitations after it has already completed should do
+			// nothing.
+			test.startFn(t, s)
+			ctx, cancel = context.WithTimeout(context.Background(), delay+defaultAsyncNegativeEventTimeout)
+			defer cancel()
+			if _, ok := e.ReadContext(ctx); ok {
+				t.Fatal("unexpectedly got a packet after finishing router solicitations")
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
new file mode 100644
index 000000000..7b80534e6
--- /dev/null
+++ b/pkg/tcpip/stack/nic.go
@@ -0,0 +1,1743 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"reflect"
+	"sort"
+	"strings"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// ipv4BroadcastAddr is the IPv4 broadcast address with a prefix covering the
+// whole address. enable adds it to a NIC so broadcast packets can be received;
+// disableLocked removes it again.
+var ipv4BroadcastAddr = tcpip.ProtocolAddress{
+	Protocol:          header.IPv4ProtocolNumber,
+	AddressWithPrefix: tcpip.AddressWithPrefix{Address: header.IPv4Broadcast, PrefixLen: 8 * header.IPv4AddressSize},
+}
+
+// NIC represents a "network interface card" to which the networking stack is
+// attached. The fields grouped under mu are guarded by its embedded RWMutex.
+type NIC struct {
+	stack   *Stack
+	id      tcpip.NICID
+	name    string
+	linkEP  LinkEndpoint
+	context NICContext
+
+	stats NICStats
+
+	mu struct {
+		sync.RWMutex
+		enabled       bool // set by enable, cleared by disableLocked
+		spoofing      bool // set by setSpoofing
+		promiscuous   bool // set by setPromiscuousMode
+		primary       map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint
+		endpoints     map[NetworkEndpointID]*referencedNetworkEndpoint
+		addressRanges []tcpip.Subnet
+		mcastJoins    map[NetworkEndpointID]uint32
+		// packetEPs is protected by mu, but the contained PacketEndpoint
+		// values are not.
+		packetEPs map[tcpip.NetworkProtocolNumber][]PacketEndpoint
+		ndp       ndpState
+	}
+}
+
+// NICStats holds a NIC's transmitted and received traffic counters.
+type NICStats struct {
+	Tx DirectionStats // transmitted traffic
+	Rx DirectionStats // received traffic
+
+	DisabledRx DirectionStats // NOTE(review): presumably traffic received while the NIC was disabled — confirm
+}
+
+func makeNICStats() NICStats {
+	var s NICStats
+	tcpip.InitStatCounters(reflect.ValueOf(&s).Elem())
+	return s
+}
+
+// DirectionStats holds the packet and byte counters for one traffic direction.
+type DirectionStats struct {
+	Packets *tcpip.StatCounter
+	Bytes   *tcpip.StatCounter
+}
+
+// PrimaryEndpointBehavior enumerates how an endpoint may serve as a primary endpoint.
+type PrimaryEndpointBehavior int
+
+const (
+	// CanBePrimaryEndpoint indicates the endpoint can be used as a primary
+	// endpoint for new connections with no local address. This is the
+	// default when calling NIC.AddAddress.
+	CanBePrimaryEndpoint PrimaryEndpointBehavior = iota
+
+	// FirstPrimaryEndpoint indicates the endpoint should be the first
+	// primary endpoint considered. If there are multiple endpoints with
+	// this behavior, the most recently-added one will be first.
+	FirstPrimaryEndpoint
+
+	// NeverPrimaryEndpoint indicates the endpoint should never be a
+	// primary endpoint.
+	NeverPrimaryEndpoint
+)
+
+// newNIC creates a NIC named name with the given id on top of ep, seeds its NDP
+// state with the stack's default NDP configurations and attaches it to ep.
+func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, ctx NICContext) *NIC {
+	// TODO(b/141011931): Validate a LinkEndpoint (ep) is valid. For
+	// example, make sure that the link address it provides is a valid
+	// unicast ethernet address.
+
+	// TODO(b/143357959): RFC 8200 section 5 requires that IPv6 endpoints
+	// observe an MTU of at least 1280 bytes. Ensure that this requirement
+	// of IPv6 is supported on this endpoint's LinkEndpoint.
+
+	n := &NIC{
+		stack:   stack,
+		id:      id,
+		name:    name,
+		linkEP:  ep,
+		context: ctx,
+		stats:   makeNICStats(),
+	}
+	n.mu.primary = map[tcpip.NetworkProtocolNumber][]*referencedNetworkEndpoint{}
+	n.mu.endpoints = map[NetworkEndpointID]*referencedNetworkEndpoint{}
+	n.mu.mcastJoins = map[NetworkEndpointID]uint32{}
+	n.mu.packetEPs = map[tcpip.NetworkProtocolNumber][]PacketEndpoint{}
+	n.mu.ndp = ndpState{
+		nic:            n,
+		configs:        stack.ndpConfigs,
+		dad:            map[tcpip.Address]dadState{},
+		defaultRouters: map[tcpip.Address]defaultRouterState{},
+		onLinkPrefixes: map[tcpip.Subnet]onLinkPrefixState{},
+		slaacPrefixes:  map[tcpip.Subnet]slaacPrefixState{},
+	}
+	n.mu.ndp.initializeTempAddrState()
+
+	// Register supported packet endpoint protocols.
+	for _, netProto := range header.Ethertypes {
+		n.mu.packetEPs[netProto] = []PacketEndpoint{}
+	}
+	for _, netProto := range stack.networkProtocols {
+		n.mu.packetEPs[netProto.Number()] = []PacketEndpoint{}
+	}
+
+	n.linkEP.Attach(n)
+	return n
+}
+
+// enabled reports whether n is currently enabled.
+func (n *NIC) enabled() bool {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	return n.mu.enabled
+}
+
+// disable disables n.
+//
+// It undoes the work done by enable. If n is already disabled, disable does
+// nothing and returns nil.
+func (n *NIC) disable() *tcpip.Error {
+	// Bail out early under the read lock when there is nothing to do;
+	// disableLocked repeats the check once the write lock is held.
+	if !n.enabled() {
+		return nil
+	}
+
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	return n.disableLocked()
+}
+
+// disableLocked disables n, undoing the work done by enable. It does nothing
+// if n is already disabled.
+//
+// n MUST be locked.
+func (n *NIC) disableLocked() *tcpip.Error {
+	if !n.mu.enabled {
+		return nil
+	}
+
+	// TODO(b/147015577): Should Routes that are currently bound to n be
+	// invalidated? Currently, Routes will continue to work when a NIC is enabled
+	// again, and applications may not know that the underlying NIC was ever
+	// disabled.
+
+	if _, hasIPv6 := n.stack.networkProtocols[header.IPv6ProtocolNumber]; hasIPv6 {
+		n.mu.ndp.stopSolicitingRouters()
+		n.mu.ndp.cleanupState(false /* hostOnly */)
+
+		// Stop DAD for unicast IPv6 endpoints still in the permanentTentative state.
+		for _, ref := range n.mu.endpoints {
+			addr := ref.ep.ID().LocalAddress
+			if ref.getKind() != permanentTentative || !header.IsV6UnicastAddress(addr) {
+				continue
+			}
+			n.mu.ndp.stopDuplicateAddressDetection(addr)
+		}
+
+		// The NIC may have already left the multicast group.
+		if err := n.leaveGroupLocked(header.IPv6AllNodesMulticastAddress, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	if _, hasIPv4 := n.stack.networkProtocols[header.IPv4ProtocolNumber]; hasIPv4 {
+		// The address may have already been removed.
+		if err := n.removePermanentAddressLocked(ipv4BroadcastAddr.AddressWithPrefix.Address); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	n.mu.enabled = false
+	return nil
+}
+
+// enable enables n; it is a no-op if n is already enabled.
+//
+// If the stack has IPv4 enabled, an endpoint for the broadcast address is
+// added. If the stack has IPv6 enabled, enable joins the All-Nodes multicast
+// group (ff02::1), starts DAD for permanent addresses, auto-generates a
+// link-local address if so configured, and solicits routers unless forwarding.
+func (n *NIC) enable() *tcpip.Error {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	n.mu.RUnlock()
+	if enabled {
+		return nil
+	}
+
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	if n.mu.enabled {
+		return nil
+	}
+
+	n.mu.enabled = true
+
+	// Create an endpoint to receive broadcast packets on this interface.
+	if _, ok := n.stack.networkProtocols[header.IPv4ProtocolNumber]; ok {
+		if _, err := n.addAddressLocked(ipv4BroadcastAddr, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
+			return err
+		}
+	}
+
+	// Join the IPv6 All-Nodes Multicast group if the stack is configured to
+	// use IPv6. This is required to ensure that this node properly receives
+	// and responds to the various NDP messages that are destined to the
+	// all-nodes multicast address. An example is the Neighbor Advertisement
+	// when we perform Duplicate Address Detection, or Router Advertisement
+	// when we do Router Discovery. See RFC 4862, section 5.4.2 and RFC 4861
+	// section 4.2 for more information.
+	//
+	// Also auto-generate an IPv6 link-local address based on the NIC's
+	// link address if it is configured to do so. Note, each interface is
+	// required to have IPv6 link-local unicast address, as per RFC 4291
+	// section 2.1.
+	_, ok := n.stack.networkProtocols[header.IPv6ProtocolNumber]
+	if !ok {
+		return nil
+	}
+
+	// Join the All-Nodes multicast group before starting DAD as responses to DAD
+	// (NDP NS) messages may be sent to the All-Nodes multicast group if the
+	// source address of the NDP NS is the unspecified address, as per RFC 4861
+	// section 7.2.4.
+	if err := n.joinGroupLocked(header.IPv6ProtocolNumber, header.IPv6AllNodesMulticastAddress); err != nil {
+		return err
+	}
+
+	// Perform DAD on all the unicast IPv6 endpoints that are in the permanent
+	// or permanentTentative state.
+	//
+	// Addresses may have already completed DAD but in the time since the NIC was
+	// last enabled, other devices may have acquired the same addresses.
+	for _, r := range n.mu.endpoints {
+		addr := r.ep.ID().LocalAddress
+		if k := r.getKind(); (k != permanent && k != permanentTentative) || !header.IsV6UnicastAddress(addr) {
+			continue
+		}
+
+		r.setKind(permanentTentative)
+		if err := n.mu.ndp.startDuplicateAddressDetection(addr, r); err != nil {
+			return err
+		}
+	}
+
+	// Do not auto-generate an IPv6 link-local address for loopback devices.
+	if n.stack.autoGenIPv6LinkLocal && !n.isLoopback() {
+		// The valid and preferred lifetime is infinite for the auto-generated
+		// link-local address.
+		n.mu.ndp.doSLAAC(header.IPv6LinkLocalPrefix.Subnet(), header.NDPInfiniteLifetime, header.NDPInfiniteLifetime)
+	}
+
+	// If we are operating as a router, then do not solicit routers since we
+	// won't process the RAs anyways.
+	//
+	// Routers do not process Router Advertisements (RA) the same way a host
+	// does. That is, routers do not learn from RAs (e.g. on-link prefixes
+	// and default routers). Therefore, soliciting RAs from other routers on
+	// a link is unnecessary for routers.
+	if !n.stack.forwarding {
+		n.mu.ndp.startSolicitingRouters()
+	}
+
+	return nil
+}
+
+// remove detaches NIC from the link endpoint, and marks existing referenced
+// network endpoints expired. This guarantees no packets between this NIC and
+// the network stack.
+func (n *NIC) remove() *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	n.disableLocked()
+
+	// TODO(b/151378115): come up with a better way to pick an error than the
+	// first one.
+	var err *tcpip.Error
+
+	// Forcefully leave multicast groups.
+	for nid := range n.mu.mcastJoins {
+		if tempErr := n.leaveGroupLocked(nid.LocalAddress, true /* force */); tempErr != nil && err == nil {
+			err = tempErr
+		}
+	}
+
+	// Remove permanent and permanentTentative addresses, so no packet goes out.
+	for nid, ref := range n.mu.endpoints {
+		switch ref.getKind() {
+		case permanentTentative, permanent:
+			if tempErr := n.removePermanentAddressLocked(nid.LocalAddress); tempErr != nil && err == nil {
+				err = tempErr
+			}
+		}
+	}
+
+	// Detach from link endpoint, so no packet comes in.
+	n.linkEP.Attach(nil)
+
+	return err
+}
+
+// becomeIPv6Router transitions n into an IPv6 router: host-only NDP state
+// (discovered routers, discovered on-link prefixes and auto-generated
+// addresses) is cleaned up/invalidated and NDP router solicitations are
+// stopped.
+func (n *NIC) becomeIPv6Router() {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	ndp := &n.mu.ndp
+	ndp.cleanupState(true /* hostOnly */)
+	ndp.stopSolicitingRouters()
+}
+
+// becomeIPv6Host transitions n into an IPv6 host.
+//
+// While holding n's lock, it starts NDP router solicitations; unlike
+// becomeIPv6Router it does not clean up any NDP state.
+func (n *NIC) becomeIPv6Host() {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	n.mu.ndp.startSolicitingRouters()
+}
+
+// setPromiscuousMode turns promiscuous mode on n on or off.
+func (n *NIC) setPromiscuousMode(enable bool) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.mu.promiscuous = enable
+}
+
+// isPromiscuousMode reports whether promiscuous mode is enabled on n.
+func (n *NIC) isPromiscuousMode() bool {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+	return n.mu.promiscuous
+}
+
+// isLoopback reports whether n's link endpoint advertises the loopback
+// capability.
+func (n *NIC) isLoopback() bool { return n.linkEP.Capabilities()&CapabilityLoopback != 0 }
+
+// setSpoofing turns address spoofing on n on or off.
+func (n *NIC) setSpoofing(enable bool) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.mu.spoofing = enable
+}
+
+// primaryEndpoint returns a referenced endpoint of n for the given protocol
+// and remoteAddr: the first valid non-deprecated primary endpoint if one
+// exists, otherwise the first valid deprecated one, otherwise nil.
+//
+// If protocol is IPv6 and remoteAddr is not empty, Source Address Selection
+// (as defined by RFC 6724 section 5) is performed instead.
+//
+// Outside that IPv6 path, the returned endpoint carries a tryIncRef reference.
+func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber, remoteAddr tcpip.Address) *referencedNetworkEndpoint {
+	if protocol == header.IPv6ProtocolNumber && remoteAddr != "" {
+		return n.primaryIPv6Endpoint(remoteAddr)
+	}
+
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	// fallback is the first valid deprecated endpoint seen so far; a reference
+	// is held on it for as long as it remains the candidate to return.
+	var fallback *referencedNetworkEndpoint
+	for _, ref := range n.mu.primary[protocol] {
+		switch {
+		case !ref.isValidForOutgoingRLocked():
+			// Not usable for outgoing packets; skip it.
+
+		case ref.deprecated:
+			// A non-deprecated endpoint is preferred, but remember the first
+			// deprecated one in case n has nothing better.
+			if fallback == nil && ref.tryIncRef() {
+				fallback = ref
+			}
+
+		case ref.tryIncRef():
+			// ref is not deprecated, so it wins immediately. Give back the
+			// reference taken on fallback, if any, since it will not be returned.
+			if fallback != nil {
+				fallback.decRefLocked()
+			}
+			return ref
+
+		default:
+			// tryIncRef failed on a non-deprecated endpoint; try the next one.
+		}
+	}
+
+	// n has no valid non-deprecated endpoint, so return fallback (which is nil if
+	// n has no valid deprecated endpoint either).
+	return fallback
+}
+
+// ipv6AddrCandidate is an IPv6 candidate for Source Address Selection (RFC
+// 6724 section 5).
+type ipv6AddrCandidate struct {
+	ref   *referencedNetworkEndpoint // endpoint that owns the candidate address
+	scope header.IPv6AddressScope    // scope computed for that address
+}
+
+// primaryIPv6Endpoint picks the IPv6 endpoint to use as the source towards
+// remoteAddr by applying Source Address Selection (RFC 6724 section 5).
+//
+// Note, only rules 1-3 and 7 are followed.
+//
+// remoteAddr must be a valid IPv6 address.
+func (n *NIC) primaryIPv6Endpoint(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	return n.primaryIPv6EndpointRLocked(remoteAddr)
+}
+
+// primaryIPv6EndpointRLocked returns an IPv6 endpoint following Source Address
+// Selection (RFC 6724 section 5), with its reference count incremented, or nil
+// if no candidate is usable.
+//
+// Note, only rules 1-3 and 7 are followed; candidates that tie on all of them
+// keep their relative order from n's primary endpoint list.
+//
+// remoteAddr must be a valid IPv6 address. n.mu MUST be read locked.
+func (n *NIC) primaryIPv6EndpointRLocked(remoteAddr tcpip.Address) *referencedNetworkEndpoint {
+	primaryAddrs := n.mu.primary[header.IPv6ProtocolNumber]
+
+	if len(primaryAddrs) == 0 {
+		return nil
+	}
+
+	// Create a candidate set of available addresses we can potentially use as a
+	// source address.
+	cs := make([]ipv6AddrCandidate, 0, len(primaryAddrs))
+	for _, r := range primaryAddrs {
+		// If r is not valid for outgoing connections, it is not a valid endpoint.
+		if !r.isValidForOutgoingRLocked() {
+			continue
+		}
+
+		addr := r.ep.ID().LocalAddress
+		scope, err := header.ScopeForIPv6Address(addr)
+		if err != nil {
+			// Should never happen as we got r from the primary IPv6 endpoint list and
+			// ScopeForIPv6Address only returns an error if addr is not an IPv6
+			// address.
+			panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", addr, err))
+		}
+
+		cs = append(cs, ipv6AddrCandidate{
+			ref:   r,
+			scope: scope,
+		})
+	}
+
+	remoteScope, err := header.ScopeForIPv6Address(remoteAddr)
+	if err != nil {
+		// primaryIPv6Endpoint should never be called with an invalid IPv6 address.
+		panic(fmt.Sprintf("header.ScopeForIPv6Address(%s): %s", remoteAddr, err))
+	}
+
+	// Sort the addresses as per RFC 6724 section 5 rules 1-3 and 7. The sort must
+	// be stable so that ties keep their primary endpoint list order.
+	// TODO(b/146021396): Implement rules 4-8 of RFC 6724 section 5.
+	sort.SliceStable(cs, func(i, j int) bool {
+		sa := cs[i]
+		sb := cs[j]
+
+		// Prefer same address as per RFC 6724 section 5 rule 1.
+		if sa.ref.ep.ID().LocalAddress == remoteAddr {
+			return true
+		}
+		if sb.ref.ep.ID().LocalAddress == remoteAddr {
+			return false
+		}
+
+		// Prefer appropriate scope as per RFC 6724 section 5 rule 2.
+		if sa.scope < sb.scope {
+			return sa.scope >= remoteScope
+		} else if sb.scope < sa.scope {
+			return sb.scope < remoteScope
+		}
+
+		// Avoid deprecated addresses as per RFC 6724 section 5 rule 3.
+		if saDep, sbDep := sa.ref.deprecated, sb.ref.deprecated; saDep != sbDep {
+			// If sa is not deprecated, it is preferred over sb.
+			return sbDep
+		}
+
+		// Prefer temporary addresses as per RFC 6724 section 5 rule 7.
+		if saTemp, sbTemp := sa.ref.configType == slaacTemp, sb.ref.configType == slaacTemp; saTemp != sbTemp {
+			return saTemp
+		}
+
+		// sa and sb are equal; report "not less" so that the stable sort keeps the
+		// endpoint that is closest to the front of the primary endpoint list first.
+		return false
+	})
+
+	// Return the most preferred address that can have its reference count
+	// incremented.
+	for _, c := range cs {
+		if r := c.ref; r.tryIncRef() {
+			return r
+		}
+	}
+
+	return nil
+}
+
+// hasPermanentAddrLocked reports whether addr is assigned to n as a permanent
+// address. Addresses that are still tentative count as permanent here.
+//
+// No lock is taken here; the Locked suffix suggests the caller holds n.mu.
+func (n *NIC) hasPermanentAddrLocked(addr tcpip.Address) bool {
+	if ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]; ok {
+		switch ref.getKind() {
+		case permanent, permanentTentative:
+			return true
+		}
+	}
+	return false
+}
+
+// getRefBehaviour selects which of a NIC's flags getRefOrCreateTemp consults
+// when deciding whether an address without a usable endpoint may be used.
+type getRefBehaviour int
+
+const (
+	// spoofing makes the lookup observe the NIC's spoofing flag.
+	spoofing getRefBehaviour = iota
+
+	// promiscuous makes the lookup observe the NIC's promiscuous flag.
+	promiscuous
+)
+
+func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
+	return n.getRefOrCreateTemp(protocol, dst, CanBePrimaryEndpoint, promiscuous) // lookup honors n's promiscuous flag
+}
+
+// findEndpoint returns the endpoint for address, honoring n's spoofing flag.
+func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint {
+	return n.getRefOrCreateTemp(protocol, address, peb, spoofing)
+}
+
+// getRefOrCreateTemp returns the referenced network endpoint for the given
+// protocol and address, with its reference count incremented.
+//
+// If none exists a temporary one may be created if n is spoofing or in
+// promiscuous mode (tempRef selects which of the two flags is consulted), or
+// if address falls within one of n's address ranges. Otherwise it returns nil.
+func (n *NIC) getRefOrCreateTemp(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior, tempRef getRefBehaviour) *referencedNetworkEndpoint {
+	n.mu.RLock()
+
+	var spoofingOrPromiscuous bool
+	switch tempRef {
+	case spoofing:
+		spoofingOrPromiscuous = n.mu.spoofing
+	case promiscuous:
+		spoofingOrPromiscuous = n.mu.promiscuous
+	}
+
+	if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok {
+		// An endpoint with this id exists, check if it can be used and return it.
+		if !ref.isAssignedRLocked(spoofingOrPromiscuous) {
+			n.mu.RUnlock()
+			return nil
+		}
+
+		if ref.tryIncRef() {
+			n.mu.RUnlock()
+			return ref
+		}
+	}
+
+	// A usable reference was not found, create a temporary one if requested by
+	// the caller or if the address is found in the NIC's subnets.
+	createTempEP := spoofingOrPromiscuous
+	if !createTempEP {
+		for _, sn := range n.mu.addressRanges {
+			// Skip the subnet address.
+			if address == sn.ID() {
+				continue
+			}
+			// For now just skip the broadcast address, until we support it.
+			// FIXME(b/137608825): Add support for sending/receiving directed
+			// (subnet) broadcast.
+			if address == sn.Broadcast() {
+				continue
+			}
+			if sn.Contains(address) {
+				createTempEP = true
+				break
+			}
+		}
+	}
+
+	n.mu.RUnlock()
+
+	if !createTempEP {
+		return nil
+	}
+
+	// Try again with the lock in exclusive mode. If we still can't get the
+	// endpoint, create a new "temporary" endpoint. It will only exist while
+	// there's a route through it.
+	n.mu.Lock()
+	ref := n.getRefOrCreateTempLocked(protocol, address, peb)
+	n.mu.Unlock()
+	return ref
+}
+
+// getRefOrCreateTempLocked returns an existing endpoint for address or creates
+// and returns a temporary one (nil if protocol is unknown or adding fails).
+func (n *NIC) getRefOrCreateTempLocked(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint {
+	if ref, ok := n.mu.endpoints[NetworkEndpointID{address}]; ok {
+		// No need to check the type as we are ok with expired endpoints at this
+		// point.
+		if ref.tryIncRef() {
+			return ref
+		}
+		// tryIncRef failing means the endpoint is scheduled to be removed once the
+		// lock is released. Remove it here so we can create a new (temporary) one.
+		// The removal logic waiting for the lock handles this case.
+		n.removeEndpointLocked(ref)
+	}
+
+	// Add a new temporary endpoint.
+	netProto, ok := n.stack.networkProtocols[protocol]
+	if !ok {
+		return nil
+	}
+	ref, _ := n.addAddressLocked(tcpip.ProtocolAddress{
+		Protocol: protocol,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   address,
+			PrefixLen: netProto.DefaultPrefixLen(),
+		},
+	}, peb, temporary, static, false)
+	return ref
+}
+
+// addAddressLocked adds a new protocolAddress to n.
+//
+// If n already has the address in a non-permanent state, and the kind given is
+// permanent, that address will be promoted in place and its properties set to
+// the properties provided. Otherwise, it returns tcpip.ErrDuplicateAddress.
+func (n *NIC) addAddressLocked(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior, kind networkEndpointKind, configType networkEndpointConfigType, deprecated bool) (*referencedNetworkEndpoint, *tcpip.Error) {
+	// TODO(b/141022673): Validate IP addresses before adding them.
+
+	// Reject, or promote in place, an address that n already has an endpoint for.
+	id := NetworkEndpointID{LocalAddress: protocolAddress.AddressWithPrefix.Address}
+	if ref, ok := n.mu.endpoints[id]; ok {
+		// Endpoint already exists.
+		if kind != permanent {
+			return nil, tcpip.ErrDuplicateAddress
+		}
+		switch ref.getKind() {
+		case permanentTentative, permanent:
+			// The NIC already has a permanent endpoint with that address.
+			return nil, tcpip.ErrDuplicateAddress
+		case permanentExpired, temporary:
+			// Promote the endpoint to become permanent and respect the new peb,
+			// configType and deprecated status.
+			if ref.tryIncRef() {
+				// TODO(b/147748385): Perform Duplicate Address Detection when promoting
+				// an IPv6 endpoint to permanent.
+				ref.setKind(permanent)
+				ref.deprecated = deprecated
+				ref.configType = configType
+
+				refs := n.mu.primary[ref.protocol]
+				for i, r := range refs {
+					if r == ref {
+						switch peb {
+						case CanBePrimaryEndpoint:
+							return ref, nil
+						case FirstPrimaryEndpoint:
+							if i == 0 {
+								return ref, nil
+							}
+							n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+						case NeverPrimaryEndpoint:
+							n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+							return ref, nil
+						}
+					}
+				}
+
+				n.insertPrimaryEndpointLocked(ref, peb)
+
+				return ref, nil
+			}
+			// tryIncRef failing means the endpoint is scheduled to be removed once
+			// the lock is released. Remove it here so we can create a new
+			// (permanent) one. The removal logic waiting for the lock handles this
+			// case.
+			n.removeEndpointLocked(ref)
+		}
+	}
+
+	netProto, ok := n.stack.networkProtocols[protocolAddress.Protocol]
+	if !ok {
+		return nil, tcpip.ErrUnknownProtocol
+	}
+
+	// Create the new network endpoint.
+	ep, err := netProto.NewEndpoint(n.id, protocolAddress.AddressWithPrefix, n.stack, n, n.linkEP, n.stack)
+	if err != nil {
+		return nil, err
+	}
+
+	isIPv6Unicast := protocolAddress.Protocol == header.IPv6ProtocolNumber && header.IsV6UnicastAddress(protocolAddress.AddressWithPrefix.Address)
+
+	// If the address is an IPv6 address and it is a permanent address,
+	// mark it as tentative so it goes through the DAD process if the NIC is
+	// enabled. If the NIC is not enabled, DAD will be started when the NIC is
+	// enabled.
+	if isIPv6Unicast && kind == permanent {
+		kind = permanentTentative
+	}
+
+	ref := &referencedNetworkEndpoint{
+		refs:       1,
+		ep:         ep,
+		nic:        n,
+		protocol:   protocolAddress.Protocol,
+		kind:       kind,
+		configType: configType,
+		deprecated: deprecated,
+	}
+
+	// Set up cache if link address resolution exists for this protocol.
+	if n.linkEP.Capabilities()&CapabilityResolutionRequired != 0 {
+		if _, ok := n.stack.linkAddrResolvers[protocolAddress.Protocol]; ok {
+			ref.linkCache = n.stack
+		}
+	}
+
+	// If we are adding an IPv6 unicast address, join the solicited-node
+	// multicast address.
+	if isIPv6Unicast {
+		snmc := header.SolicitedNodeAddr(protocolAddress.AddressWithPrefix.Address)
+		if err := n.joinGroupLocked(protocolAddress.Protocol, snmc); err != nil {
+			return nil, err
+		}
+	}
+
+	n.mu.endpoints[id] = ref
+
+	n.insertPrimaryEndpointLocked(ref, peb)
+
+	// If we are adding a tentative IPv6 address, start DAD if the NIC is enabled.
+	if isIPv6Unicast && kind == permanentTentative && n.mu.enabled {
+		if err := n.mu.ndp.startDuplicateAddressDetection(protocolAddress.AddressWithPrefix.Address, ref); err != nil {
+			return nil, err
+		}
+	}
+
+	return ref, nil
+}
+
+// AddAddress registers protocolAddress on n as a permanent, statically
+// configured address so that n starts accepting packets targeted at it.
+func (n *NIC) AddAddress(protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	// Only the error matters to callers; the new endpoint reference is dropped.
+	_, err := n.addAddressLocked(protocolAddress, peb, permanent, static, false /* deprecated */)
+	return err
+}
+
+// AllAddresses returns all addresses (primary and non-primary) associated with
+// this NIC.
+func (n *NIC) AllAddresses() []tcpip.ProtocolAddress {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	addrs := make([]tcpip.ProtocolAddress, 0, len(n.mu.endpoints))
+	for nid, ref := range n.mu.endpoints {
+		// Don't include expired or temporary endpoints to avoid confusion and
+		// prevent the caller from using those. Tentative endpoints are listed.
+		switch ref.getKind() {
+		case permanentExpired, temporary:
+			continue
+		}
+
+		addrs = append(addrs, tcpip.ProtocolAddress{
+			Protocol: ref.protocol,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   nid.LocalAddress,
+				PrefixLen: ref.ep.PrefixLen(),
+			},
+		})
+	}
+	return addrs
+}
+
+// PrimaryAddresses returns the primary addresses associated with this NIC.
+func (n *NIC) PrimaryAddresses() []tcpip.ProtocolAddress {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	var addrs []tcpip.ProtocolAddress
+	for proto, list := range n.mu.primary {
+		for _, ref := range list {
+			// Don't include tentative, expired or temporary endpoints
+			// to avoid confusion and prevent the caller from using
+			// those.
+			switch ref.getKind() {
+			case permanentTentative, permanentExpired, temporary:
+				continue
+			}
+
+			addrs = append(addrs, tcpip.ProtocolAddress{
+				Protocol: proto,
+				AddressWithPrefix: tcpip.AddressWithPrefix{
+					Address:   ref.ep.ID().LocalAddress,
+					PrefixLen: ref.ep.PrefixLen(),
+				},
+			})
+		}
+	}
+	return addrs
+}
+
+// primaryAddress returns the primary address of n for proto.
+//
+// The first non-deprecated address is returned if one exists, otherwise the
+// first deprecated one. If n has no usable primary address for proto, the
+// zero tcpip.AddressWithPrefix is returned.
+func (n *NIC) primaryAddress(proto tcpip.NetworkProtocolNumber) tcpip.AddressWithPrefix {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	list, ok := n.mu.primary[proto]
+	if !ok {
+		return tcpip.AddressWithPrefix{}
+	}
+
+	var deprecatedEndpoint *referencedNetworkEndpoint
+	for _, ref := range list {
+		// Don't include tentative, expired or temporary endpoints to avoid
+		// confusion and prevent the caller from using those.
+		switch ref.getKind() {
+		case permanentTentative, permanentExpired, temporary:
+			continue
+		}
+
+		if !ref.deprecated {
+			return tcpip.AddressWithPrefix{
+				Address:   ref.ep.ID().LocalAddress,
+				PrefixLen: ref.ep.PrefixLen(),
+			}
+		}
+
+		if deprecatedEndpoint == nil {
+			deprecatedEndpoint = ref
+		}
+	}
+
+	if deprecatedEndpoint != nil {
+		return tcpip.AddressWithPrefix{
+			Address:   deprecatedEndpoint.ep.ID().LocalAddress,
+			PrefixLen: deprecatedEndpoint.ep.PrefixLen(),
+		}
+	}
+
+	return tcpip.AddressWithPrefix{}
+}
+
+// AddAddressRange records subnet as an address range of n, so that n starts
+// accepting packets targeted at the addresses it contains. Every address in
+// the subnet is used except for the subnet address itself and the subnet's
+// broadcast address. The protocol argument is currently not consulted; the
+// range is stored as given.
+func (n *NIC) AddAddressRange(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.mu.addressRanges = append(n.mu.addressRanges, subnet)
+}
+
+// RemoveAddressRange drops every occurrence of subnet from n's address ranges.
+func (n *NIC) RemoveAddressRange(subnet tcpip.Subnet) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	// Filter in place: kept reuses the backing array of n.mu.addressRanges.
+	kept := n.mu.addressRanges[:0]
+	for _, sn := range n.mu.addressRanges {
+		if sn == subnet {
+			continue
+		}
+		kept = append(kept, sn)
+	}
+	n.mu.addressRanges = kept
+}
+
+// AddressRanges returns n's address ranges, preceded by a single-address
+// (all-ones mask) subnet for every endpoint address registered on n.
+func (n *NIC) AddressRanges() []tcpip.Subnet {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+	sns := make([]tcpip.Subnet, 0, len(n.mu.addressRanges)+len(n.mu.endpoints))
+	for nid := range n.mu.endpoints {
+		sn, err := tcpip.NewSubnet(nid.LocalAddress, tcpip.AddressMask(strings.Repeat("\xff", len(nid.LocalAddress))))
+		if err != nil {
+			// Not expected: the mask is built to match the address length.
+			panic("Invalid endpoint subnet: " + err.Error())
+		}
+		sns = append(sns, sn)
+	}
+	return append(sns, n.mu.addressRanges...)
+}
+
+// insertPrimaryEndpointLocked adds r to n's primary endpoint list as required
+// by peb: appended for CanBePrimaryEndpoint, prepended for
+// FirstPrimaryEndpoint, and not added at all for any other value.
+// n MUST be locked.
+func (n *NIC) insertPrimaryEndpointLocked(r *referencedNetworkEndpoint, peb PrimaryEndpointBehavior) {
+	switch peb {
+	case CanBePrimaryEndpoint:
+		n.mu.primary[r.protocol] = append(n.mu.primary[r.protocol], r)
+	case FirstPrimaryEndpoint:
+		n.mu.primary[r.protocol] = append([]*referencedNetworkEndpoint{r}, n.mu.primary[r.protocol]...)
+	}
+}
+
+// removeEndpointLocked unregisters r from n and closes its endpoint.
+func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
+	id := *r.ep.ID()
+
+	// Nothing to do if the reference has already been replaced with a different
+	// one: r's ref count hit zero and it was waiting (on the lock) to be
+	// removed, but the same address was re-added in the meantime by removing r
+	// from the list and adding a new one.
+	if n.mu.endpoints[id] != r {
+		return
+	}
+
+	if r.getKind() == permanent {
+		panic("Reference count dropped to zero before being removed")
+	}
+
+	delete(n.mu.endpoints, id)
+	refs := n.mu.primary[r.protocol]
+	for i, ref := range refs {
+		if ref == r {
+			n.mu.primary[r.protocol] = append(refs[:i], refs[i+1:]...)
+			refs[len(refs)-1] = nil // clear the vacated tail slot of the old slice
+			break
+		}
+	}
+
+	r.ep.Close()
+}
+
+func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.removeEndpointLocked(r)
+}
+
+// removePermanentAddressLocked expires the permanent (or tentative) endpoint
+// for addr, or returns tcpip.ErrBadLocalAddress if n has no such endpoint.
+func (n *NIC) removePermanentAddressLocked(addr tcpip.Address) *tcpip.Error {
+	ref, found := n.mu.endpoints[NetworkEndpointID{addr}]
+	if !found {
+		return tcpip.ErrBadLocalAddress
+	}
+	switch ref.getKind() {
+	case permanent, permanentTentative:
+	default:
+		return tcpip.ErrBadLocalAddress
+	}
+
+	if ref.protocol == header.IPv6ProtocolNumber {
+		return n.removePermanentIPv6EndpointLocked(ref, true /* allowSLAACInvalidation */)
+	}
+	ref.expireLocked()
+	return nil
+}
+
+// removePermanentIPv6EndpointLocked expires r and, for unicast addresses,
+// stops DAD, cleans up SLAAC state and leaves the solicited-node group.
+func (n *NIC) removePermanentIPv6EndpointLocked(r *referencedNetworkEndpoint, allowSLAACInvalidation bool) *tcpip.Error {
+	addr := r.addrWithPrefix()
+
+	isIPv6Unicast := header.IsV6UnicastAddress(addr.Address)
+
+	if isIPv6Unicast {
+		n.mu.ndp.stopDuplicateAddressDetection(addr.Address)
+
+		// For SLAAC-generated addresses, clean up SLAAC state and notify.
+		switch r.configType {
+		case slaac:
+			n.mu.ndp.cleanupSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
+		case slaacTemp:
+			n.mu.ndp.cleanupTempSLAACAddrResourcesAndNotify(addr, allowSLAACInvalidation)
+		}
+	}
+
+	r.expireLocked()
+
+	// At this point the endpoint is deleted.
+
+	// If we are removing an IPv6 unicast address, leave the solicited-node
+	// multicast address. tcpip.ErrBadLocalAddress is ignored because the
+	// solicited-node multicast group may already have been left by user
+	// action.
+	if isIPv6Unicast {
+		snmc := header.SolicitedNodeAddr(addr.Address)
+		if err := n.leaveGroupLocked(snmc, false /* force */); err != nil && err != tcpip.ErrBadLocalAddress {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// RemoveAddress removes the permanent (or tentative) address addr from n.
+func (n *NIC) RemoveAddress(addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	return n.removePermanentAddressLocked(addr)
+}
+
+// joinGroup is the locking wrapper around joinGroupLocked: it joins the
+// multicast group addr on n while holding n.mu exclusively.
+func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	return n.joinGroupLocked(protocol, addr)
+}
+
+// joinGroupLocked adds a permanent, never-primary endpoint for the multicast
+// address addr on the first join and bumps the join count on every successful
+// call. n MUST be locked before joinGroupLocked is called.
+func (n *NIC) joinGroupLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	// TODO(b/143102137): When implementing MLD, make sure MLD packets are
+	// not sent unless a valid link-local address is available for use on n
+	// as an MLD packet's source address must be a link-local address as
+	// outlined in RFC 3810 section 5.
+
+	id := NetworkEndpointID{addr}
+	joins := n.mu.mcastJoins[id]
+	if joins == 0 {
+		netProto, ok := n.stack.networkProtocols[protocol]
+		if !ok {
+			return tcpip.ErrUnknownProtocol
+		}
+		if _, err := n.addAddressLocked(tcpip.ProtocolAddress{
+			Protocol: protocol,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   addr,
+				PrefixLen: netProto.DefaultPrefixLen(),
+			},
+		}, NeverPrimaryEndpoint, permanent, static, false /* deprecated */); err != nil {
+			return err
+		}
+	}
+	n.mu.mcastJoins[id] = joins + 1
+	return nil
+}
+
+// leaveGroup is the locking wrapper around leaveGroupLocked: it leaves the
+// multicast group addr (without forcing) while holding n.mu exclusively.
+func (n *NIC) leaveGroup(addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	return n.leaveGroupLocked(addr, false /* force */)
+}
+
+// leaveGroupLocked decrements the count for the given multicast address, and
+// when it reaches zero removes the endpoint for this address. n MUST be locked
+// before leaveGroupLocked is called.
+//
+// If force is true, then the count for the multicast address is ignored and the
+// endpoint will be removed immediately.
+func (n *NIC) leaveGroupLocked(addr tcpip.Address, force bool) *tcpip.Error {
+	id := NetworkEndpointID{addr}
+	joins, ok := n.mu.mcastJoins[id]
+	if !ok {
+		// There are no joins with this address on this NIC.
+		return tcpip.ErrBadLocalAddress
+	}
+
+	joins--
+	if force || joins == 0 {
+		// There are no outstanding joins or we are forced to leave, clean up.
+		delete(n.mu.mcastJoins, id)
+		return n.removePermanentAddressLocked(addr)
+	}
+
+	n.mu.mcastJoins[id] = joins
+	return nil
+}
+
+// isInGroup reports whether n currently has at least one outstanding join
+// for the multicast group addr.
+func (n *NIC) isInGroup(addr tcpip.Address) bool {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	return n.mu.mcastJoins[NetworkEndpointID{addr}] != 0
+}
+
+func handlePacket(protocol tcpip.NetworkProtocolNumber, dst, src tcpip.Address, localLinkAddr, remotelinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, pkt *PacketBuffer) {
+	r := makeRoute(protocol, dst, src, localLinkAddr, ref, false /* handleLocal */, false /* multicastLoop */)
+	r.RemoteLinkAddress = remotelinkAddr
+
+	ref.ep.HandlePacket(&r, pkt) // hand pkt to the network endpoint behind ref
+	ref.decRef()                 // release the reference passed in by the caller
+}
+
+// DeliverNetworkPacket finds the appropriate network protocol endpoint and
+// hands the packet over for further processing. This function is called when
+// the NIC receives a packet from the link endpoint.
+// Note that the ownership of the slice backing vv is retained by the caller.
+// This rule applies only to the slice itself, not to the items of the slice;
+// the ownership of the items is not retained by the caller.
+func (n *NIC) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	n.mu.RLock()
+	enabled := n.mu.enabled
+	// If the NIC is not yet enabled, don't receive any packets.
+	if !enabled {
+		n.mu.RUnlock()
+
+		n.stats.DisabledRx.Packets.Increment()
+		n.stats.DisabledRx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
+		return
+	}
+
+	n.stats.Rx.Packets.Increment()
+	n.stats.Rx.Bytes.IncrementBy(uint64(pkt.Data.Size()))
+
+	netProto, ok := n.stack.networkProtocols[protocol]
+	if !ok {
+		n.mu.RUnlock()
+		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
+		return
+	}
+
+	// If no local link layer address is provided, assume it was sent
+	// directly to this NIC.
+	if local == "" {
+		local = n.linkEP.LinkAddress()
+	}
+
+	// Are any packet sockets listening for this network protocol?
+	packetEPs := n.mu.packetEPs[protocol]
+	// Also include packet sockets listening for every protocol (already covered
+	// above when protocol is EthernetProtocolAll). Cap the slice so that append
+	// copies instead of writing into the array shared via n.mu.packetEPs.
+	if protocol != header.EthernetProtocolAll {
+		packetEPs = append(packetEPs[:len(packetEPs):len(packetEPs)], n.mu.packetEPs[header.EthernetProtocolAll]...)
+	}
+	n.mu.RUnlock()
+	for _, ep := range packetEPs {
+		ep.HandlePacket(n.id, local, protocol, pkt.Clone())
+	}
+
+	if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber {
+		n.stack.stats.IP.PacketsReceived.Increment()
+	}
+
+	// Parse headers.
+	transProtoNum, hasTransportHdr, ok := netProto.Parse(pkt)
+	if !ok {
+		// The packet is too small to contain a network header.
+		n.stack.stats.MalformedRcvdPackets.Increment()
+		return
+	}
+	if hasTransportHdr {
+		// Parse the transport header if present.
+		if state, ok := n.stack.transportProtocols[transProtoNum]; ok {
+			state.proto.Parse(pkt)
+		}
+	}
+
+	src, dst := netProto.ParseAddresses(pkt.NetworkHeader)
+
+	if n.stack.handleLocal && !n.isLoopback() {
+		if ref := n.getRef(protocol, src); ref != nil {
+			// src is one of our own addresses, which a non-loopback NIC should
+			// never receive from. Drop pkt, releasing the reference getRef took.
+			ref.decRef()
+			n.stack.stats.IP.InvalidSourceAddressesReceived.Increment()
+			return
+		}
+	}
+
+	// TODO(gvisor.dev/issue/170): Not supporting iptables for IPv6 yet.
+	// Loopback traffic skips the prerouting chain.
+	if protocol == header.IPv4ProtocolNumber && !n.isLoopback() {
+		// iptables filtering.
+		ipt := n.stack.IPTables()
+		address := n.primaryAddress(protocol)
+		if ok := ipt.Check(Prerouting, pkt, nil, nil, address.Address, ""); !ok {
+			// iptables is telling us to drop the packet.
+			return
+		}
+	}
+
+	if ref := n.getRef(protocol, dst); ref != nil {
+		handlePacket(protocol, dst, src, n.linkEP.LinkAddress(), remote, ref, pkt)
+		return
+	}
+
+	// This NIC doesn't care about the packet. Find a NIC that cares about the
+	// packet and forward it to the NIC.
+	//
+	// TODO: Should we be forwarding the packet even if promiscuous?
+	if n.stack.Forwarding() {
+		r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */)
+		if err != nil {
+			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
+			return
+		}
+
+		// Found a NIC.
+		n := r.ref.nic
+		n.mu.RLock()
+		ref, ok := n.mu.endpoints[NetworkEndpointID{dst}]
+		ok = ok && ref.isValidForOutgoingRLocked() && ref.tryIncRef()
+		n.mu.RUnlock()
+		if ok {
+			r.LocalLinkAddress = n.linkEP.LinkAddress()
+			r.RemoteLinkAddress = remote
+			r.RemoteAddress = src
+			// TODO(b/123449044): Update the source NIC as well.
+			ref.ep.HandlePacket(&r, pkt)
+			ref.decRef()
+			r.Release()
+			return
+		}
+
+		// n doesn't have a destination endpoint, so send the packet out of n.
+		// TODO(b/128629022): move this logic to route.WritePacket.
+		if ch, err := r.Resolve(nil); err != nil {
+			if err == tcpip.ErrWouldBlock {
+				n.stack.forwarder.enqueue(ch, n, &r, protocol, pkt)
+				// forwarder will release route.
+				return
+			}
+			n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
+			r.Release()
+			return
+		}
+
+		// The link-address resolution finished immediately.
+		n.forwardPacket(&r, protocol, pkt)
+		r.Release()
+		return
+	}
+
+	// If a packet socket handled the packet, don't treat it as invalid.
+	if len(packetEPs) == 0 {
+		n.stack.stats.IP.InvalidDestinationAddressesReceived.Increment()
+	}
+}
+
+// forwardPacket writes pkt out through n's link endpoint using route r.
+func (n *NIC) forwardPacket(r *Route, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) {
+	// TODO(b/143425874) Decrease the TTL field in forwarded packets.
+	// TODO(b/151227689): Avoid copying the packet when forwarding. We can do this
+	// by having lower layers explicitly write each header instead of just
+	// pkt.Header.
+
+	// pkt's parsed NetworkHeader and TransportHeader are copied into pkt.Header.
+	pkt.Header = buffer.NewPrependable(int(n.linkEP.MaxHeaderLength()) + len(pkt.NetworkHeader) + len(pkt.TransportHeader))
+	if n := copy(pkt.Header.Prepend(len(pkt.TransportHeader)), pkt.TransportHeader); n != len(pkt.TransportHeader) {
+		panic(fmt.Sprintf("copied %d bytes, expected %d", n, len(pkt.TransportHeader)))
+	}
+	if n := copy(pkt.Header.Prepend(len(pkt.NetworkHeader)), pkt.NetworkHeader); n != len(pkt.NetworkHeader) {
+		panic(fmt.Sprintf("copied %d bytes, expected %d", n, len(pkt.NetworkHeader)))
+	}
+
+	// WritePacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Header.UsedLength() + pkt.Data.Size()
+
+	if err := n.linkEP.WritePacket(r, nil /* gso */, protocol, pkt); err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return
+	}
+
+	n.stats.Tx.Packets.Increment()
+	n.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
+}
+
+// DeliverTransportPacket delivers pkt to raw sockets and then to the matching
+// transport endpoint, falling back to the default and unknown-destination handlers.
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) {
+	state, ok := n.stack.transportProtocols[protocol]
+	if !ok {
+		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
+		return
+	}
+
+	transProto := state.proto
+
+	// Raw socket packets are delivered based solely on the transport
+	// protocol number. We do not inspect the payload to ensure it's
+	// validly formed.
+	n.stack.demux.deliverRawPacket(r, protocol, pkt)
+
+	// TransportHeader is nil only when pkt is an ICMP packet or was reassembled
+	// from fragments.
+	if pkt.TransportHeader == nil {
+		// TODO(gvisor.dev/issue/170): ICMP packets don't have their TransportHeader
+		// fields set yet, parse it here. See icmp/protocol.go:protocol.Parse for a
+		// full explanation.
+		if protocol == header.ICMPv4ProtocolNumber || protocol == header.ICMPv6ProtocolNumber {
+			// ICMP packets may be longer, but until icmp.Parse is implemented, here
+			// we parse it using the minimum size.
+			transHeader, ok := pkt.Data.PullUp(transProto.MinimumPacketSize())
+			if !ok {
+				n.stack.stats.MalformedRcvdPackets.Increment()
+				return
+			}
+			pkt.TransportHeader = transHeader
+			pkt.Data.TrimFront(len(pkt.TransportHeader))
+		} else {
+			// This is either a bad packet or was re-assembled from fragments.
+			transProto.Parse(pkt)
+		}
+	}
+
+	if len(pkt.TransportHeader) < transProto.MinimumPacketSize() {
+		n.stack.stats.MalformedRcvdPackets.Increment()
+		return
+	}
+
+	srcPort, dstPort, err := transProto.ParsePorts(pkt.TransportHeader)
+	if err != nil {
+		n.stack.stats.MalformedRcvdPackets.Increment()
+		return
+	}
+
+	id := TransportEndpointID{dstPort, r.LocalAddress, srcPort, r.RemoteAddress}
+	if n.stack.demux.deliverPacket(r, protocol, pkt, id) {
+		return
+	}
+
+	// Try to deliver to per-stack default handler.
+	if state.defaultHandler != nil {
+		if state.defaultHandler(r, id, pkt) {
+			return
+		}
+	}
+
+	// We could not find an appropriate destination for this packet, so
+	// deliver it to the global handler.
+	if !transProto.HandleUnknownDestinationPacket(r, id, pkt) {
+		n.stack.stats.MalformedRcvdPackets.Increment()
+	}
+}
+
+// DeliverTransportControlPacket routes a control message of type typ to the
+// transport endpoint identified by the ports found at the front of pkt.Data.
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer) {
+	protoState, found := n.stack.transportProtocols[trans]
+	if !found {
+		return
+	}
+
+	// ICMPv4 only guarantees 8 bytes of the transport protocol in the payload;
+	// the ports are within those 8 bytes for all known transport protocols.
+	hdr, pulled := pkt.Data.PullUp(8)
+	if !pulled {
+		return
+	}
+
+	srcPort, dstPort, err := protoState.proto.ParsePorts(hdr)
+	if err != nil {
+		return
+	}
+
+	id := TransportEndpointID{
+		LocalPort:     srcPort,
+		LocalAddress:  local,
+		RemotePort:    dstPort,
+		RemoteAddress: remote,
+	}
+	n.stack.demux.deliverControlPacket(n, net, trans, typ, extra, pkt, id)
+}
+
+// ID returns n's identifier, as stored in its id field.
+func (n *NIC) ID() tcpip.NICID {
+	return n.id
+}
+
+// Name returns n's name, as stored in its name field.
+func (n *NIC) Name() string {
+	return n.name
+}
+
+// Stack returns the Stack instance that owns this NIC (n's stack field).
+func (n *NIC) Stack() *Stack {
+	return n.stack
+}
+
+// LinkEndpoint returns the link endpoint of n (its linkEP field).
+func (n *NIC) LinkEndpoint() LinkEndpoint {
+	return n.linkEP
+}
+
+// isAddrTentative reports whether addr is assigned to n and is still tentative.
+//
+// An address that is not associated with n is never tentative, so false is
+// returned both when addr is unknown to n and when it is known but in any
+// other state.
+func (n *NIC) isAddrTentative(addr tcpip.Address) bool {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	// Endpoints are looked up by address alone.
+	if ref, found := n.mu.endpoints[NetworkEndpointID{addr}]; found {
+		return ref.getKind() == permanentTentative
+	}
+
+	return false
+}
+
+// dupTentativeAddrDetected tells n that addr, a tentative address, was found
+// to be a duplicate on the link. The address is removed and, when it was
+// configured via SLAAC (stable or temporary), a replacement is generated.
+//
+// It returns tcpip.ErrBadAddress if addr is not on n, and
+// tcpip.ErrInvalidEndpointState if addr is on n but is not tentative.
+func (n *NIC) dupTentativeAddrDetected(addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	ref, ok := n.mu.endpoints[NetworkEndpointID{addr}]
+	if !ok {
+		return tcpip.ErrBadAddress
+	}
+
+	if ref.getKind() != permanentTentative {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	// If the address is a SLAAC address, do not invalidate its SLAAC prefix as a
+	// new address will be generated for it.
+	if err := n.removePermanentIPv6EndpointLocked(ref, false /* allowSLAACInvalidation */); err != nil {
+		return err
+	}
+	// Regeneration is keyed by the subnet of the address that was just removed.
+	prefix := ref.addrWithPrefix().Subnet()
+
+	switch ref.configType {
+	case slaac:
+		n.mu.ndp.regenerateSLAACAddr(prefix)
+	case slaacTemp:
+		// Do not reset the generation attempts counter for the prefix as the
+		// temporary address is being regenerated in response to a DAD conflict.
+		n.mu.ndp.regenerateTempSLAACAddr(prefix, false /* resetGenAttempts */)
+	}
+
+	return nil
+}
+
+// setNDPConfigs stores c as the NDP configuration of n, under n.mu.
+//
+// Note, c is passed through validate first, so if it contains invalid NDP
+// configuration values they are replaced with defaults before being stored.
+func (n *NIC) setNDPConfigs(c NDPConfigurations) {
+	c.validate()
+
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.mu.ndp.configs = c
+}
+
+// handleNDPRA processes ra, an NDP Router Advertisement that arrived on n.
+func (n *NIC) handleNDPRA(ip tcpip.Address, ra header.NDPRouterAdvert) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	// The NDP state lives under n.mu, so it is updated with the write lock held.
+	n.mu.ndp.handleRA(ip, ra)
+}
+
+// networkEndpointKind describes how an endpoint's address is bound to its NIC.
+type networkEndpointKind int32
+
+const (
+	// A permanentTentative endpoint is a permanent address that is not yet
+	// considered to be fully bound to an interface in the traditional sense.
+	// The address is associated with a NIC, but packets destined to it MUST
+	// NOT be accepted and MUST be silently dropped, and it MUST NOT be used
+	// as a source address for outgoing packets. For IPv6, addresses are of
+	// this kind until NDP's Duplicate Address Detection has resolved, and are
+	// deleted if that process detects a duplicate address.
+	permanentTentative networkEndpointKind = iota
+
+	// A permanent endpoint is created by adding a permanent address (vs. a
+	// temporary one) to the NIC. Its reference count is biased by 1 to avoid
+	// removal when no route holds a reference to it. It is removed by explicitly
+	// removing the permanent address from the NIC.
+	permanent
+
+	// An expired permanent endpoint is a permanent endpoint that had its address
+	// removed from the NIC, and it is waiting to be removed once no more routes
+	// hold a reference to it. This is achieved by decreasing its reference count
+	// by 1. If its address is re-added before the endpoint is removed, its type
+	// changes back to permanent and its reference count increases by 1 again.
+	permanentExpired
+
+	// A temporary endpoint is created for spoofing outgoing packets, or when in
+	// promiscuous mode and accepting incoming packets that don't match any
+	// permanent endpoint. Its reference count is not biased by 1 and the
+	// endpoint is removed immediately when no more routes hold a reference to
+	// it. A temporary endpoint can be promoted to permanent if its address
+	// is added permanently.
+	temporary
+)
+
+// registerPacketEndpoint appends ep to the packet endpoints of netProto. It
+// returns tcpip.ErrNotSupported if n has no packet endpoint list for netProto.
+func (n *NIC) registerPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	if existing, known := n.mu.packetEPs[netProto]; known {
+		n.mu.packetEPs[netProto] = append(existing, ep)
+		return nil
+	}
+	return tcpip.ErrNotSupported
+}
+
+// unregisterPacketEndpoint removes the first registration of ep for netProto.
+// The list is rebuilt instead of compacted in place, so no stale reference is
+// left in the live backing array and slices taken earlier are not rewritten.
+func (n *NIC) unregisterPacketEndpoint(netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	eps := n.mu.packetEPs[netProto] // nil, hence no iterations, if unknown.
+	for i, epOther := range eps {
+		if epOther == ep {
+			kept := append(make([]PacketEndpoint, 0, len(eps)-1), eps[:i]...)
+			n.mu.packetEPs[netProto] = append(kept, eps[i+1:]...)
+			return
+		}
+	}
+}
+
+// networkEndpointConfigType records how an endpoint's address was configured.
+type networkEndpointConfigType int32
+
+const (
+	// A statically configured endpoint is an address added by a user-specified
+	// action (adding an explicit address, joining a multicast group).
+	static networkEndpointConfigType = iota
+
+	// A SLAAC configured endpoint is an IPv6 endpoint that was added by
+	// SLAAC as per RFC 4862 section 5.5.3.
+	slaac
+
+	// A temporary SLAAC configured endpoint is an IPv6 endpoint that was added by
+	// SLAAC as per RFC 4941. Temporary SLAAC addresses are short-lived and are
+	// not expected to be valid (or preferred) forever; hence the term temporary.
+	slaacTemp
+)
+
+// referencedNetworkEndpoint is a reference-counted NetworkEndpoint on a NIC.
+type referencedNetworkEndpoint struct {
+	ep       NetworkEndpoint
+	nic      *NIC
+	protocol tcpip.NetworkProtocolNumber
+
+	// linkCache is nil unless link address resolution is enabled for protocol.
+	linkCache LinkAddressCache
+
+	// refs is counting references held for this endpoint. When refs hits zero it
+	// triggers the automatic removal of the endpoint from the NIC.
+	refs int32
+
+	// kind must only be accessed using {get,set}Kind().
+	kind networkEndpointKind
+
+	// configType is the method that was used to configure this endpoint.
+	// This must never change except during endpoint creation and promotion to
+	// permanent.
+	configType networkEndpointConfigType
+
+	// deprecated indicates whether or not the endpoint should be considered
+	// deprecated. That is, when deprecated is true, other endpoints that are not
+	// deprecated should be preferred.
+	deprecated bool
+}
+
+// addrWithPrefix returns the local address of r's endpoint together with the
+// endpoint's prefix length.
+func (r *referencedNetworkEndpoint) addrWithPrefix() tcpip.AddressWithPrefix {
+	ep := r.ep
+	return tcpip.AddressWithPrefix{Address: ep.ID().LocalAddress, PrefixLen: ep.PrefixLen()}
+}
+
+func (r *referencedNetworkEndpoint) getKind() networkEndpointKind {
+	return networkEndpointKind(atomic.LoadInt32((*int32)(&r.kind))) // Atomic load of r.kind.
+}
+
+func (r *referencedNetworkEndpoint) setKind(kind networkEndpointKind) {
+	atomic.StoreInt32((*int32)(&r.kind), int32(kind)) // Atomic store to r.kind.
+}
+
+// isValidForOutgoing reports whether r can be used to send out a packet: the
+// NIC must be enabled and r must not be tentative, and an expired r (i.e., its
+// address has been removed) only qualifies while the NIC is in spoofing mode.
+func (r *referencedNetworkEndpoint) isValidForOutgoing() bool {
+	r.nic.mu.RLock()
+	defer r.nic.mu.RUnlock()
+
+	return r.isValidForOutgoingRLocked()
+}
+
+// isValidForOutgoingRLocked is isValidForOutgoing for callers that already
+// hold r.nic.mu for reading.
+func (r *referencedNetworkEndpoint) isValidForOutgoingRLocked() bool {
+	nic := r.nic
+
+	// A disabled NIC never qualifies; otherwise the spoofing flag decides
+	// whether an expired endpoint is still acceptable.
+	return nic.mu.enabled && r.isAssignedRLocked(nic.mu.spoofing)
+}
+
+// isAssignedRLocked reports whether r counts as assigned to its NIC: never
+// while tentative, only when spoofingOrPromiscuous once expired, else always.
+// r.nic.mu must be read locked.
+func (r *referencedNetworkEndpoint) isAssignedRLocked(spoofingOrPromiscuous bool) bool {
+	kind := r.getKind()
+	if kind == permanentTentative {
+		return false
+	}
+	if kind == permanentExpired {
+		return spoofingOrPromiscuous
+	}
+	return true
+}
+
+// expireLocked marks r as permanentExpired and then drops one reference via
+// decRefLocked, so like decRefLocked it assumes the NIC.mu mutex is locked.
+func (r *referencedNetworkEndpoint) expireLocked() {
+	r.setKind(permanentExpired)
+	r.decRefLocked()
+}
+
+// decRef releases one reference to r. The call that brings the count down to
+// zero also has r removed from its NIC.
+func (r *referencedNetworkEndpoint) decRef() {
+	if remaining := atomic.AddInt32(&r.refs, -1); remaining == 0 {
+		r.nic.removeEndpoint(r)
+	}
+}
+
+// decRefLocked behaves like decRef, except that the endpoint is removed through
+// removeEndpointLocked; it assumes that the NIC.mu mutex is locked.
+func (r *referencedNetworkEndpoint) decRefLocked() {
+	if remaining := atomic.AddInt32(&r.refs, -1); remaining == 0 {
+		r.nic.removeEndpointLocked(r)
+	}
+}
+
+// incRef increments the ref count unconditionally. It must only be called when
+// the caller is known to be holding a reference to the endpoint; otherwise
+// tryIncRef should be used.
+func (r *referencedNetworkEndpoint) incRef() {
+	atomic.AddInt32(&r.refs, 1)
+}
+
+// tryIncRef increments the ref count from n to n+1 unless n is zero. It
+// returns true if the endpoint was still alive and a reference was taken, and
+// false if the endpoint has already been cleaned up.
+func (r *referencedNetworkEndpoint) tryIncRef() bool {
+	// Reload and retry whenever the count changes between the load and the
+	// compare-and-swap.
+	for cur := atomic.LoadInt32(&r.refs); cur != 0; cur = atomic.LoadInt32(&r.refs) {
+		if atomic.CompareAndSwapInt32(&r.refs, cur, cur+1) {
+			return true
+		}
+	}
+
+	// A count of zero was observed, so no reference is taken.
+	return false
+}
+
+// stack returns the Stack instance reached through r's NIC (r.nic.stack).
+func (r *referencedNetworkEndpoint) stack() *Stack {
+	return r.nic.stack
+}
diff --git a/pkg/tcpip/stack/nic_test.go b/pkg/tcpip/stack/nic_test.go
new file mode 100644
index 000000000..31f865260
--- /dev/null
+++ b/pkg/tcpip/stack/nic_test.go
@@ -0,0 +1,318 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"math"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+var _ LinkEndpoint = (*testLinkEndpoint)(nil)
+
+// testLinkEndpoint is a LinkEndpoint that throws away outgoing packets.
+//
+// We use this instead of the channel endpoint as the channel package depends on
+// the stack package which this test lives in, causing a cyclic dependency.
+type testLinkEndpoint struct {
+	dispatcher NetworkDispatcher
+}
+
+// Attach implements LinkEndpoint.Attach by remembering dispatcher.
+func (e *testLinkEndpoint) Attach(dispatcher NetworkDispatcher) {
+	e.dispatcher = dispatcher
+}
+
+// IsAttached implements LinkEndpoint.IsAttached; true once a dispatcher is set.
+func (e *testLinkEndpoint) IsAttached() bool {
+	return e.dispatcher != nil
+}
+
+// MTU implements LinkEndpoint.MTU; it always reports math.MaxUint16.
+func (*testLinkEndpoint) MTU() uint32 {
+	return math.MaxUint16
+}
+
+// Capabilities implements LinkEndpoint.Capabilities.
+func (*testLinkEndpoint) Capabilities() LinkEndpointCapabilities {
+	return CapabilityResolutionRequired
+}
+
+// MaxHeaderLength implements LinkEndpoint.MaxHeaderLength; it is always 0.
+func (*testLinkEndpoint) MaxHeaderLength() uint16 {
+	return 0
+}
+
+// LinkAddress implements LinkEndpoint.LinkAddress; the address is empty.
+func (*testLinkEndpoint) LinkAddress() tcpip.LinkAddress {
+	return ""
+}
+
+// Wait implements LinkEndpoint.Wait as a no-op.
+func (*testLinkEndpoint) Wait() {}
+
+// WritePacket implements LinkEndpoint.WritePacket; the packet is dropped.
+func (e *testLinkEndpoint) WritePacket(*Route, *GSO, tcpip.NetworkProtocolNumber, *PacketBuffer) *tcpip.Error {
+	return nil
+}
+
+// WritePackets implements LinkEndpoint.WritePackets.
+func (e *testLinkEndpoint) WritePackets(*Route, *GSO, PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+	// Our tests don't use this so we don't support it.
+	return 0, tcpip.ErrNotSupported
+}
+
+// WriteRawPacket implements LinkEndpoint.WriteRawPacket.
+func (e *testLinkEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+	// Our tests don't use this so we don't support it.
+	return tcpip.ErrNotSupported
+}
+
+var _ NetworkEndpoint = (*testIPv6Endpoint)(nil)
+
+// testIPv6Endpoint is an IPv6 NetworkEndpoint that throws away outgoing packets.
+//
+// We use this instead of ipv6.endpoint because the ipv6 package depends on
+// the stack package which this test lives in, causing a cyclic dependency.
+type testIPv6Endpoint struct {
+	nicID     tcpip.NICID
+	id        NetworkEndpointID
+	prefixLen int
+	linkEP    LinkEndpoint
+	protocol  *testIPv6Protocol
+}
+
+// DefaultTTL implements NetworkEndpoint.DefaultTTL; it is always 0.
+func (*testIPv6Endpoint) DefaultTTL() uint8 {
+	return 0
+}
+
+// MTU implements NetworkEndpoint.MTU: the link MTU minus the IPv6 header size.
+func (e *testIPv6Endpoint) MTU() uint32 {
+	return e.linkEP.MTU() - header.IPv6MinimumSize
+}
+
+// Capabilities implements NetworkEndpoint.Capabilities via the link endpoint.
+func (e *testIPv6Endpoint) Capabilities() LinkEndpointCapabilities {
+	return e.linkEP.Capabilities()
+}
+
+// MaxHeaderLength implements NetworkEndpoint.MaxHeaderLength.
+func (e *testIPv6Endpoint) MaxHeaderLength() uint16 {
+	return e.linkEP.MaxHeaderLength() + header.IPv6MinimumSize
+}
+
+// WritePacket implements NetworkEndpoint.WritePacket; the packet is dropped.
+func (*testIPv6Endpoint) WritePacket(*Route, *GSO, NetworkHeaderParams, *PacketBuffer) *tcpip.Error {
+	return nil
+}
+
+// WritePackets implements NetworkEndpoint.WritePackets.
+func (*testIPv6Endpoint) WritePackets(*Route, *GSO, PacketBufferList, NetworkHeaderParams) (int, *tcpip.Error) {
+	// Our tests don't use this so we don't support it.
+	return 0, tcpip.ErrNotSupported
+}
+
+// WriteHeaderIncludedPacket implements
+// NetworkEndpoint.WriteHeaderIncludedPacket.
+func (*testIPv6Endpoint) WriteHeaderIncludedPacket(*Route, *PacketBuffer) *tcpip.Error {
+	// Our tests don't use this so we don't support it.
+	return tcpip.ErrNotSupported
+}
+
+// ID implements NetworkEndpoint.ID; the result points into e itself.
+func (e *testIPv6Endpoint) ID() *NetworkEndpointID {
+	return &e.id
+}
+
+// PrefixLen implements NetworkEndpoint.PrefixLen.
+func (e *testIPv6Endpoint) PrefixLen() int {
+	return e.prefixLen
+}
+
+// NICID implements NetworkEndpoint.NICID.
+func (e *testIPv6Endpoint) NICID() tcpip.NICID {
+	return e.nicID
+}
+
+// HandlePacket implements NetworkEndpoint.HandlePacket as a no-op.
+func (*testIPv6Endpoint) HandlePacket(*Route, *PacketBuffer) {
+}
+
+// Close implements NetworkEndpoint.Close as a no-op.
+func (*testIPv6Endpoint) Close() {}
+
+// NetworkProtocolNumber implements NetworkEndpoint.NetworkProtocolNumber.
+func (*testIPv6Endpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return header.IPv6ProtocolNumber
+}
+
+var _ NetworkProtocol = (*testIPv6Protocol)(nil)
+
+// testIPv6Protocol is an IPv6 NetworkProtocol that supports the bare minimum
+// to make a stack believe it supports IPv6.
+//
+// We use this instead of ipv6.protocol because the ipv6 package depends on
+// the stack package which this test lives in, causing a cyclic dependency.
+type testIPv6Protocol struct{}
+
+// Number implements NetworkProtocol.Number.
+func (*testIPv6Protocol) Number() tcpip.NetworkProtocolNumber {
+	return header.IPv6ProtocolNumber
+}
+
+// MinimumPacketSize implements NetworkProtocol.MinimumPacketSize.
+func (*testIPv6Protocol) MinimumPacketSize() int {
+	return header.IPv6MinimumSize
+}
+
+// DefaultPrefixLen implements NetworkProtocol.DefaultPrefixLen, in bits.
+func (*testIPv6Protocol) DefaultPrefixLen() int {
+	return header.IPv6AddressSize * 8
+}
+
+// ParseAddresses implements NetworkProtocol.ParseAddresses.
+func (*testIPv6Protocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) {
+	h := header.IPv6(v)
+	return h.SourceAddress(), h.DestinationAddress()
+}
+
+// NewEndpoint implements NetworkProtocol.NewEndpoint; it never fails.
+func (p *testIPv6Protocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, _ LinkAddressCache, _ TransportDispatcher, linkEP LinkEndpoint, _ *Stack) (NetworkEndpoint, *tcpip.Error) {
+	return &testIPv6Endpoint{
+		nicID:     nicID,
+		id:        NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen: addrWithPrefix.PrefixLen,
+		linkEP:    linkEP,
+		protocol:  p,
+	}, nil
+}
+
+// SetOption implements NetworkProtocol.SetOption; every option is accepted.
+func (*testIPv6Protocol) SetOption(interface{}) *tcpip.Error {
+	return nil
+}
+
+// Option implements NetworkProtocol.Option; nothing is written to the option.
+func (*testIPv6Protocol) Option(interface{}) *tcpip.Error {
+	return nil
+}
+
+// Close implements NetworkProtocol.Close as a no-op.
+func (*testIPv6Protocol) Close() {}
+
+// Wait implements NetworkProtocol.Wait as a no-op.
+func (*testIPv6Protocol) Wait() {}
+
+// Parse implements NetworkProtocol.Parse; it always returns zero values.
+func (*testIPv6Protocol) Parse(*PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	return 0, false, false
+}
+
+var _ LinkAddressResolver = (*testIPv6Protocol)(nil)
+
+// LinkAddressProtocol implements LinkAddressResolver.LinkAddressProtocol.
+func (*testIPv6Protocol) LinkAddressProtocol() tcpip.NetworkProtocolNumber {
+	return header.IPv6ProtocolNumber
+}
+
+// LinkAddressRequest implements LinkAddressResolver.LinkAddressRequest.
+func (*testIPv6Protocol) LinkAddressRequest(_, _ tcpip.Address, _ LinkEndpoint) *tcpip.Error {
+	return nil
+}
+
+// ResolveStaticAddress implements LinkAddressResolver.ResolveStaticAddress.
+func (*testIPv6Protocol) ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool) {
+	if header.IsV6MulticastAddress(addr) {
+		return header.EthernetAddressFromMulticastIPv6Address(addr), true
+	}
+	return "", false
+}
+
+// TestRemoveNICWhileHandlingRSTimer tests the race condition where a NIC is
+// removed and an RS timer fires at the same time.
+func TestRemoveNICWhileHandlingRSTimer(t *testing.T) {
+	const (
+		nicID               = 1
+		maxRtrSolicitations = 5
+	)
+
+	e := testLinkEndpoint{}
+	s := New(Options{
+		NetworkProtocols: []NetworkProtocol{&testIPv6Protocol{}},
+		NDPConfigs: NDPConfigurations{
+			MaxRtrSolicitations:     maxRtrSolicitations,
+			RtrSolicitationInterval: minimumRtrSolicitationInterval,
+		},
+	})
+
+	if err := s.CreateNIC(nicID, &e); err != nil {
+		t.Fatalf("s.CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	s.mu.Lock()
+	// Deferred so that the t.Fatalf below cannot leave the stack lock held.
+	defer s.mu.Unlock()
+	// Wait for the router solicitation timer to fire and block trying to obtain
+	// the stack lock when doing link address resolution.
+	time.Sleep(minimumRtrSolicitationInterval * 2)
+	if err := s.removeNICLocked(nicID); err != nil {
+		t.Fatalf("s.removeNICLocked(%d) = %s", nicID, err)
+	}
+}
+
+// TestDisabledRxStatsWhenNICDisabled checks that a packet delivered to a
+// disabled NIC is counted under DisabledRx and leaves the Rx counters untouched.
+func TestDisabledRxStatsWhenNICDisabled(t *testing.T) {
+	nic := NIC{
+		stats: makeNICStats(),
+	}
+	// Only stats is populated: when the NIC is disabled no other field matters.
+	if got := nic.stats.DisabledRx.Packets.Value(); got != 0 {
+		t.Errorf("got DisabledRx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.DisabledRx.Bytes.Value(); got != 0 {
+		t.Errorf("got DisabledRx.Bytes = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Packets.Value(); got != 0 {
+		t.Errorf("got Rx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Bytes.Value(); got != 0 {
+		t.Errorf("got Rx.Bytes = %d, want = 0", got)
+	}
+
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	nic.DeliverNetworkPacket("", "", 0, &PacketBuffer{Data: buffer.View([]byte{1, 2, 3, 4}).ToVectorisedView()})
+	// The single 4-byte packet must show up in the DisabledRx counters only.
+	if got := nic.stats.DisabledRx.Packets.Value(); got != 1 {
+		t.Errorf("got DisabledRx.Packets = %d, want = 1", got)
+	}
+	if got := nic.stats.DisabledRx.Bytes.Value(); got != 4 {
+		t.Errorf("got DisabledRx.Bytes = %d, want = 4", got)
+	}
+	if got := nic.stats.Rx.Packets.Value(); got != 0 {
+		t.Errorf("got Rx.Packets = %d, want = 0", got)
+	}
+	if got := nic.stats.Rx.Bytes.Value(); got != 0 {
+		t.Errorf("got Rx.Bytes = %d, want = 0", got)
+	}
+}
diff --git a/pkg/tcpip/stack/packet_buffer.go b/pkg/tcpip/stack/packet_buffer.go
new file mode 100644
index 000000000..1b5da6017
--- /dev/null
+++ b/pkg/tcpip/stack/packet_buffer.go
@@ -0,0 +1,115 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// A PacketBuffer contains all the data of a network packet.
+//
+// As a PacketBuffer traverses up the stack, it may be necessary to pass it to
+// multiple endpoints. Clone() should be called in such cases so that
+// modifications to the Data field do not affect other copies.
+type PacketBuffer struct {
+	// noCopy lets `go vet` flag accidental copies of a PacketBuffer.
+	_ noCopy
+
+	// PacketBufferEntry is used to build an intrusive list of PacketBuffers.
+	PacketBufferEntry
+
+	// Data holds the payload of the packet. For inbound packets, it also
+	// holds the headers, which are consumed as the packet moves up the
+	// stack. Headers are guaranteed not to be split across views.
+	//
+	// The bytes backing Data are immutable, but Data itself may be trimmed
+	// or otherwise modified.
+	Data buffer.VectorisedView
+
+	// Header holds the headers of outbound packets. As a packet is passed
+	// down the stack, each layer adds to Header. Note that forwarded
+	// packets don't populate Header on their way out -- their headers and
+	// payload are never parsed out and remain in Data.
+	//
+	// TODO(gvisor.dev/issue/170): Forwarded packets don't currently
+	// populate Header, but should. This will be doable once early parsing
+	// (https://github.com/google/gvisor/pull/1995) is supported.
+	Header buffer.Prependable
+
+	// These fields are used by both inbound and outbound packets. They
+	// typically overlap with the Data and Header fields.
+	//
+	// The bytes backing these views are immutable. Each field may be nil
+	// if either it has not been set yet or no such header exists (e.g.
+	// packets sent via loopback may not have a link header).
+	//
+	// These fields may be Views into other slices (either Data or Header).
+	// SR doesn't support this, so deep copies are necessary in some cases.
+	LinkHeader      buffer.View
+	NetworkHeader   buffer.View
+	TransportHeader buffer.View
+
+	// Hash is the transport layer hash of this packet. A value of zero
+	// indicates no valid hash has been set.
+	Hash uint32
+
+	// Owner is implemented by task to get the uid and gid.
+	// Only set for locally generated packets.
+	Owner tcpip.PacketOwner
+
+	// The following fields are only set by the qdisc layer when the packet
+	// is added to a queue.
+	EgressRoute           *Route
+	GSOOptions            *GSO
+	NetworkProtocolNumber tcpip.NetworkProtocolNumber
+
+	// NatDone indicates if the packet has been manipulated as per NAT
+	// iptables rule.
+	NatDone bool
+}
+
+// Clone returns a new PacketBuffer whose fields are shallow copies of pk's.
+// Data is cloned into a fresh VectorisedView, but the bytes behind it, like
+// everything referenced by the other fields, are shared with pk.
+//
+// FIXME(b/153685824): Data gets copied but not other header references.
+func (pk *PacketBuffer) Clone() *PacketBuffer {
+	// Start from the cloned Data, then carry every other field over as is.
+	dup := &PacketBuffer{Data: pk.Data.Clone(nil)}
+	dup.PacketBufferEntry = pk.PacketBufferEntry
+	dup.Header = pk.Header
+	dup.LinkHeader = pk.LinkHeader
+	dup.NetworkHeader = pk.NetworkHeader
+	dup.TransportHeader = pk.TransportHeader
+	dup.Hash = pk.Hash
+	dup.Owner = pk.Owner
+	dup.EgressRoute = pk.EgressRoute
+	dup.GSOOptions = pk.GSOOptions
+	dup.NetworkProtocolNumber = pk.NetworkProtocolNumber
+	dup.NatDone = pk.NatDone
+
+	return dup
+}
+
+// noCopy may be added to structs (embedded or as a blank field) which must
+// not be copied after the first use.
+//
+// See https://golang.org/issues/8005#issuecomment-190753527
+// for details.
+type noCopy struct{}
+
+// Lock, like Unlock below, is a no-op used by -copylocks checker from `go vet`.
+func (*noCopy) Lock()   {}
+func (*noCopy) Unlock() {}
diff --git a/pkg/tcpip/stack/rand.go b/pkg/tcpip/stack/rand.go
new file mode 100644
index 000000000..421fb5c15
--- /dev/null
+++ b/pkg/tcpip/stack/rand.go
@@ -0,0 +1,40 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	mathrand "math/rand"
+
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// lockedRandomSource provides a threadsafe rand.Source.
+type lockedRandomSource struct {
+	mu  sync.Mutex      // mu serializes every call made to src.
+	src mathrand.Source // src is the wrapped source; only used with mu held.
+}
+
+// Int63 returns the next value of the wrapped source, drawn while holding mu.
+func (r *lockedRandomSource) Int63() (n int64) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	return r.src.Int63()
+}
+
+func (r *lockedRandomSource) Seed(seed int64) {
+	r.mu.Lock()
+	defer r.mu.Unlock() // Released once the wrapped source has been reseeded.
+	r.src.Seed(seed)
+}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
new file mode 100644
index 000000000..5cbc946b6
--- /dev/null
+++ b/pkg/tcpip/stack/registration.go
@@ -0,0 +1,560 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// NetworkEndpointID is the identifier of a network layer protocol endpoint.
+// Currently the local address is sufficient because all supported protocols
+// (i.e., IPv4 and IPv6) have different sizes for their addresses.
+type NetworkEndpointID struct {
+	LocalAddress tcpip.Address
+}
+
+// TransportEndpointID is the identifier of a transport layer protocol endpoint.
+//
+// +stateify savable
+type TransportEndpointID struct {
+	// LocalPort is the local port associated with the endpoint.
+	LocalPort uint16
+
+	// LocalAddress is the local [network layer] address associated with
+	// the endpoint.
+	LocalAddress tcpip.Address
+
+	// RemotePort is the remote port associated with the endpoint.
+	RemotePort uint16
+
+	// RemoteAddress is the remote [network layer] address associated with
+	// the endpoint.
+	RemoteAddress tcpip.Address
+}
+
+// ControlType is the type of network control message.
+type ControlType int
+
+// The following are the allowed values for ControlType.
+const (
+	ControlPacketTooBig ControlType = iota
+	ControlPortUnreachable
+	ControlUnknown
+)
+
+// TransportEndpoint is the interface that needs to be implemented by transport
+// protocol (e.g., tcp, udp) endpoints that can handle packets.
+type TransportEndpoint interface {
+	// UniqueID returns a unique ID for this transport endpoint.
+	UniqueID() uint64
+
+	// HandlePacket is called by the stack when new packets arrive to
+	// this transport endpoint. It sets pkt.TransportHeader.
+	//
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer)
+
+	// HandleControlPacket is called by the stack when new control (e.g.
+	// ICMP) packets arrive to this transport endpoint.
+	// HandleControlPacket takes ownership of pkt.
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer)
+
+	// Abort initiates an expedited endpoint teardown. It puts the endpoint
+	// in a closed state and frees all resources associated with it. This
+	// cleanup may happen asynchronously. Wait can be used to block on this
+	// asynchronous cleanup.
+	Abort()
+
+	// Wait waits for any worker goroutines owned by the endpoint to stop.
+	//
+	// An endpoint can be requested to stop its worker goroutines by calling
+	// its Close method.
+	//
+	// Wait will not block if the endpoint hasn't started any goroutines
+	// yet, even if it might later.
+	Wait()
+}
+
+// RawTransportEndpoint is the interface that needs to be implemented by raw
+// transport protocol endpoints. RawTransportEndpoints receive the entire
+// packet - including the network and transport headers - as delivered to
+// netstack.
+type RawTransportEndpoint interface {
+	// HandlePacket is called by the stack when new packets arrive to
+	// this transport endpoint. The packet contains all data from the link
+	// layer up.
+	//
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, pkt *PacketBuffer)
+}
+
+// PacketEndpoint is the interface that needs to be implemented by packet
+// transport protocol endpoints. These endpoints receive link layer headers in
+// addition to whatever they contain (usually network and transport layer
+// headers and a payload).
+type PacketEndpoint interface {
+	// HandlePacket is called by the stack when new packets arrive that
+	// match the endpoint.
+	//
+	// Implementers should treat pkt as immutable and should copy it
+	// before modification.
+	//
+	// linkHeader may have a length of 0, in which case the PacketEndpoint
+	// should construct its own ethernet header for applications.
+	//
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(nicID tcpip.NICID, addr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
+}
+
+// TransportProtocol is the interface that needs to be implemented by transport
+// protocols (e.g., tcp, udp) that want to be part of the networking stack.
+type TransportProtocol interface {
+	// Number returns the transport protocol number.
+	Number() tcpip.TransportProtocolNumber
+
+	// NewEndpoint creates a new endpoint of the transport protocol.
+	NewEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+
+	// NewRawEndpoint creates a new raw endpoint of the transport protocol.
+	NewRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+
+	// MinimumPacketSize returns the minimum valid packet size of this
+	// transport protocol. The stack automatically drops any packets smaller
+	// than this targeted at this protocol.
+	MinimumPacketSize() int
+
+	// ParsePorts returns the source and destination ports stored in a
+	// packet of this protocol.
+	ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
+
+	// HandleUnknownDestinationPacket handles packets targeted at this
+	// protocol but that don't match any existing endpoint. For example,
+	// it is targeted at a port that has no listeners.
+	//
+	// The return value indicates whether the packet was well-formed (for
+	// stats purposes only).
+	//
+	// HandleUnknownDestinationPacket takes ownership of pkt.
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
+
+	// SetOption allows enabling/disabling protocol specific features.
+	// SetOption returns an error if the option is not supported or the
+	// provided option value is invalid.
+	SetOption(option interface{}) *tcpip.Error
+
+	// Option allows retrieving protocol specific option values.
+	// Option returns an error if the option is not supported or the
+	// provided option value is invalid.
+	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
+
+	// Parse sets pkt.TransportHeader and trims pkt.Data appropriately. It does
+	// neither and returns false if pkt.Data is too small, i.e. pkt.Data.Size() <
+	// MinimumPacketSize().
+	Parse(pkt *PacketBuffer) (ok bool)
+}
+
+// TransportDispatcher contains the methods used by the network stack to deliver
+// packets to the appropriate transport endpoint after it has been handled by
+// the network layer.
+type TransportDispatcher interface {
+	// DeliverTransportPacket delivers packets to the appropriate
+	// transport protocol endpoint.
+	//
+	// pkt.NetworkHeader must be set before calling DeliverTransportPacket.
+	//
+	// DeliverTransportPacket takes ownership of pkt.
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer)
+
+	// DeliverTransportControlPacket delivers control packets to the
+	// appropriate transport protocol endpoint.
+	//
+	// pkt.NetworkHeader must be set before calling
+	// DeliverTransportControlPacket.
+	//
+	// DeliverTransportControlPacket takes ownership of pkt.
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer)
+}
+
+// PacketLooping specifies where an outbound packet should be sent.
+type PacketLooping byte
+
+const (
+	// PacketOut indicates that the packet should be passed to the link
+	// endpoint.
+	PacketOut PacketLooping = 1 << iota
+
+	// PacketLoop indicates that the packet should be handled locally.
+	PacketLoop
+)
+
+// NetworkHeaderParams are the header parameters given as input by the
+// transport endpoint to the network.
+type NetworkHeaderParams struct {
+	// Protocol refers to the transport protocol number.
+	Protocol tcpip.TransportProtocolNumber
+
+	// TTL refers to Time To Live field of the IP-header.
+	TTL uint8
+
+	// TOS refers to TypeOfService or TrafficClass field of the IP-header.
+	TOS uint8
+}
+
+// NetworkEndpoint is the interface that needs to be implemented by endpoints
+// of network layer protocols (e.g., ipv4, ipv6).
+type NetworkEndpoint interface {
+	// DefaultTTL is the default time-to-live value (or hop limit, in ipv6)
+	// for this endpoint.
+	DefaultTTL() uint8
+
+	// MTU is the maximum transmission unit for this endpoint. This is
+	// generally calculated as the MTU of the underlying data link endpoint
+	// minus the network endpoint max header length.
+	MTU() uint32
+
+	// Capabilities returns the set of capabilities supported by the
+	// underlying link-layer endpoint.
+	Capabilities() LinkEndpointCapabilities
+
+	// MaxHeaderLength returns the maximum size the network (and lower
+	// level layers combined) headers can have. Higher levels use this
+	// information to reserve space in the front of the packets they're
+	// building.
+	MaxHeaderLength() uint16
+
+	// WritePacket writes a packet to the given destination address and
+	// protocol. It takes ownership of pkt. pkt.TransportHeader must have already
+	// been set.
+	WritePacket(r *Route, gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error
+
+	// WritePackets writes packets to the given destination address and
+	// protocol. pkts must not be zero length. It takes ownership of pkts and
+	// underlying packets.
+	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error)
+
+	// WriteHeaderIncludedPacket writes a packet that includes a network
+	// header to the given destination address. It takes ownership of pkt.
+	WriteHeaderIncludedPacket(r *Route, pkt *PacketBuffer) *tcpip.Error
+
+	// ID returns the network protocol endpoint ID.
+	ID() *NetworkEndpointID
+
+	// PrefixLen returns the network endpoint's subnet prefix length in bits.
+	PrefixLen() int
+
+	// NICID returns the id of the NIC this endpoint belongs to.
+	NICID() tcpip.NICID
+
+	// HandlePacket is called by the link layer when new packets arrive to
+	// this network endpoint. It sets pkt.NetworkHeader.
+	//
+	// HandlePacket takes ownership of pkt.
+	HandlePacket(r *Route, pkt *PacketBuffer)
+
+	// Close is called when the endpoint is removed from a stack.
+	Close()
+
+	// NetworkProtocolNumber returns the tcpip.NetworkProtocolNumber for
+	// this endpoint.
+	NetworkProtocolNumber() tcpip.NetworkProtocolNumber
+}
+
+// NetworkProtocol is the interface that needs to be implemented by network
+// protocols (e.g., ipv4, ipv6) that want to be part of the networking stack.
+type NetworkProtocol interface {
+	// Number returns the network protocol number.
+	Number() tcpip.NetworkProtocolNumber
+
+	// MinimumPacketSize returns the minimum valid packet size of this
+	// network protocol. The stack automatically drops any packets smaller
+	// than this targeted at this protocol.
+	MinimumPacketSize() int
+
+	// DefaultPrefixLen returns the protocol's default prefix length.
+	DefaultPrefixLen() int
+
+	// ParseAddresses returns the source and destination addresses stored in a
+	// packet of this protocol.
+	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
+
+	// NewEndpoint creates a new endpoint of this protocol.
+	NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint, st *Stack) (NetworkEndpoint, *tcpip.Error)
+
+	// SetOption allows enabling/disabling protocol specific features.
+	// SetOption returns an error if the option is not supported or the
+	// provided option value is invalid.
+	SetOption(option interface{}) *tcpip.Error
+
+	// Option allows retrieving protocol specific option values.
+	// Option returns an error if the option is not supported or the
+	// provided option value is invalid.
+	Option(option interface{}) *tcpip.Error
+
+	// Close requests that any worker goroutines owned by the protocol
+	// stop.
+	Close()
+
+	// Wait waits for any worker goroutines owned by the protocol to stop.
+	Wait()
+
+	// Parse sets pkt.NetworkHeader and trims pkt.Data appropriately. It
+	// returns:
+	// - The encapsulated protocol, if present.
+	// - Whether there is an encapsulated transport protocol payload (e.g. ARP
+	//   does not encapsulate anything).
+	// - Whether pkt.Data was large enough to parse and set pkt.NetworkHeader.
+	Parse(pkt *PacketBuffer) (proto tcpip.TransportProtocolNumber, hasTransportHdr bool, ok bool)
+}
+
+// NetworkDispatcher contains the methods used by the network stack to deliver
+// packets to the appropriate network endpoint after it has been handled by
+// the data link layer.
+type NetworkDispatcher interface {
+	// DeliverNetworkPacket finds the appropriate network protocol endpoint
+	// and hands the packet over for further processing.
+	//
+	// pkt.LinkHeader may or may not be set before calling
+	// DeliverNetworkPacket. Some packets do not have link headers (e.g.
+	// packets sent via loopback), and won't have the field set.
+	//
+	// DeliverNetworkPacket takes ownership of pkt.
+	DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
+}
+
+// LinkEndpointCapabilities is the type associated with the capabilities
+// supported by a link-layer endpoint. It is a set of bitfields.
+type LinkEndpointCapabilities uint
+
+// The following are the supported link endpoint capabilities.
+const (
+	CapabilityNone LinkEndpointCapabilities = 0
+	// CapabilityTXChecksumOffload indicates that the link endpoint supports
+	// checksum computation for outgoing packets and the stack can skip
+	// computing checksums when sending packets.
+	CapabilityTXChecksumOffload LinkEndpointCapabilities = 1 << iota
+	// CapabilityRXChecksumOffload indicates that the link endpoint supports
+	// checksum verification on received packets and that it's safe for the
+	// stack to skip checksum verification.
+	CapabilityRXChecksumOffload
+	CapabilityResolutionRequired
+	CapabilitySaveRestore
+	CapabilityDisconnectOk
+	CapabilityLoopback
+	CapabilityHardwareGSO
+
+	// CapabilitySoftwareGSO indicates the link endpoint supports sending
+	// multiple packets using a single call (LinkEndpoint.WritePackets).
+	CapabilitySoftwareGSO
+)
+
+// LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
+// ethernet, loopback, raw) and used by network layer protocols to send packets
+// out through the implementer's data link endpoint. When a link header exists,
+// it sets each PacketBuffer's LinkHeader field before passing it up the
+// stack.
+type LinkEndpoint interface {
+	// MTU is the maximum transmission unit for this endpoint. This is
+	// usually dictated by the backing physical network; when such a
+	// physical network doesn't exist, the limit is generally 64k, which
+	// includes the maximum size of an IP packet.
+	MTU() uint32
+
+	// Capabilities returns the set of capabilities supported by the
+	// endpoint.
+	Capabilities() LinkEndpointCapabilities
+
+	// MaxHeaderLength returns the maximum size the data link (and
+	// lower level layers combined) headers can have. Higher levels use this
+	// information to reserve space in the front of the packets they're
+	// building.
+	MaxHeaderLength() uint16
+
+	// LinkAddress returns the link address (typically a MAC) of the
+	// link endpoint.
+	LinkAddress() tcpip.LinkAddress
+
+	// WritePacket writes a packet with the given protocol through the
+	// given route. It takes ownership of pkt. pkt.NetworkHeader and
+	// pkt.TransportHeader must have already been set.
+	//
+	// To participate in transparent bridging, a LinkEndpoint implementation
+	// should call eth.Encode with header.EthernetFields.SrcAddr set to
+	// r.LocalLinkAddress if it is provided.
+	WritePacket(r *Route, gso *GSO, protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer) *tcpip.Error
+
+	// WritePackets writes packets with the given protocol through the
+	// given route. pkts must not be zero length. It takes ownership of pkts and
+	// underlying packets.
+	//
+	// Right now, WritePackets is used only when the software segmentation
+	// offload is enabled. If it will be used for something else, it may
+	// require to change syscall filters.
+	WritePackets(r *Route, gso *GSO, pkts PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error)
+
+	// WriteRawPacket writes a packet directly to the link. The packet
+	// should already have an ethernet header. It takes ownership of vv.
+	WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error
+
+	// Attach attaches the data link layer endpoint to the network-layer
+	// dispatcher of the stack.
+	//
+	// Attach will be called with a nil dispatcher if the receiver's associated
+	// NIC is being removed.
+	Attach(dispatcher NetworkDispatcher)
+
+	// IsAttached returns whether a NetworkDispatcher is attached to the
+	// endpoint.
+	IsAttached() bool
+
+	// Wait waits for any worker goroutines owned by the endpoint to stop.
+	//
+	// For now, requesting that an endpoint's worker goroutine(s) stop is
+	// implementation specific.
+	//
+	// Wait will not block if the endpoint hasn't started any goroutines
+	// yet, even if it might later.
+	Wait()
+}
+
+// InjectableLinkEndpoint is a LinkEndpoint where inbound packets are
+// delivered via the InjectInbound method.
+type InjectableLinkEndpoint interface {
+	LinkEndpoint
+
+	// InjectInbound injects an inbound packet.
+	InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *PacketBuffer)
+
+	// InjectOutbound writes a fully formed outbound packet directly to the
+	// link.
+	//
+	// dest is used by endpoints with multiple raw destinations.
+	InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error
+}
+
+// A LinkAddressResolver is an extension to a NetworkProtocol that
+// can resolve link addresses.
+type LinkAddressResolver interface {
+	// LinkAddressRequest sends a request for the LinkAddress of addr.
+	// The request is sent on linkEP with localAddr as the source.
+	//
+	// A valid response will cause the discovery protocol's network
+	// endpoint to call AddLinkAddress.
+	LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error
+
+	// ResolveStaticAddress attempts to resolve address without sending
+	// requests. It either resolves the name immediately or returns the
+	// empty LinkAddress.
+	//
+	// It can be used to resolve broadcast addresses for example.
+	ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool)
+
+	// LinkAddressProtocol returns the network protocol of the
+	// addresses this resolver can resolve.
+	LinkAddressProtocol() tcpip.NetworkProtocolNumber
+}
+
+// A LinkAddressCache caches link addresses.
+type LinkAddressCache interface {
+	// CheckLocalAddress determines if the given local address exists and, if it
+	// does, returns the ID of the NIC it is bound to (presumably 0 otherwise; verify).
+	CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID
+
+	// AddLinkAddress adds a link address to the cache.
+	AddLinkAddress(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress)
+
+	// GetLinkAddress looks up the cache to translate address to link address (e.g. IP -> MAC).
+	// If the LinkEndpoint requests address resolution and there is a LinkAddressResolver
+	// registered with the network protocol, the cache attempts to resolve the address
+	// and returns ErrWouldBlock. Waker is notified when address resolution is
+	// complete (success or not).
+	//
+	// If address resolution is required, ErrNoLinkAddress and a notification channel is
+	// returned for the top level caller to block. Channel is closed once address resolution
+	// is complete (success or not).
+	GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error)
+
+	// RemoveWaker removes a waker that has been added in GetLinkAddress().
+	RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker)
+}
+
+// RawFactory produces endpoints for writing various types of raw packets.
+type RawFactory interface {
+	// NewUnassociatedEndpoint produces endpoints for writing packets not
+	// associated with a particular transport protocol. Such endpoints can
+	// be used to write arbitrary packets that include the network header.
+	NewUnassociatedEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+
+	// NewPacketEndpoint produces endpoints for reading and writing packets
+	// that include network and (when cooked is false) link layer headers.
+	NewPacketEndpoint(stack *Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+}
+
+// GSOType is the type of GSO segments.
+//
+// +stateify savable
+type GSOType int
+
+// Types of gso segments.
+const (
+	GSONone GSOType = iota
+
+	// Hardware GSO types:
+	GSOTCPv4
+	GSOTCPv6
+
+	// GSOSW is used for software GSO segments which have to be sent by
+	// endpoint.WritePackets.
+	GSOSW
+)
+
+// GSO contains generic segmentation offload properties.
+//
+// +stateify savable
+type GSO struct {
+	// Type is one of GSONone, GSOTCPv4, etc.
+	Type GSOType
+	// NeedsCsum is set if the checksum offload is enabled.
+	NeedsCsum bool
+	// CsumOffset is offset after that to place checksum.
+	CsumOffset uint16
+
+	// MSS is the maximum segment size.
+	MSS uint16
+	// L3HdrLen is the L3 (IP) header length.
+	L3HdrLen uint16
+
+	// MaxSize is the maximum GSO packet size.
+	MaxSize uint32
+}
+
+// GSOEndpoint provides access to GSO properties.
+type GSOEndpoint interface {
+	// GSOMaxSize returns the maximum GSO packet size.
+	GSOMaxSize() uint32
+}
+
+// SoftwareGSOMaxSize is a maximum allowed size of a software GSO segment.
+// This isn't a hard limit, because it is never set into packet headers.
+const SoftwareGSOMaxSize = (1 << 16)
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
new file mode 100644
index 000000000..d65f8049e
--- /dev/null
+++ b/pkg/tcpip/stack/route.go
@@ -0,0 +1,289 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+)
+
+// Route represents a route through the networking stack to a given destination.
+type Route struct {
+	// RemoteAddress is the final destination of the route.
+	RemoteAddress tcpip.Address
+
+	// RemoteLinkAddress is the link-layer (MAC) address of the
+	// final destination of the route.
+	RemoteLinkAddress tcpip.LinkAddress
+
+	// LocalAddress is the local address where the route starts.
+	LocalAddress tcpip.Address
+
+	// LocalLinkAddress is the link-layer (MAC) address of
+	// where the route starts.
+	LocalLinkAddress tcpip.LinkAddress
+
+	// NextHop is the next node in the path to the destination.
+	NextHop tcpip.Address
+
+	// NetProto is the network-layer protocol.
+	NetProto tcpip.NetworkProtocolNumber
+
+	// ref is a reference to the network endpoint through which the route
+	// starts.
+	ref *referencedNetworkEndpoint
+
+	// Loop controls where WritePacket should send packets.
+	Loop PacketLooping
+}
+
+// makeRoute initializes a new route. It takes ownership of the provided
+// reference to a network endpoint.
+func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, handleLocal, multicastLoop bool) Route {
+	loop := PacketOut
+	if handleLocal && localAddr != "" && remoteAddr == localAddr {
+		loop = PacketLoop
+	} else if multicastLoop && (header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)) {
+		loop |= PacketLoop
+	} else if remoteAddr == header.IPv4Broadcast {
+		loop |= PacketLoop
+	}
+
+	return Route{
+		NetProto:         netProto,
+		LocalAddress:     localAddr,
+		LocalLinkAddress: localLinkAddr,
+		RemoteAddress:    remoteAddr,
+		ref:              ref,
+		Loop:             loop,
+	}
+}
+
+// NICID returns the id of the NIC from which this route originates.
+func (r *Route) NICID() tcpip.NICID {
+	return r.ref.ep.NICID()
+}
+
+// MaxHeaderLength forwards the call to the network endpoint's implementation.
+func (r *Route) MaxHeaderLength() uint16 {
+	return r.ref.ep.MaxHeaderLength()
+}
+
+// Stats returns a mutable copy of current stats.
+func (r *Route) Stats() tcpip.Stats {
+	return r.ref.nic.stack.Stats()
+}
+
+// PseudoHeaderChecksum calls header.PseudoHeaderChecksum for protocol and
+// totalLen using the route's local and remote addresses.
+func (r *Route) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, totalLen uint16) uint16 {
+	return header.PseudoHeaderChecksum(protocol, r.LocalAddress, r.RemoteAddress, totalLen)
+}
+
+// Capabilities returns the link-layer capabilities of the route.
+func (r *Route) Capabilities() LinkEndpointCapabilities {
+	return r.ref.ep.Capabilities()
+}
+
+// GSOMaxSize returns the maximum GSO packet size, or 0 if the endpoint is not a GSOEndpoint.
+func (r *Route) GSOMaxSize() uint32 {
+	if gso, ok := r.ref.ep.(GSOEndpoint); ok {
+		return gso.GSOMaxSize()
+	}
+	return 0
+}
+
+// Resolve attempts to resolve the link address if necessary. Returns ErrWouldBlock in
+// case address resolution requires blocking, e.g. wait for ARP reply. Waker is
+// notified when address resolution is complete (success or not).
+//
+// If address resolution is required, ErrNoLinkAddress and a notification channel is
+// returned for the top level caller to block. Channel is closed once address resolution
+// is complete (success or not).
+//
+// The NIC r uses must not be locked.
+func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
+	if !r.IsResolutionRequired() {
+		// Nothing to do if there is no cache (which does the resolution on cache miss) or
+		// link address is already known.
+		return nil, nil
+	}
+
+	nextAddr := r.NextHop
+	if nextAddr == "" {
+		// Local link address is already known.
+		if r.RemoteAddress == r.LocalAddress {
+			r.RemoteLinkAddress = r.LocalLinkAddress
+			return nil, nil
+		}
+		nextAddr = r.RemoteAddress
+	}
+	linkAddr, ch, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), nextAddr, r.LocalAddress, r.NetProto, waker)
+	if err != nil {
+		return ch, err
+	}
+	r.RemoteLinkAddress = linkAddr
+	return nil, nil
+}
+
+// RemoveWaker removes a waker that has been added in Resolve().
+func (r *Route) RemoveWaker(waker *sleep.Waker) {
+	nextAddr := r.NextHop
+	if nextAddr == "" {
+		nextAddr = r.RemoteAddress
+	}
+	r.ref.linkCache.RemoveWaker(r.ref.nic.ID(), nextAddr, waker)
+}
+
+// IsResolutionRequired returns true if Resolve() must be called to resolve
+// the link address before this route can be written to.
+//
+// The NIC r uses must not be locked.
+func (r *Route) IsResolutionRequired() bool {
+	return r.ref.isValidForOutgoing() && r.ref.linkCache != nil && r.RemoteLinkAddress == ""
+}
+
+// WritePacket writes the packet through the given route.
+func (r *Route) WritePacket(gso *GSO, params NetworkHeaderParams, pkt *PacketBuffer) *tcpip.Error {
+	if !r.ref.isValidForOutgoing() {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	// WritePacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Header.UsedLength() + pkt.Data.Size()
+
+	err := r.ref.ep.WritePacket(r, gso, params, pkt)
+	if err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+	} else {
+		r.ref.nic.stats.Tx.Packets.Increment()
+		r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
+	}
+	return err
+}
+
+// WritePackets writes a list of n packets through the given route and returns
+// the number of packets written.
+func (r *Route) WritePackets(gso *GSO, pkts PacketBufferList, params NetworkHeaderParams) (int, *tcpip.Error) {
+	if !r.ref.isValidForOutgoing() {
+		return 0, tcpip.ErrInvalidEndpointState
+	}
+
+	// WritePackets takes ownership of pkts, calculate length first.
+	numPkts := pkts.Len()
+
+	n, err := r.ref.ep.WritePackets(r, gso, pkts, params)
+	if err != nil {
+		r.Stats().IP.OutgoingPacketErrors.IncrementBy(uint64(numPkts - n))
+	}
+	r.ref.nic.stats.Tx.Packets.IncrementBy(uint64(n))
+
+	writtenBytes := 0
+	for i, pb := 0, pkts.Front(); i < n && pb != nil; i, pb = i+1, pb.Next() {
+		writtenBytes += pb.Header.UsedLength()
+		writtenBytes += pb.Data.Size()
+	}
+
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(writtenBytes))
+	return n, err
+}
+
+// WriteHeaderIncludedPacket writes a packet already containing a network
+// header through the given route.
+func (r *Route) WriteHeaderIncludedPacket(pkt *PacketBuffer) *tcpip.Error {
+	if !r.ref.isValidForOutgoing() {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	// WriteHeaderIncludedPacket takes ownership of pkt, calculate numBytes first.
+	numBytes := pkt.Data.Size()
+
+	if err := r.ref.ep.WriteHeaderIncludedPacket(r, pkt); err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return err
+	}
+	r.ref.nic.stats.Tx.Packets.Increment()
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(numBytes))
+	return nil
+}
+
+// DefaultTTL returns the default TTL of the underlying network endpoint.
+func (r *Route) DefaultTTL() uint8 {
+	return r.ref.ep.DefaultTTL()
+}
+
+// MTU returns the MTU of the underlying network endpoint.
+func (r *Route) MTU() uint32 {
+	return r.ref.ep.MTU()
+}
+
+// NetworkProtocolNumber returns the NetworkProtocolNumber of the underlying
+// network endpoint.
+func (r *Route) NetworkProtocolNumber() tcpip.NetworkProtocolNumber {
+	return r.ref.ep.NetworkProtocolNumber()
+}
+
+// Release frees all resources associated with the route.
+func (r *Route) Release() {
+	if r.ref != nil {
+		r.ref.decRef()
+		r.ref = nil
+	}
+}
+
+// Clone clones a route such that the original one can be released and the new
+// one will remain valid.
+func (r *Route) Clone() Route {
+	if r.ref != nil {
+		r.ref.incRef()
+	}
+	return *r
+}
+
+// MakeLoopedRoute duplicates the given route with special handling for routes
+// used for sending multicast or broadcast packets. In those cases the
+// multicast/broadcast address is the remote address when sending out, but for
+// incoming (looped) packets it becomes the local address. Similarly, the local
+// interface address that was the local address going out becomes the remote
+// address coming in. This is different to unicast routes where local and
+// remote addresses remain the same as they identify location (local vs remote)
+// not direction (source vs destination).
+func (r *Route) MakeLoopedRoute() Route {
+	l := r.Clone()
+	if r.RemoteAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(r.RemoteAddress) || header.IsV6MulticastAddress(r.RemoteAddress) {
+		l.RemoteAddress, l.LocalAddress = l.LocalAddress, l.RemoteAddress
+		l.RemoteLinkAddress = l.LocalLinkAddress
+	}
+	return l
+}
+
+// Stack returns the instance of the Stack that owns this route.
+func (r *Route) Stack() *Stack {
+	return r.ref.stack()
+}
+
+// ReverseRoute returns a new route from dst to src that shares r's ref (no new reference is taken).
+func (r *Route) ReverseRoute(src tcpip.Address, dst tcpip.Address) Route {
+	return Route{
+		NetProto:          r.NetProto,
+		LocalAddress:      dst,
+		LocalLinkAddress:  r.RemoteLinkAddress,
+		RemoteAddress:     src,
+		RemoteLinkAddress: r.LocalLinkAddress,
+		ref:               r.ref,
+		Loop:              r.Loop,
+	}
+}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
new file mode 100644
index 000000000..cdcfb8321
--- /dev/null
+++ b/pkg/tcpip/stack/stack.go
@@ -0,0 +1,1938 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package stack provides the glue between networking protocols and the
+// consumers of the networking stack.
+//
+// For consumers, the only function of interest is New(), everything else is
+// provided by the tcpip/public package.
+package stack
+
+import (
+	"bytes"
+	"encoding/binary"
+	mathrand "math/rand"
+	"sync/atomic"
+	"time"
+
+	"golang.org/x/time/rate"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// ageLimit is set to the same cache stale time used in Linux.
+	ageLimit = 1 * time.Minute
+	// resolutionTimeout is set to the same ARP timeout used in Linux.
+	resolutionTimeout = 1 * time.Second
+	// resolutionAttempts is set to the same ARP retries used in Linux.
+	resolutionAttempts = 3
+
+	// DefaultTOS is the default type of service value for network endpoints.
+	DefaultTOS = 0
+)
+
+type transportProtocolState struct {
+	proto          TransportProtocol
+	defaultHandler func(r *Route, id TransportEndpointID, pkt *PacketBuffer) bool
+}
+
+// TCPProbeFunc is the expected function type for a TCP probe function to be
+// passed to stack.AddTCPProbe.
+type TCPProbeFunc func(s TCPEndpointState)
+
+// TCPCubicState is used to hold a copy of the internal cubic state when the
+// TCPProbeFunc is invoked.
+type TCPCubicState struct {
+	WLastMax                float64
+	WMax                    float64
+	T                       time.Time
+	TimeSinceLastCongestion time.Duration
+	C                       float64
+	K                       float64
+	Beta                    float64
+	WC                      float64
+	WEst                    float64
+}
+
+// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
+type TCPEndpointID struct {
+	// LocalPort is the local port associated with the endpoint.
+	LocalPort uint16
+
+	// LocalAddress is the local [network layer] address associated with
+	// the endpoint.
+	LocalAddress tcpip.Address
+
+	// RemotePort is the remote port associated with the endpoint.
+	RemotePort uint16
+
+	// RemoteAddress is the remote [network layer] address associated with
+	// the endpoint.
+	RemoteAddress tcpip.Address
+}
+
+// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
+// TCP endpoint.
+type TCPFastRecoveryState struct {
+	// Active if true indicates the endpoint is in fast recovery.
+	Active bool
+
+	// First is the first unacknowledged sequence number being recovered.
+	First seqnum.Value
+
+	// Last is the 'recover' sequence number that indicates the point at
+	// which we should exit recovery barring any timeouts etc.
+	Last seqnum.Value
+
+	// MaxCwnd is the maximum value we are permitted to grow the congestion
+	// window during recovery. This is set at the time we enter recovery.
+	MaxCwnd int
+
+	// HighRxt is the highest sequence number which has been retransmitted
+	// during the current loss recovery phase.
+	// See: RFC 6675 Section 2 for details.
+	HighRxt seqnum.Value
+
+	// RescueRxt is the highest sequence number which has been
+	// optimistically retransmitted to prevent stalling of the ACK clock
+	// when there is loss at the end of the window and no new data is
+	// available for transmission.
+	// See: RFC 6675 Section 2 for details.
+	RescueRxt seqnum.Value
+}
+
+// TCPReceiverState holds a copy of the internal state of the receiver for
+// a given TCP endpoint.
+type TCPReceiverState struct {
+	// RcvNxt is the TCP variable RCV.NXT.
+	RcvNxt seqnum.Value
+
+	// RcvAcc is the TCP variable RCV.ACC.
+	RcvAcc seqnum.Value
+
+	// RcvWndScale is the window scaling to use for inbound segments.
+	RcvWndScale uint8
+
+	// PendingBufUsed is the number of bytes pending in the receive
+	// queue.
+	PendingBufUsed seqnum.Size
+
+	// PendingBufSize is the size of the socket receive buffer.
+	PendingBufSize seqnum.Size
+}
+
+// TCPSenderState holds a copy of the internal state of the sender for
+// a given TCP Endpoint.
+type TCPSenderState struct {
+	// LastSendTime is the time at which we sent the last segment.
+	LastSendTime time.Time
+
+	// DupAckCount is the number of Duplicate ACK's received.
+	DupAckCount int
+
+	// SndCwnd is the size of the sending congestion window in packets.
+	SndCwnd int
+
+	// Ssthresh is the slow start threshold in packets.
+	Ssthresh int
+
+	// SndCAAckCount is the number of packets consumed in congestion
+	// avoidance mode.
+	SndCAAckCount int
+
+	// Outstanding is the number of packets in flight.
+	Outstanding int
+
+	// SndWnd is the send window size in bytes.
+	SndWnd seqnum.Size
+
+	// SndUna is the next unacknowledged sequence number.
+	SndUna seqnum.Value
+
+	// SndNxt is the sequence number of the next segment to be sent.
+	SndNxt seqnum.Value
+
+	// RTTMeasureSeqNum is the sequence number being used for the latest RTT
+	// measurement.
+	RTTMeasureSeqNum seqnum.Value
+
+	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
+	RTTMeasureTime time.Time
+
+	// Closed indicates that the caller has closed the endpoint for sending.
+	Closed bool
+
+	// SRTT is the smoothed round-trip time as defined in section 2 of
+	// RFC 6298.
+	SRTT time.Duration
+
+	// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
+	RTO time.Duration
+
+	// RTTVar is the round-trip time variation as defined in section 2 of
+	// RFC 6298.
+	RTTVar time.Duration
+
+	// SRTTInited if true indicates that a valid RTT measurement has been
+	// completed.
+	SRTTInited bool
+
+	// MaxPayloadSize is the maximum size of the payload of a given segment.
+	// It is initialized on demand.
+	MaxPayloadSize int
+
+	// SndWndScale is the number of bits to shift left when reading the send
+	// window size from a segment.
+	SndWndScale uint8
+
+	// MaxSentAck is the highest acknowledgement number sent till now.
+	MaxSentAck seqnum.Value
+
+	// FastRecovery holds the fast recovery state for the endpoint.
+	FastRecovery TCPFastRecoveryState
+
+	// Cubic holds the state related to CUBIC congestion control.
+	Cubic TCPCubicState
+}
+
+// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
+type TCPSACKInfo struct {
+	// Blocks is the list of SACK Blocks that identify the out of order segments
+	// held by a given TCP endpoint.
+	Blocks []header.SACKBlock
+
+	// ReceivedBlocks are the SACK blocks received by this endpoint
+	// from the peer endpoint.
+	ReceivedBlocks []header.SACKBlock
+
+	// MaxSACKED is the highest sequence number that has been SACKED
+	// by the peer.
+	MaxSACKED seqnum.Value
+}
+
+// RcvBufAutoTuneParams holds state related to TCP receive buffer auto-tuning.
+type RcvBufAutoTuneParams struct {
+	// MeasureTime is the time at which the current measurement
+	// was started.
+	MeasureTime time.Time
+
+	// CopiedBytes is the number of bytes copied to userspace since
+	// this measure began.
+	CopiedBytes int
+
+	// PrevCopiedBytes is the number of bytes copied to userspace in
+	// the previous RTT period.
+	PrevCopiedBytes int
+
+	// RcvBufSize is the auto tuned receive buffer size.
+	RcvBufSize int
+
+	// RTT is the smoothed RTT as measured by observing the time between
+	// when a byte is first acknowledged and the receipt of data that is at
+	// least one window beyond the sequence number that was acknowledged.
+	RTT time.Duration
+
+	// RTTVar is the "round-trip time variation" as defined in section 2
+	// of RFC6298.
+	RTTVar time.Duration
+
+	// RTTMeasureSeqNumber is the highest acceptable sequence number at the
+	// time this RTT measurement period began.
+	RTTMeasureSeqNumber seqnum.Value
+
+	// RTTMeasureTime is the absolute time at which the current RTT
+	// measurement period began.
+	RTTMeasureTime time.Time
+
+	// Disabled is true if an explicit receive buffer is set for the
+	// endpoint.
+	Disabled bool
+}
+
+// TCPEndpointState is a copy of the internal state of a TCP endpoint.
+type TCPEndpointState struct {
+	// ID is a copy of the TransportEndpointID for the endpoint.
+	ID TCPEndpointID
+
+	// SegTime denotes the absolute time when this segment was received.
+	SegTime time.Time
+
+	// RcvBufSize is the size of the receive socket buffer for the endpoint.
+	RcvBufSize int
+
+	// RcvBufUsed is the amount of bytes actually held in the receive socket
+	// buffer for the endpoint.
+	RcvBufUsed int
+
+	// RcvAutoParams is used to hold state variables to compute
+	// the auto tuned receive buffer size.
+	RcvAutoParams RcvBufAutoTuneParams
+
+	// RcvClosed if true, indicates the endpoint has been closed for reading.
+	RcvClosed bool
+
+	// SendTSOk is used to indicate when the TS Option has been negotiated.
+	// When SendTSOk is true every non-RST segment should carry a TS as per
+	// RFC7323#section-1.1.
+	SendTSOk bool
+
+	// RecentTS is the timestamp that should be sent in the TSEcr field of
+	// the timestamp for future segments sent by the endpoint. This field is
+	// updated if required when a new segment is received by this endpoint.
+	RecentTS uint32
+
+	// TSOffset is a randomized offset added to the value of the TSVal field
+	// in the timestamp option.
+	TSOffset uint32
+
+	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
+	// option in the SYN/SYN-ACK.
+	SACKPermitted bool
+
+	// SACK holds TCP SACK related information for this endpoint.
+	SACK TCPSACKInfo
+
+	// SndBufSize is the size of the socket send buffer.
+	SndBufSize int
+
+	// SndBufUsed is the number of bytes held in the socket send buffer.
+	SndBufUsed int
+
+	// SndClosed indicates that the endpoint has been closed for sends.
+	SndClosed bool
+
+	// SndBufInQueue is the number of bytes in the send queue.
+	SndBufInQueue seqnum.Size
+
+	// PacketTooBigCount is used to notify the main protocol routine how
+	// many times a "packet too big" control packet is received.
+	PacketTooBigCount int
+
+	// SndMTU is the smallest MTU seen in the control packets received.
+	SndMTU int
+
+	// Receiver holds variables related to the TCP receiver for the endpoint.
+	Receiver TCPReceiverState
+
+	// Sender holds state related to the TCP Sender for the endpoint.
+	Sender TCPSenderState
+}
+
+// ResumableEndpoint is an endpoint that needs to be resumed after restore.
+type ResumableEndpoint interface {
+	// Resume resumes an endpoint after restore. This can be used to restart
+	// background workers such as protocol goroutines. This must be called after
+	// all indirect dependencies of the endpoint have been restored, which
+	// generally implies at the end of the restore process.
+	Resume(*Stack)
+}
+
+// uniqueIDGenerator is the default UniqueID: an atomically incremented counter.
+type uniqueIDGenerator uint64
+
+func (u *uniqueIDGenerator) UniqueID() uint64 {
+	return atomic.AddUint64((*uint64)(u), 1) // A zero-value generator hands out 1 first.
+}
+
+// NICNameFromID is a function that returns a stable name for the specified NIC,
+// even if different NIC IDs are used to refer to the same NIC in different
+// program runs. It is used when generating opaque interface identifiers (IIDs).
+// If the NIC was created with a name, it will be passed to NICNameFromID.
+//
+// NICNameFromID SHOULD return unique NIC names so unique opaque IIDs are
+// generated for the same prefix on different NICs.
+type NICNameFromID func(tcpip.NICID, string) string
+
+// OpaqueInterfaceIdentifierOptions holds the options related to the generation
+// of opaque interface identifiers (IIDs) as defined by RFC 7217.
+type OpaqueInterfaceIdentifierOptions struct {
+	// NICNameFromID is a function that returns a stable name for a specified NIC,
+	// even if the NIC ID changes over time.
+	//
+	// Must be specified to generate the opaque IID.
+	NICNameFromID NICNameFromID
+
+	// SecretKey is a pseudo-random number used as the secret key when generating
+	// opaque IIDs as defined by RFC 7217. The key SHOULD be at least
+	// header.OpaqueIIDSecretKeyMinBytes bytes and MUST follow minimum randomness
+	// requirements for security as outlined by RFC 4086. SecretKey MUST NOT
+	// change between program runs, unless explicitly changed.
+	//
+	// OpaqueInterfaceIdentifierOptions takes ownership of SecretKey. SecretKey
+	// MUST NOT be modified after Stack is created.
+	//
+	// May be nil, but a nil value is highly discouraged to maintain
+	// some level of randomness between nodes.
+	SecretKey []byte
+}
+
+// Stack is a networking stack, with all supported protocols, NICs, and route
+// table.
+type Stack struct {
+	transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState
+	networkProtocols   map[tcpip.NetworkProtocolNumber]NetworkProtocol
+	linkAddrResolvers  map[tcpip.NetworkProtocolNumber]LinkAddressResolver
+
+	// rawFactory creates raw endpoints. If nil, raw endpoints are
+	// disabled. It is set during Stack creation and is immutable.
+	rawFactory RawFactory
+
+	demux *transportDemuxer
+
+	stats tcpip.Stats
+
+	linkAddrCache *linkAddrCache
+
+	mu               sync.RWMutex
+	nics             map[tcpip.NICID]*NIC
+	forwarding       bool
+	cleanupEndpoints map[TransportEndpoint]struct{}
+
+	// routeTable is the route table passed in by the user via SetRouteTable(),
+	// it is used by FindRoute() to build a route for a specific
+	// destination.
+	routeTable []tcpip.Route
+
+	*ports.PortManager
+
+	// If not nil, then any new endpoints will have this probe function
+	// invoked every time they receive a TCP segment.
+	tcpProbeFunc TCPProbeFunc
+
+	// clock is used to generate user-visible times.
+	clock tcpip.Clock
+
+	// handleLocal allows non-loopback interfaces to loop packets.
+	handleLocal bool
+
+	// tables are the iptables packet filtering and manipulation rules.
+	tables *IPTables
+
+	// resumableEndpoints is a list of endpoints that need to be resumed if the
+	// stack is being restored.
+	resumableEndpoints []ResumableEndpoint
+
+	// icmpRateLimiter is a global rate limiter for all ICMP messages generated
+	// by the stack.
+	icmpRateLimiter *ICMPRateLimiter
+
+	// seed is a one-time random value initialized at stack startup
+	// and is used to seed the TCP port picking on active connections.
+	//
+	// TODO(gvisor.dev/issue/940): S/R this field.
+	seed uint32
+
+	// ndpConfigs is the default NDP configurations used by interfaces.
+	ndpConfigs NDPConfigurations
+
+	// autoGenIPv6LinkLocal determines whether or not the stack will attempt
+	// to auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs. See the AutoGenIPv6LinkLocal field of Options for more details.
+	autoGenIPv6LinkLocal bool
+
+	// ndpDisp is the NDP event dispatcher that is used to send the netstack
+	// integrator NDP related events.
+	ndpDisp NDPDispatcher
+
+	// uniqueIDGenerator is a generator of unique identifiers.
+	uniqueIDGenerator UniqueID
+
+	// opaqueIIDOpts hold the options for generating opaque interface identifiers
+	// (IIDs) as outlined by RFC 7217.
+	opaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// tempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	tempIIDSeed []byte
+
+	// forwarder holds the packets that wait for their link-address resolutions
+	// to complete, and forwards them when each resolution is done.
+	forwarder *forwardQueue
+
+	// randomGenerator is an injectable pseudo random generator that can be
+	// used when a random number is required.
+	randomGenerator *mathrand.Rand
+
+	// sendBufferSize holds the min/default/max send buffer sizes for
+	// endpoints other than TCP.
+	sendBufferSize SendBufferSizeOption
+
+	// receiveBufferSize holds the min/default/max receive buffer sizes for
+	// endpoints other than TCP.
+	receiveBufferSize ReceiveBufferSizeOption
+}
+
+// UniqueID is an abstract generator of unique identifiers.
+type UniqueID interface {
+	UniqueID() uint64
+}
+
+// Options contains optional Stack configuration.
+type Options struct {
+	// NetworkProtocols lists the network protocols to enable.
+	NetworkProtocols []NetworkProtocol
+
+	// TransportProtocols lists the transport protocols to enable.
+	TransportProtocols []TransportProtocol
+
+	// Clock is an optional clock source used for timestamping packets.
+	//
+	// If no Clock is specified, the clock source will be time.Now.
+	Clock tcpip.Clock
+
+	// Stats are optional statistic counters.
+	Stats tcpip.Stats
+
+	// HandleLocal indicates whether packets destined to their source
+	// should be handled by the stack internally (true) or outside the
+	// stack (false).
+	HandleLocal bool
+
+	// UniqueID is an optional generator of unique identifiers.
+	UniqueID UniqueID
+
+	// NDPConfigs is the default NDP configurations used by interfaces.
+	//
+	// By default, NDPConfigs will have a zero value for its
+	// DupAddrDetectTransmits field, implying that DAD will not be performed
+	// before assigning an address to a NIC.
+	NDPConfigs NDPConfigurations
+
+	// AutoGenIPv6LinkLocal determines whether or not the stack will attempt to
+	// auto-generate an IPv6 link-local address for newly enabled non-loopback
+	// NICs.
+	//
+	// Note, setting this to true does not mean that a link-local address
+	// will be assigned right away, or at all. If Duplicate Address Detection
+	// is enabled, an address will only be assigned if it successfully resolves.
+	// If it fails, no further attempt will be made to auto-generate an IPv6
+	// link-local address.
+	//
+	// The generated link-local address will follow RFC 4291 Appendix A
+	// guidelines.
+	AutoGenIPv6LinkLocal bool
+
+	// NDPDisp is the NDP event dispatcher that an integrator can provide to
+	// receive NDP related events.
+	NDPDisp NDPDispatcher
+
+	// RawFactory produces raw endpoints. Raw endpoints are enabled only if
+	// this is non-nil.
+	RawFactory RawFactory
+
+	// OpaqueIIDOpts hold the options for generating opaque interface
+	// identifiers (IIDs) as outlined by RFC 7217.
+	OpaqueIIDOpts OpaqueInterfaceIdentifierOptions
+
+	// RandSource is an optional source to use to generate random
+	// numbers. If omitted it defaults to a Source seeded by the data
+	// returned by rand.Read().
+	//
+	// RandSource must be thread-safe.
+	RandSource mathrand.Source
+
+	// TempIIDSeed is used to seed the initial temporary interface identifier
+	// history value used to generate IIDs for temporary SLAAC addresses.
+	//
+	// Temporary SLAAC addresses are short-lived addresses which are unpredictable
+	// and random from the perspective of other nodes on the network. It is
+	// recommended that the seed be a random byte buffer of at least
+	// header.IIDSize bytes to make sure that temporary SLAAC addresses are
+	// sufficiently random. It should follow minimum randomness requirements for
+	// security as outlined by RFC 4086.
+	//
+	// Note: using a nil value, the same seed across netstack program runs, or a
+	// seed that is too small would reduce randomness and increase predictability,
+	// defeating the purpose of temporary SLAAC addresses.
+	TempIIDSeed []byte
+}
+
+// TransportEndpointInfo holds useful information about a transport endpoint
+// which can be queried by monitoring tools.
+//
+// +stateify savable
+type TransportEndpointInfo struct {
+	// The following fields are initialized at creation time and are
+	// immutable.
+
+	NetProto   tcpip.NetworkProtocolNumber
+	TransProto tcpip.TransportProtocolNumber
+
+	// The following fields are protected by endpoint mu.
+
+	ID TransportEndpointID
+	// BindNICID and BindAddr are set via calls to Bind(). They are used to
+	// reject attempts to send data or connect via a different NIC or
+	// address.
+	BindNICID tcpip.NICID
+	BindAddr  tcpip.Address
+	// RegisterNICID is the default NICID registered as a side-effect of
+	// connect or datagram write.
+	RegisterNICID tcpip.NICID
+}
+
+// AddrNetProtoLocked unwraps the specified address if it is a V4-mapped V6
+// address and returns the network protocol number to be used to communicate
+// with the specified address. It returns an error if the passed address is
+// incompatible with the receiver.
+//
+// Precondition: the parent endpoint mu must be held while calling this method.
+func (e *TransportEndpointInfo) AddrNetProtoLocked(addr tcpip.FullAddress, v6only bool) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	netProto := e.NetProto // Default; overridden below only for IPv4 and V4-mapped addresses.
+	switch len(addr.Addr) {
+	case header.IPv4AddressSize:
+		netProto = header.IPv4ProtocolNumber
+	case header.IPv6AddressSize:
+		if header.IsV4MappedAddress(addr.Addr) {
+			netProto = header.IPv4ProtocolNumber
+			addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] // Keep only the trailing IPv4-sized part.
+			if addr.Addr == header.IPv4Any {
+				addr.Addr = "" // The mapped IPv4 wildcard is returned as the empty address.
+			}
+		}
+	}
+
+	switch len(e.ID.LocalAddress) { // Reject addr when its size conflicts with the endpoint's local address.
+	case header.IPv4AddressSize:
+		if len(addr.Addr) == header.IPv6AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+		}
+	case header.IPv6AddressSize:
+		if len(addr.Addr) == header.IPv4AddressSize {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNetworkUnreachable
+		}
+	}
+
+	switch {
+	case netProto == e.NetProto: // Same protocol as the endpoint: always allowed.
+	case netProto == header.IPv4ProtocolNumber && e.NetProto == header.IPv6ProtocolNumber: // IPv4 through an IPv6 endpoint.
+		if v6only {
+			return tcpip.FullAddress{}, 0, tcpip.ErrNoRoute
+		}
+	default:
+		return tcpip.FullAddress{}, 0, tcpip.ErrInvalidEndpointState
+	}
+
+	return addr, netProto, nil
+}
+
+// IsEndpointInfo is a no-op whose only purpose is to make
+// *TransportEndpointInfo implement the tcpip.EndpointInfo marker interface.
+func (*TransportEndpointInfo) IsEndpointInfo() {}
+
+// New allocates a new networking stack with only the requested networking and
+// transport protocols configured with default options.
+//
+// Note, opts.NDPConfigs is passed through its validate method before being
+// stored in the Stack, so the Stack never holds the caller's raw NDP values.
+//
+// Protocol options can be changed by calling the
+// SetNetworkProtocolOption/SetTransportProtocolOption methods provided by the
+// stack. Please refer to individual protocol implementations as to what options
+// are supported.
+func New(opts Options) *Stack {
+	clock := opts.Clock
+	if clock == nil {
+		clock = &tcpip.StdClock{} // Default clock when the caller supplies none.
+	}
+
+	if opts.UniqueID == nil {
+		opts.UniqueID = new(uniqueIDGenerator) // Default counter-based generator.
+	}
+
+	randSrc := opts.RandSource
+	if randSrc == nil {
+		// The Source provided by mathrand.NewSource is not thread-safe, so
+		// it is wrapped in a simple thread-safe version.
+		randSrc = &lockedRandomSource{src: mathrand.NewSource(generateRandInt64())}
+	}
+
+	// Make sure opts.NDPConfigs contains valid values only.
+	opts.NDPConfigs.validate()
+
+	s := &Stack{
+		transportProtocols:   make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
+		networkProtocols:     make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
+		linkAddrResolvers:    make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
+		nics:                 make(map[tcpip.NICID]*NIC),
+		cleanupEndpoints:     make(map[TransportEndpoint]struct{}),
+		linkAddrCache:        newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
+		PortManager:          ports.NewPortManager(),
+		clock:                clock,
+		stats:                opts.Stats.FillIn(),
+		handleLocal:          opts.HandleLocal,
+		tables:               DefaultTables(),
+		icmpRateLimiter:      NewICMPRateLimiter(),
+		seed:                 generateRandUint32(),
+		ndpConfigs:           opts.NDPConfigs,
+		autoGenIPv6LinkLocal: opts.AutoGenIPv6LinkLocal,
+		uniqueIDGenerator:    opts.UniqueID,
+		ndpDisp:              opts.NDPDisp,
+		opaqueIIDOpts:        opts.OpaqueIIDOpts,
+		tempIIDSeed:          opts.TempIIDSeed,
+		forwarder:            newForwardQueue(),
+		randomGenerator:      mathrand.New(randSrc),
+		sendBufferSize: SendBufferSizeOption{
+			Min:     MinBufferSize,
+			Default: DefaultBufferSize,
+			Max:     DefaultMaxBufferSize,
+		},
+		receiveBufferSize: ReceiveBufferSizeOption{
+			Min:     MinBufferSize,
+			Default: DefaultBufferSize,
+			Max:     DefaultMaxBufferSize,
+		},
+	}
+
+	// Register network protocols; link address resolvers are also indexed by LinkAddressProtocol().
+	for _, netProto := range opts.NetworkProtocols {
+		s.networkProtocols[netProto.Number()] = netProto
+		if r, ok := netProto.(LinkAddressResolver); ok {
+			s.linkAddrResolvers[r.LinkAddressProtocol()] = r
+		}
+	}
+
+	// Register transport protocols; each starts without a default handler.
+	for _, transProto := range opts.TransportProtocols {
+		s.transportProtocols[transProto.Number()] = &transportProtocolState{
+			proto: transProto,
+		}
+	}
+
+	// Add the factory for raw endpoints, if present (nil leaves them disabled).
+	s.rawFactory = opts.RawFactory
+
+	// Create the global transport demuxer.
+	s.demux = newTransportDemuxer(s)
+
+	return s
+}
+
+// UniqueID returns a unique identifier from the stack's UniqueID generator.
+func (s *Stack) UniqueID() uint64 {
+	return s.uniqueIDGenerator.UniqueID()
+}
+
+// SetNetworkProtocolOption configures an individual protocol level option.
+// It returns tcpip.ErrUnknownProtocol if the protocol is not registered with
+// the stack; otherwise it returns the result of the protocol's SetOption
+// method, which decides whether the option and its value are acceptable.
+func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+	if proto, found := s.networkProtocols[network]; found {
+		// Registered protocol: let it validate and apply the option.
+		return proto.SetOption(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// NetworkProtocolOption retrieves an individual protocol level option value.
+// It returns tcpip.ErrUnknownProtocol if the protocol is not registered with
+// the stack; otherwise it returns the result of the protocol's Option method.
+// e.g.
+// var v ipv4.MyOption
+// err := s.NetworkProtocolOption(header.IPv4ProtocolNumber, &v)
+// if err != nil {
+//   ...
+// }
+func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+	if proto, found := s.networkProtocols[network]; found {
+		// Registered protocol: let it handle the lookup.
+		return proto.Option(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// SetTransportProtocolOption configures an individual protocol level option.
+// It returns tcpip.ErrUnknownProtocol if the protocol is not registered with
+// the stack; otherwise it returns the result of the protocol's SetOption
+// method, which decides whether the option and its value are acceptable.
+func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+	if state, found := s.transportProtocols[transport]; found {
+		// Registered protocol: let it validate and apply the option.
+		return state.proto.SetOption(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// TransportProtocolOption retrieves an individual protocol level option
+// value. It returns tcpip.ErrUnknownProtocol if the protocol is not registered
+// with the stack; otherwise the result of the protocol's Option method. e.g.
+// var v tcp.SACKEnabled
+// if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil {
+//   ...
+// }
+func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+	if state, found := s.transportProtocols[transport]; found {
+		// Registered protocol: let it handle the lookup.
+		return state.proto.Option(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// SetTransportProtocolHandler installs h as the per-stack default handler for
+// the given transport protocol.
+//
+// It must be called only during initialization of the stack. Changing it as the
+// stack is operating is not supported.
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, *PacketBuffer) bool) {
+	// A protocol that is not registered with the stack is silently ignored.
+	if state := s.transportProtocols[p]; state != nil {
+		state.defaultHandler = h
+	}
+}
+
+// NowNanoseconds implements tcpip.Clock.NowNanoseconds using the stack's clock.
+func (s *Stack) NowNanoseconds() int64 {
+	return s.clock.NowNanoseconds()
+}
+
+// Stats returns a mutable copy of the current stats.
+//
+// This is not generally exported via the public interface, but is available
+// internally.
+func (s *Stack) Stats() tcpip.Stats {
+	return s.stats // Returned by value, i.e. a copy of the struct.
+}
+
+// SetForwarding enables or disables the packet forwarding between NICs.
+//
+// If the setting actually changes and the stack supports IPv6, every NIC is
+// told to become an IPv6 router (enable) or an IPv6 host (disable).
+// NOTE(review): presumably routers stop, and hosts (re)start, soliciting
+// routers via NDP; confirm against becomeIPv6Router/becomeIPv6Host.
+func (s *Stack) SetForwarding(enable bool) {
+	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Nothing to do when the forwarding status is unchanged.
+	if enable == s.forwarding {
+		return
+	}
+
+	s.forwarding = enable
+
+	// NICs only switch roles when the stack supports IPv6.
+	if _, hasIPv6 := s.networkProtocols[header.IPv6ProtocolNumber]; !hasIPv6 {
+		return
+	}
+
+	// Move every NIC to the IPv6 role that matches the new forwarding
+	// setting.
+	for _, n := range s.nics {
+		if enable {
+			n.becomeIPv6Router()
+		} else {
+			n.becomeIPv6Host()
+		}
+	}
+}
+
+// Forwarding reports whether packet forwarding between NICs is enabled.
+func (s *Stack) Forwarding() bool {
+	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return s.forwarding
+}
+
+// SetRouteTable replaces the stack's route table, which specifies which NIC
+// to use for given destination address ranges.
+//
+// This method takes ownership of the table.
+func (s *Stack) SetRouteTable(table []tcpip.Route) {
+	// The slice is stored as is, not copied.
+	s.mu.Lock()
+	s.routeTable = table
+	s.mu.Unlock()
+}
+
+// GetRouteTable returns a copy of the route table which is currently in use.
+func (s *Stack) GetRouteTable() []tcpip.Route {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return append([]tcpip.Route(nil), s.routeTable...)
+}
+
+// AddRoute appends a route to the end of the stack's route table.
+func (s *Stack) AddRoute(route tcpip.Route) {
+	s.mu.Lock()
+	s.routeTable = append(s.routeTable, route)
+	s.mu.Unlock()
+}
+
+// NewEndpoint asks the given transport protocol to create a new endpoint.
+func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	if state, known := s.transportProtocols[transport]; known {
+		// Endpoint construction is delegated to the transport protocol.
+		return state.proto.NewEndpoint(s, network, waiterQueue)
+	}
+
+	return nil, tcpip.ErrUnknownProtocol
+}
+
+// NewRawEndpoint creates a new raw transport layer endpoint of the given
+// protocol. It fails with tcpip.ErrNotPermitted when the stack has no raw
+// factory. Unassociated endpoints come from that factory, others from transport.
+func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) {
+	factory := s.rawFactory
+	switch {
+	case factory == nil:
+		return nil, tcpip.ErrNotPermitted
+	case !associated:
+		return factory.NewUnassociatedEndpoint(s, network, transport, waiterQueue)
+	}
+
+	state, known := s.transportProtocols[transport]
+	if !known {
+		return nil, tcpip.ErrUnknownProtocol
+	}
+
+	return state.proto.NewRawEndpoint(s, network, waiterQueue)
+}
+
+// NewPacketEndpoint creates a new packet endpoint for the given netProto via
+// the raw factory; without a factory it fails with tcpip.ErrNotPermitted.
+func (s *Stack) NewPacketEndpoint(cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	if factory := s.rawFactory; factory != nil {
+		return factory.NewPacketEndpoint(s, cooked, netProto, waiterQueue)
+	}
+
+	return nil, tcpip.ErrNotPermitted
+}
+
+// NICContext is an opaque pointer used to store client-supplied NIC metadata.
+type NICContext interface{}
+
+// NICOptions specifies the configuration of a NIC as it is being created.
+// The zero value creates an enabled, unnamed NIC.
+type NICOptions struct {
+	// Name specifies the name of the NIC. An empty name leaves the NIC
+	// unnamed; a non-empty name must not match any existing NIC's name.
+	Name string
+
+	// Disabled specifies whether to avoid calling Attach on the passed
+	// LinkEndpoint. When set, CreateNICWithOptions registers the NIC but does
+	// not enable it.
+	Disabled bool
+
+	// Context specifies user-defined data that will be returned in stack.NICInfo
+	// for the NIC. Clients of this library can use it to add metadata that
+	// should be tracked alongside a NIC, to avoid having to keep a
+	// map[tcpip.NICID]metadata mirroring stack.Stack's nic map.
+	Context NICContext
+}
+
+// CreateNICWithOptions creates a NIC with the provided id, LinkEndpoint, and
+// NICOptions. See the documentation on type NICOptions for details on how
+// NICs can be configured.
+//
+// tcpip.ErrDuplicateNICID is returned if id is already taken, or if
+// opts.Name is non-empty and already used by another NIC. Unless
+// opts.Disabled is set, the new NIC is enabled before returning and the
+// result of enabling it is returned.
+//
+// LinkEndpoint.Attach will be called to bind ep with a NetworkDispatcher.
+func (s *Stack) CreateNICWithOptions(id tcpip.NICID, ep LinkEndpoint, opts NICOptions) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Reject an id that is already in use.
+	if _, exists := s.nics[id]; exists {
+		return tcpip.ErrDuplicateNICID
+	}
+
+	// A named NIC must not share its name with an existing NIC.
+	if name := opts.Name; name != "" {
+		for _, existing := range s.nics {
+			if existing.Name() == name {
+				return tcpip.ErrDuplicateNICID
+			}
+		}
+	}
+
+	created := newNIC(s, id, opts.Name, ep, opts.Context)
+	s.nics[id] = created
+	if opts.Disabled {
+		return nil
+	}
+	return created.enable()
+}
+
+// CreateNIC creates a NIC with the provided id and LinkEndpoint and calls
+// LinkEndpoint.Attach to bind ep with a NetworkDispatcher.
+func (s *Stack) CreateNIC(id tcpip.NICID, ep LinkEndpoint) *tcpip.Error {
+	return s.CreateNICWithOptions(id, ep, NICOptions{})
+}
+
+// GetNICByName looks up a NIC by its name. The boolean result reports
+// whether a NIC with that name was found.
+func (s *Stack) GetNICByName(name string) (*NIC, bool) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	for _, candidate := range s.nics {
+		if candidate.Name() != name {
+			continue
+		}
+		return candidate, true
+	}
+	return nil, false
+}
+
+// EnableNIC enables the given NIC so that the link-layer endpoint can start
+// delivering packets to it.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic, found := s.nics[id]; found {
+		return nic.enable()
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// DisableNIC disables the given NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) DisableNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic, found := s.nics[id]; found {
+		return nic.disable()
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// CheckNIC reports whether a NIC is usable, i.e. it exists and is enabled.
+func (s *Stack) CheckNIC(id tcpip.NICID) bool {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, found := s.nics[id]
+	return found && nic.enabled()
+}
+
+// RemoveNIC removes NIC and all related routes from the network stack.
+//
+// It takes s.mu for writing and delegates to removeNICLocked.
+func (s *Stack) RemoveNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	return s.removeNICLocked(id)
+}
+
+// removeNICLocked deletes the NIC with the given id from the stack, drops
+// every route that leaves through it, and finally tears the NIC down.
+//
+// It returns tcpip.ErrUnknownNICID if the NIC does not exist; otherwise it
+// returns the result of the NIC's own removal.
+//
+// s.mu must be locked.
+func (s *Stack) removeNICLocked(id tcpip.NICID) *tcpip.Error {
+	nic, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	delete(s.nics, id)
+
+	// Filter the route table in place, reusing its backing array.
+	kept := s.routeTable[:0]
+	for _, route := range s.routeTable {
+		if route.NIC != id {
+			kept = append(kept, route)
+		}
+	}
+
+	// Zero the slots past the kept routes so stale entries are not retained.
+	for i := len(kept); i < len(s.routeTable); i++ {
+		s.routeTable[i] = tcpip.Route{}
+	}
+	s.routeTable = kept
+
+	return nic.remove()
+}
+
+// NICAddressRanges returns a map of NICIDs to their associated subnets.
+func (s *Stack) NICAddressRanges() map[tcpip.NICID][]tcpip.Subnet {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nics := map[tcpip.NICID][]tcpip.Subnet{}
+
+	for id, nic := range s.nics {
+		nics[id] = append(nics[id], nic.AddressRanges()...)
+	}
+	return nics
+}
+
+// NICInfo captures the name and addresses assigned to a NIC, together with
+// its state flags, MTU, statistics and client-supplied context.
+type NICInfo struct {
+	// Name, LinkAddress and ProtocolAddresses hold the NIC's name, its
+	// link-layer address and its primary protocol addresses.
+	Name              string
+	LinkAddress       tcpip.LinkAddress
+	ProtocolAddresses []tcpip.ProtocolAddress
+
+	// Flags indicate the state of the NIC.
+	Flags NICStateFlags
+
+	// MTU is the maximum transmission unit.
+	MTU uint32
+
+	// Stats holds the NIC's statistics.
+	Stats NICStats
+
+	// Context is user-supplied data optionally supplied in CreateNICWithOptions.
+	// See type NICOptions for more details.
+	Context NICContext
+}
+
+// HasNIC reports whether a NIC with the given NICID is defined in the stack.
+func (s *Stack) HasNIC(id tcpip.NICID) bool {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	_, defined := s.nics[id]
+	return defined
+}
+
+// NICInfo returns a map of NICIDs to their associated information.
+func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nics := make(map[tcpip.NICID]NICInfo)
+	for id, nic := range s.nics {
+		flags := NICStateFlags{
+			Up:          true, // Netstack interfaces are always up.
+			Running:     nic.enabled(),
+			Promiscuous: nic.isPromiscuousMode(),
+			Loopback:    nic.isLoopback(),
+		}
+		nics[id] = NICInfo{
+			Name:              nic.name,
+			LinkAddress:       nic.linkEP.LinkAddress(),
+			ProtocolAddresses: nic.PrimaryAddresses(),
+			Flags:             flags,
+			MTU:               nic.linkEP.MTU(),
+			Stats:             nic.stats,
+			Context:           nic.context,
+		}
+	}
+	return nics
+}
+
+// NICStateFlags holds information about the state of a NIC, as reported in
+// NICInfo.Flags.
+type NICStateFlags struct {
+	// Up indicates whether the interface is running.
+	Up bool
+
+	// Running indicates whether resources are allocated.
+	Running bool
+
+	// Promiscuous indicates whether the interface is in promiscuous mode.
+	Promiscuous bool
+
+	// Loopback indicates whether the interface is a loopback.
+	Loopback bool
+}
+
+// AddAddress adds a new network-layer address to the specified NIC. It is
+// shorthand for AddAddressWithOptions with CanBePrimaryEndpoint.
+func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint)
+}
+
+// AddProtocolAddress adds a new network-layer protocol address to the
+// specified NIC. It is shorthand for AddProtocolAddressWithOptions with
+// CanBePrimaryEndpoint.
+func (s *Stack) AddProtocolAddress(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress) *tcpip.Error {
+	return s.AddProtocolAddressWithOptions(id, protocolAddress, CanBePrimaryEndpoint)
+}
+
+// AddAddressWithOptions is the same as AddAddress, but allows you to specify
+// whether the new endpoint can be primary or not.
+//
+// The address is given the network protocol's default prefix length.
+// tcpip.ErrUnknownProtocol is returned if protocol is not registered with
+// the stack.
+func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error {
+	if netProto, found := s.networkProtocols[protocol]; found {
+		protocolAddress := tcpip.ProtocolAddress{
+			Protocol: protocol,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   addr,
+				PrefixLen: netProto.DefaultPrefixLen(),
+			},
+		}
+		return s.AddProtocolAddressWithOptions(id, protocolAddress, peb)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// AddProtocolAddressWithOptions is the same as AddProtocolAddress, but allows
+// you to specify whether the new endpoint can be primary or not.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) AddProtocolAddressWithOptions(id tcpip.NICID, protocolAddress tcpip.ProtocolAddress, peb PrimaryEndpointBehavior) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic := s.nics[id]; nic != nil {
+		return nic.AddAddress(protocolAddress, peb)
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// AddAddressRange adds a range of addresses to the specified NIC. The range is
+// given by a subnet address, and all addresses contained in the subnet are
+// used except for the subnet address itself and the subnet's broadcast
+// address.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) AddAddressRange(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+
+	nic.AddAddressRange(protocol, subnet)
+	return nil
+}
+
+// RemoveAddressRange removes the range of addresses described by subnet from
+// the specified NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) RemoveAddressRange(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+
+	nic.RemoveAddressRange(subnet)
+	return nil
+}
+
+// RemoveAddress removes an existing network-layer address from the specified
+// NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists;
+// otherwise it returns the NIC's own result.
+func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	return nic.RemoveAddress(addr)
+}
+
+// AllAddresses returns, for every NIC in the stack, all of its protocol
+// addresses (primary and non-primary), keyed by NIC ID.
+func (s *Stack) AllAddresses() map[tcpip.NICID][]tcpip.ProtocolAddress {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	addrsByNIC := make(map[tcpip.NICID][]tcpip.ProtocolAddress)
+	for nicID, nic := range s.nics {
+		addrsByNIC[nicID] = nic.AllAddresses()
+	}
+	return addrsByNIC
+}
+
+// GetMainNICAddress returns the first non-deprecated primary address and prefix
+// for the given NIC and protocol. If no non-deprecated primary address exists,
+// a deprecated primary address and prefix will be returned. Returns an error if
+// the NIC doesn't exist and an empty value if the NIC doesn't have a primary
+// address for the given protocol.
+func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.AddressWithPrefix, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic, found := s.nics[id]; found {
+		return nic.primaryAddress(protocol), nil
+	}
+	return tcpip.AddressWithPrefix{}, tcpip.ErrUnknownNICID
+}
+
+// getRefEP returns the referenced network endpoint on nic to use as the
+// local side for netProto. With an explicit localAddr the endpoint for that
+// address is looked up; with an empty localAddr the NIC picks a primary
+// endpoint for remoteAddr.
+func (s *Stack) getRefEP(nic *NIC, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
+	if len(localAddr) != 0 {
+		return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint)
+	}
+	return nic.primaryEndpoint(netProto, remoteAddr)
+}
+
+// FindRoute creates a route to the given destination address, leaving through
+// the given nic and local address (if provided).
+//
+// Broadcast, multicast and IPv6 link-local destinations do not need a route
+// table entry: when id is non-zero they are resolved directly against that
+// NIC. In every other case the route table is walked in order and the first
+// entry whose NIC is enabled and can supply a local endpoint is used.
+//
+// On failure it returns tcpip.ErrNetworkUnreachable for a destination that
+// needs no route table entry, and tcpip.ErrNoRoute otherwise.
+func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	isBroadcast := remoteAddr == header.IPv4Broadcast
+	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
+	// needRoute is false for destinations that bypass the route table.
+	needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
+	if id != 0 && !needRoute {
+		// A specific NIC was requested and no route lookup is required.
+		if nic, ok := s.nics[id]; ok && nic.enabled() {
+			if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
+				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback()), nil
+			}
+		}
+	} else {
+		for _, route := range s.routeTable {
+			// Skip entries for other NICs (when one was requested) and entries
+			// whose destination does not contain a non-empty remoteAddr.
+			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Destination.Contains(remoteAddr)) {
+				continue
+			}
+			if nic, ok := s.nics[route.NIC]; ok && nic.enabled() {
+				if ref := s.getRefEP(nic, localAddr, remoteAddr, netProto); ref != nil {
+					if len(remoteAddr) == 0 {
+						// If no remote address was provided, then the route
+						// provided will refer to the link local address.
+						remoteAddr = ref.ep.ID().LocalAddress
+					}
+
+					r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.isLoopback(), multicastLoop && !nic.isLoopback())
+					// The gateway is only used as next hop for destinations that
+					// actually required a route table entry.
+					if needRoute {
+						r.NextHop = route.Gateway
+					}
+					return r, nil
+				}
+			}
+		}
+	}
+
+	if !needRoute {
+		return Route{}, tcpip.ErrNetworkUnreachable
+	}
+
+	return Route{}, tcpip.ErrNoRoute
+}
+
+// CheckNetworkProtocol reports whether the given network protocol is
+// registered (enabled) in the stack.
+func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool {
+	_, enabled := s.networkProtocols[protocol]
+	return enabled
+}
+
+// CheckLocalAddress determines if the given local address exists, and if it
+// does, returns the id of the NIC it's bound to. Returns 0 if the address
+// does not exist.
+//
+// When nicID is non-zero only that NIC is consulted; otherwise every NIC is
+// tried and the first one holding the address wins.
+func (s *Stack) CheckLocalAddress(nicID tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	// holds reports whether nic has an endpoint for addr. The reference taken
+	// by the lookup is released immediately, since only existence matters.
+	holds := func(nic *NIC) bool {
+		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
+		if ref == nil {
+			return false
+		}
+		ref.decRef()
+		return true
+	}
+
+	// If a NIC is specified, we try to find the address there only.
+	if nicID != 0 {
+		if nic := s.nics[nicID]; nic != nil && holds(nic) {
+			return nic.id
+		}
+		return 0
+	}
+
+	// Otherwise go through all the NICs.
+	for _, nic := range s.nics {
+		if holds(nic) {
+			return nic.id
+		}
+	}
+	return 0
+}
+
+// SetPromiscuousMode enables or disables promiscuous mode in the given NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given nicID exists.
+func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic := s.nics[nicID]; nic != nil {
+		nic.setPromiscuousMode(enable)
+		return nil
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// SetSpoofing enables or disables address spoofing in the given NIC, allowing
+// endpoints to bind to any address in the NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given nicID exists.
+func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic := s.nics[nicID]; nic != nil {
+		nic.setSpoofing(enable)
+		return nil
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// AddLinkAddress records linkAddr as the link address of addr on the NIC
+// nicID in the stack's link address cache.
+func (s *Stack) AddLinkAddress(nicID tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) {
+	s.linkAddrCache.add(tcpip.FullAddress{NIC: nicID, Addr: addr}, linkAddr)
+	// TODO: provide a way for a transport endpoint to receive a signal
+	// that AddLinkAddress for a particular address has been called.
+}
+
+// GetLinkAddress implements LinkAddressCache.GetLinkAddress.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given nicID exists.
+// Otherwise the lookup is handed to the link address cache, using the link
+// address resolver registered for protocol and the NIC's link endpoint.
+func (s *Stack) GetLinkAddress(nicID tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
+	// s.mu only protects the NIC lookup; the cache query runs without it.
+	s.mu.RLock()
+	nic := s.nics[nicID]
+	s.mu.RUnlock()
+	if nic == nil {
+		return "", nil, tcpip.ErrUnknownNICID
+	}
+
+	resolver := s.linkAddrResolvers[protocol]
+	key := tcpip.FullAddress{NIC: nicID, Addr: addr}
+	return s.linkAddrCache.get(key, resolver, localAddr, nic.linkEP, waker)
+}
+
+// RemoveWaker implements LinkAddressCache.RemoveWaker.
+//
+// It removes waker from the link address cache entry for addr on the NIC
+// nicID. Nothing is done if no NIC with that ID exists.
+func (s *Stack) RemoveWaker(nicID tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	// GetLinkAddress only registers wakers for NICs that exist, so the waker
+	// must be removed when the NIC is present. The previous check was
+	// inverted (nic == nil) and never removed wakers for valid NICs.
+	if nic := s.nics[nicID]; nic != nil {
+		fullAddr := tcpip.FullAddress{NIC: nicID, Addr: addr}
+		s.linkAddrCache.removeWaker(fullAddr, waker)
+	}
+}
+
+// RegisterTransportEndpoint registers the given endpoint with the stack
+// transport dispatcher. Received packets that match the provided id will be
+// delivered to the given endpoint; specifying a nic is optional, but
+// nic-specific IDs have precedence over global ones.
+//
+// NOTE(review): nicID is not forwarded to the demuxer in this body; only
+// bindToDevice is.
+func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	return s.demux.registerEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
+}
+
+// CheckRegisterTransportEndpoint checks if an endpoint can be registered with
+// the stack transport dispatcher. The check is delegated to the demuxer;
+// nicID is not consulted in this body.
+func (s *Stack) CheckRegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	return s.demux.checkEndpoint(netProtos, protocol, id, flags, bindToDevice)
+}
+
+// UnregisterTransportEndpoint removes the endpoint with the given id from the
+// stack transport dispatcher. The removal is delegated to the demuxer; nicID
+// is not consulted in this body.
+func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
+}
+
+// StartTransportEndpointCleanup removes the endpoint with the given id from
+// the stack transport dispatcher. It also transitions it to the cleanup stage.
+//
+// The endpoint stays in the cleanup set until
+// CompleteTransportEndpointCleanup is called for it.
+func (s *Stack) StartTransportEndpointCleanup(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Track the endpoint as cleaning up before it leaves the demuxer.
+	s.cleanupEndpoints[ep] = struct{}{}
+
+	s.demux.unregisterEndpoint(netProtos, protocol, id, ep, flags, bindToDevice)
+}
+
+// CompleteTransportEndpointCleanup takes ep out of the set of endpoints that
+// are in the cleanup stage.
+func (s *Stack) CompleteTransportEndpointCleanup(ep TransportEndpoint) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	delete(s.cleanupEndpoints, ep)
+}
+
+// FindTransportEndpoint finds an endpoint that most closely matches the provided
+// id. If no endpoint is found it returns nil. The search is delegated to the
+// stack's transport demuxer.
+func (s *Stack) FindTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
+	return s.demux.findTransportEndpoint(netProto, transProto, id, r)
+}
+
+// RegisterRawTransportEndpoint registers the given endpoint with the stack
+// transport dispatcher. Received packets that match the provided transport
+// protocol will be delivered to the given endpoint.
+//
+// NOTE(review): nicID is not forwarded to the demuxer in this body.
+func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
+	return s.demux.registerRawEndpoint(netProto, transProto, ep)
+}
+
+// UnregisterRawTransportEndpoint removes the endpoint for the transport
+// protocol from the stack transport dispatcher. nicID is not consulted in
+// this body.
+func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) {
+	s.demux.unregisterRawEndpoint(netProto, transProto, ep)
+}
+
+// RegisterRestoredEndpoint remembers e as an endpoint that has been restored
+// on this stack, so that Resume can later resume it.
+func (s *Stack) RegisterRestoredEndpoint(e ResumableEndpoint) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.resumableEndpoints = append(s.resumableEndpoints, e)
+}
+
+// RegisteredEndpoints collects the transport endpoints currently registered
+// with every protocol of the stack's demuxer.
+func (s *Stack) RegisteredEndpoints() []TransportEndpoint {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	var registered []TransportEndpoint
+	for _, perProtocol := range s.demux.protocol {
+		registered = append(registered, perProtocol.transportEndpoints()...)
+	}
+	return registered
+}
+
+// CleanupEndpoints returns the endpoints that are currently in the cleanup
+// state. The result is always a non-nil slice.
+func (s *Stack) CleanupEndpoints() []TransportEndpoint {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	cleaning := make([]TransportEndpoint, 0, len(s.cleanupEndpoints))
+	for ep := range s.cleanupEndpoints {
+		cleaning = append(cleaning, ep)
+	}
+	return cleaning
+}
+
+// RestoreCleanupEndpoints marks every endpoint in es as being in the cleanup
+// state. This is useful for restoring a stack after a save.
+func (s *Stack) RestoreCleanupEndpoints(es []TransportEndpoint) {
+	s.mu.Lock()
+	for i := range es {
+		s.cleanupEndpoints[es[i]] = struct{}{}
+	}
+	s.mu.Unlock()
+}
+
+// Close aborts all currently registered transport endpoints and then closes
+// every transport and network protocol of the stack.
+//
+// Endpoints created or modified during this call may not get closed.
+func (s *Stack) Close() {
+	registered := s.RegisteredEndpoints()
+	for i := range registered {
+		registered[i].Abort()
+	}
+	for _, state := range s.transportProtocols {
+		state.proto.Close()
+	}
+	for _, netProto := range s.networkProtocols {
+		netProto.Close()
+	}
+}
+
+// Wait waits for all transport and link endpoints to halt their worker
+// goroutines. Registered endpoints are waited on first, then endpoints in the
+// cleanup state, then transport and network protocols, and finally the link
+// endpoint of every NIC.
+//
+// Endpoints created or modified during this call may not get waited on.
+//
+// Note that link endpoints must be stopped via an implementation specific
+// mechanism.
+func (s *Stack) Wait() {
+	registered := s.RegisteredEndpoints()
+	for i := range registered {
+		registered[i].Wait()
+	}
+	cleaning := s.CleanupEndpoints()
+	for i := range cleaning {
+		cleaning[i].Wait()
+	}
+	for _, state := range s.transportProtocols {
+		state.proto.Wait()
+	}
+	for _, netProto := range s.networkProtocols {
+		netProto.Wait()
+	}
+
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	for _, nic := range s.nics {
+		nic.linkEP.Wait()
+	}
+}
+
+// Resume restarts the stack after a restore by resuming every endpoint that
+// was recorded with RegisterRestoredEndpoint. This must be called after the
+// entire system has been restored.
+func (s *Stack) Resume() {
+	// Detach the pending list under the lock, then resume outside of it:
+	// ResumableEndpoint.Resume() may call other methods on s, so s.mu cannot
+	// be held while the endpoints are resumed.
+	s.mu.Lock()
+	pending := s.resumableEndpoints
+	s.resumableEndpoints = nil
+	s.mu.Unlock()
+
+	for i := range pending {
+		pending[i].Resume(s)
+	}
+}
+
+// RegisterPacketEndpoint registers ep with the stack, causing it to receive
+// all traffic of the specified netProto on the given NIC. If nicID is 0, it
+// receives traffic from every NIC.
+//
+// It returns tcpip.ErrUnknownNICID if a specific NIC was requested but does
+// not exist. When registering on every NIC, a failure on any one of them
+// unregisters ep from all NICs again and returns that failure.
+func (s *Stack) RegisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Capture on a specific device.
+	if nicID != 0 {
+		nic, found := s.nics[nicID]
+		if !found {
+			return tcpip.ErrUnknownNICID
+		}
+		return nic.registerPacketEndpoint(netProto, ep)
+	}
+
+	// No NIC was specified: capture on all devices.
+	for _, nic := range s.nics {
+		if err := nic.registerPacketEndpoint(netProto, ep); err != nil {
+			s.unregisterPacketEndpointLocked(0, netProto, ep)
+			return err
+		}
+	}
+	return nil
+}
+
+// UnregisterPacketEndpoint unregisters ep for packets of the specified
+// netProto from the specified NIC. If nicID is 0, ep is unregistered from all
+// NICs.
+//
+// It takes s.mu for writing and delegates to unregisterPacketEndpointLocked.
+func (s *Stack) UnregisterPacketEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.unregisterPacketEndpointLocked(nicID, netProto, ep)
+}
+
+// unregisterPacketEndpointLocked unregisters ep for netProto from the NIC
+// nicID, or from every NIC when nicID is 0. An unknown nicID is ignored.
+//
+// Callers in this file invoke it with s.mu held.
+func (s *Stack) unregisterPacketEndpointLocked(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, ep PacketEndpoint) {
+	// Unregister in a single device.
+	if nicID != 0 {
+		if nic, found := s.nics[nicID]; found {
+			nic.unregisterPacketEndpoint(netProto, ep)
+		}
+		return
+	}
+
+	// No NIC was specified: unregister on all devices.
+	for _, nic := range s.nics {
+		nic.unregisterPacketEndpoint(netProto, ep)
+	}
+}
+
+// WritePacket writes data directly to the specified NIC. It prepends an
+// ethernet header built from the NIC's link address, dst and netProto.
+//
+// It returns tcpip.ErrUnknownDevice if no NIC with the given nicID exists;
+// otherwise it returns the result of the link endpoint's WriteRawPacket.
+func (s *Stack) WritePacket(nicID tcpip.NICID, dst tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, payload buffer.VectorisedView) *tcpip.Error {
+	// s.mu only protects the NIC lookup; the write itself runs without it.
+	s.mu.Lock()
+	nic, found := s.nics[nicID]
+	s.mu.Unlock()
+	if !found {
+		return tcpip.ErrUnknownDevice
+	}
+
+	// Add our own fake ethernet header in front of the payload.
+	eth := make(header.Ethernet, header.EthernetMinimumSize)
+	eth.Encode(&header.EthernetFields{
+		SrcAddr: nic.linkEP.LinkAddress(),
+		DstAddr: dst,
+		Type:    netProto,
+	})
+	packet := buffer.View(eth).ToVectorisedView()
+	packet.Append(payload)
+
+	return nic.linkEP.WriteRawPacket(packet)
+}
+
+// WriteRawPacket writes data directly to the specified NIC without adding any
+// headers.
+//
+// It returns tcpip.ErrUnknownDevice if no NIC with the given nicID exists;
+// otherwise it returns the result of the link endpoint's WriteRawPacket.
+func (s *Stack) WriteRawPacket(nicID tcpip.NICID, payload buffer.VectorisedView) *tcpip.Error {
+	// s.mu only protects the NIC lookup; the write itself runs without it.
+	s.mu.Lock()
+	nic, found := s.nics[nicID]
+	s.mu.Unlock()
+	if !found {
+		return tcpip.ErrUnknownDevice
+	}
+
+	return nic.linkEP.WriteRawPacket(payload)
+}
+
+// NetworkProtocolInstance returns the protocol instance in the stack for the
+// specified network protocol, or nil if that protocol is not registered.
+// This method is public for protocol implementers and tests to use.
+func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol {
+	instance, found := s.networkProtocols[num]
+	if !found {
+		return nil
+	}
+	return instance
+}
+
+// TransportProtocolInstance returns the protocol instance in the stack for the
+// specified transport protocol, or nil if that protocol is not registered.
+// This method is public for protocol implementers and tests to use.
+func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol {
+	state, found := s.transportProtocols[num]
+	if !found {
+		return nil
+	}
+	return state.proto
+}
+
+// AddTCPProbe installs a probe function that will be invoked on every segment
+// received by a given TCP endpoint. The probe function is passed a copy of the
+// TCP endpoint state before and after processing of the segment.
+//
+// NOTE: TCPProbe is added only to endpoints created after this call. Endpoints
+// created prior to this call will not call the probe function.
+//
+// Further, installing two different probes back to back can result in some
+// endpoints calling the first one and some the second one. There is no
+// guarantee provided on which probe will be invoked. Ideally this should only
+// be called once per stack.
+func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.tcpProbeFunc = probe
+}
+
+// GetTCPProbe returns the TCPProbeFunc most recently installed with
+// AddTCPProbe, or nil if none is installed.
+func (s *Stack) GetTCPProbe() TCPProbeFunc {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	return s.tcpProbeFunc
+}
+
+// RemoveTCPProbe removes an installed TCP probe.
+//
+// NOTE: This only ensures that endpoints created after this call do not
+// have a probe attached. Endpoints already created will continue to invoke
+// TCP probe.
+func (s *Stack) RemoveTCPProbe() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	s.tcpProbeFunc = nil
+}
+
+// JoinGroup joins the given multicast group on the given NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given nicID exists.
+func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error {
+	// TODO: notify network of subscription via igmp protocol.
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, found := s.nics[nicID]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	return nic.joinGroup(protocol, multicastAddr)
+}
+
+// LeaveGroup leaves the given multicast group on the given NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given nicID exists.
+// The protocol argument is not consulted in this body.
+func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, found := s.nics[nicID]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	return nic.leaveGroup(multicastAddr)
+}
+
+// IsInGroup reports whether the NIC with ID nicID has joined the multicast
+// group multicastAddr.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given nicID exists.
+func (s *Stack) IsInGroup(nicID tcpip.NICID, multicastAddr tcpip.Address) (bool, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nic, found := s.nics[nicID]
+	if !found {
+		return false, tcpip.ErrUnknownNICID
+	}
+	return nic.isInGroup(multicastAddr), nil
+}
+
+// IPTables returns the stack's iptables. The returned pointer is the stack's
+// own instance, not a copy.
+func (s *Stack) IPTables() *IPTables {
+	return s.tables
+}
+
+// ICMPLimit returns the maximum number of ICMP messages that can be sent
+// in one second, as configured on the stack's ICMP rate limiter.
+func (s *Stack) ICMPLimit() rate.Limit {
+	return s.icmpRateLimiter.Limit()
+}
+
+// SetICMPLimit sets the maximum number of ICMP messages that can be sent
+// in one second.
+func (s *Stack) SetICMPLimit(newLimit rate.Limit) {
+	s.icmpRateLimiter.SetLimit(newLimit)
+}
+
+// ICMPBurst returns the maximum number of ICMP messages that can be sent
+// in a single burst, as configured on the stack's ICMP rate limiter.
+func (s *Stack) ICMPBurst() int {
+	return s.icmpRateLimiter.Burst()
+}
+
+// SetICMPBurst sets the maximum number of ICMP messages that can be sent
+// in a single burst on the stack's ICMP rate limiter.
+func (s *Stack) SetICMPBurst(burst int) {
+	s.icmpRateLimiter.SetBurst(burst)
+}
+
+// AllowICMPMessage returns true if the rate limiter allows at least one
+// ICMP message to be sent at this instant.
+func (s *Stack) AllowICMPMessage() bool {
+	return s.icmpRateLimiter.Allow()
+}
+
+// IsAddrTentative returns true if addr is tentative on the NIC with ID id.
+//
+// Note that if addr is not associated with the NIC with ID id, then this
+// function will return false. It will only return true if the address is
+// associated with the NIC AND it is tentative.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) IsAddrTentative(id tcpip.NICID, addr tcpip.Address) (bool, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if nic, found := s.nics[id]; found {
+		return nic.isAddrTentative(addr), nil
+	}
+	return false, tcpip.ErrUnknownNICID
+}
+
+// DupTentativeAddrDetected attempts to inform the NIC with ID id that a
+// tentative addr on it is a duplicate on a link.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists;
+// otherwise it returns the NIC's own result.
+func (s *Stack) DupTentativeAddrDetected(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if nic, found := s.nics[id]; found {
+		return nic.dupTentativeAddrDetected(addr)
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// SetNDPConfigurations sets the per-interface NDP configurations on the NIC
+// with ID id to c.
+//
+// Note, if c contains invalid NDP configuration values, it will be fixed to
+// use default values for the erroneous values.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) SetNDPConfigurations(id tcpip.NICID, c NDPConfigurations) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if nic, found := s.nics[id]; found {
+		nic.setNDPConfigs(c)
+		return nil
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// HandleNDPRA hands a validated NDP Router Advertisement message, received
+// from ip, to the NIC with ID id for handling.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) HandleNDPRA(id tcpip.NICID, ip tcpip.Address, ra header.NDPRouterAdvert) *tcpip.Error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if nic, found := s.nics[id]; found {
+		nic.handleNDPRA(ip, ra)
+		return nil
+	}
+	return tcpip.ErrUnknownNICID
+}
+
+// Seed returns a 32 bit value that can be used as a seed value for port
+// picking, ISN generation etc.
+//
+// NOTE: The seed is generated once during stack initialization only, so
+// every call returns the same value.
+func (s *Stack) Seed() uint32 {
+	return s.seed
+}
+
+// Rand returns a reference to a pseudo random generator that can be used
+// to generate random numbers as required. Every caller receives the same
+// generator instance.
+func (s *Stack) Rand() *mathrand.Rand {
+	return s.randomGenerator
+}
+
+func generateRandUint32() uint32 {
+	b := make([]byte, 4)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	return binary.LittleEndian.Uint32(b)
+}
+
+// generateRandInt64 reads eight bytes from rand.Read and decodes them as a
+// little-endian int64. It panics if the read or the decoding fails.
+func generateRandInt64() int64 {
+	var raw [8]byte
+	if _, err := rand.Read(raw[:]); err != nil {
+		panic(err)
+	}
+
+	var out int64
+	if err := binary.Read(bytes.NewReader(raw[:]), binary.LittleEndian, &out); err != nil {
+		panic(err)
+	}
+	return out
+}
+
+// FindNetworkEndpoint returns the network endpoint for the given address.
+//
+// Every NIC is searched for an endpoint whose ID matches address, and the
+// first match is returned. tcpip.ErrBadAddress is returned if no NIC has
+// such an endpoint. The netProto argument is not consulted in this body.
+func (s *Stack) FindNetworkEndpoint(netProto tcpip.NetworkProtocolNumber, address tcpip.Address) (NetworkEndpoint, *tcpip.Error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	id := NetworkEndpointID{address}
+	for _, nic := range s.nics {
+		// The endpoints map lives inside nic.mu, so the lookup itself must run
+		// under the NIC's lock. Previously the map was read first and the lock
+		// was only taken (and deferred) after a hit, leaving the read
+		// unsynchronized.
+		nic.mu.RLock()
+		ref, ok := nic.mu.endpoints[id]
+		nic.mu.RUnlock()
+
+		if ok {
+			// An endpoint with this id exists, return it.
+			return ref.ep, nil
+		}
+	}
+	return nil, tcpip.ErrBadAddress
+}
+
+// FindNICNameFromID returns the name of the NIC with the given NICID, or the
+// empty string if no such NIC exists.
+func (s *Stack) FindNICNameFromID(id tcpip.NICID) string {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if nic, found := s.nics[id]; found {
+		return nic.Name()
+	}
+	return ""
+}
diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go
new file mode 100644
index 000000000..dfec4258a
--- /dev/null
+++ b/pkg/tcpip/stack/stack_global_state.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+// StackFromEnv is the package-level Stack created during a restore run.
+// FIXME(b/36201077)
+var StackFromEnv *Stack
diff --git a/pkg/tcpip/stack/stack_options.go b/pkg/tcpip/stack/stack_options.go
new file mode 100644
index 000000000..0b093e6c5
--- /dev/null
+++ b/pkg/tcpip/stack/stack_options.go
@@ -0,0 +1,106 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import "gvisor.dev/gvisor/pkg/tcpip"
+
+const (
+	// MinBufferSize is the smallest send or receive buffer size SetOption allows.
+	MinBufferSize = 4 << 10 // 4 KiB
+
+	// DefaultBufferSize is the default size of the send/receive buffer for a
+	// transport endpoint.
+	DefaultBufferSize = 212 << 10 // 212 KiB
+
+	// DefaultMaxBufferSize is the default maximum permitted size of a
+	// send/receive buffer.
+	DefaultMaxBufferSize = 4 << 20 // 4 MiB
+)
+
+// SendBufferSizeOption is used by (*Stack).Option/SetOption to get/set the
+// default, min and max send buffer sizes.
+type SendBufferSizeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+// ReceiveBufferSizeOption is used by (*Stack).Option/SetOption to get/set the
+// default, min and max receive buffer sizes.
+type ReceiveBufferSizeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+// SetOption applies a stack-wide option. Buffer size options must satisfy
+// MinBufferSize <= Min <= Default <= Max or tcpip.ErrInvalidOptionValue is
+// returned; unsupported option types yield tcpip.ErrUnknownProtocolOption.
+func (s *Stack) SetOption(option interface{}) *tcpip.Error {
+	switch opt := option.(type) {
+	case SendBufferSizeOption:
+		switch {
+		case opt.Min < MinBufferSize:
+			// Refuse to shrink the buffer below the minimum the stack
+			// needs in order to work.
+			return tcpip.ErrInvalidOptionValue
+		case opt.Default < opt.Min, opt.Default > opt.Max:
+			return tcpip.ErrInvalidOptionValue
+		}
+
+		s.mu.Lock()
+		s.sendBufferSize = opt
+		s.mu.Unlock()
+		return nil
+
+	case ReceiveBufferSizeOption:
+		switch {
+		case opt.Min < MinBufferSize:
+			// Refuse to shrink the buffer below the minimum the stack
+			// needs in order to work.
+			return tcpip.ErrInvalidOptionValue
+		case opt.Default < opt.Min, opt.Default > opt.Max:
+			return tcpip.ErrInvalidOptionValue
+		}
+
+		s.mu.Lock()
+		s.receiveBufferSize = opt
+		s.mu.Unlock()
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// Option retrieves a stack-wide option by copying its current value into option.
+func (s *Stack) Option(option interface{}) *tcpip.Error {
+	switch dst := option.(type) {
+	case *SendBufferSizeOption:
+		s.mu.RLock()
+		*dst = s.sendBufferSize
+		s.mu.RUnlock()
+		return nil
+
+	case *ReceiveBufferSizeOption:
+		s.mu.RLock()
+		*dst = s.receiveBufferSize
+		s.mu.RUnlock()
+		return nil
+
+	default:
+		// Not a pointer to one of the option types handled above.
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
diff --git a/pkg/tcpip/stack/stack_test.go b/pkg/tcpip/stack/stack_test.go
new file mode 100644
index 000000000..7657a4101
--- /dev/null
+++ b/pkg/tcpip/stack/stack_test.go
@@ -0,0 +1,3420 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package stack_test contains tests for the stack. It is in its own package so
+// that the tests can also validate that all definitions needed to implement
+// transport and network protocols are properly exported by the stack package.
+package stack_test
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"sort"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+)
+
+const (
+	fakeNetNumber        tcpip.NetworkProtocolNumber = math.MaxUint32
+	fakeNetHeaderLen                                 = 12
+	fakeDefaultPrefixLen                             = 8
+
+	// fakeControlProtocol is the transport protocol number that marks a
+	// control packet; such packets represent destination port unreachable.
+	fakeControlProtocol tcpip.TransportProtocolNumber = 2
+
+	// defaultMTU is the MTU, in bytes, used throughout the tests, except
+	// where another value is explicitly used. It is chosen to match the MTU
+	// of loopback interfaces on Linux systems.
+	defaultMTU = 65536
+
+	dstAddrOffset        = 0 // fake header byte holding the destination address
+	srcAddrOffset        = 1 // fake header byte holding the source address
+	protocolNumberOffset = 2 // fake header byte holding the transport protocol number
+)
+
+// fakeNetworkEndpoint is a network-layer protocol endpoint. It counts sent and
+// received packets; the counts of all endpoints are aggregated in the protocol
+// descriptor (see fakeNetworkProtocol).
+//
+// Headers of this protocol are fakeNetHeaderLen bytes long, but only the first
+// three bytes are used: destination address, source address, and transport
+// protocol number, in that order. Each is a single byte to simplify parsing.
+type fakeNetworkEndpoint struct {
+	nicID      tcpip.NICID
+	id         stack.NetworkEndpointID
+	prefixLen  int
+	proto      *fakeNetworkProtocol
+	dispatcher stack.TransportDispatcher
+	ep         stack.LinkEndpoint
+}
+
+func (f *fakeNetworkEndpoint) MTU() uint32 { // link MTU less the link and fake header lengths
+	return f.ep.MTU() - uint32(f.MaxHeaderLength())
+}
+
+func (f *fakeNetworkEndpoint) NICID() tcpip.NICID { // NIC ID given to NewEndpoint
+	return f.nicID
+}
+
+func (f *fakeNetworkEndpoint) PrefixLen() int { // prefix length given to NewEndpoint
+	return f.prefixLen
+}
+
+func (*fakeNetworkEndpoint) DefaultTTL() uint8 { // same constant for every endpoint
+	return 123
+}
+
+func (f *fakeNetworkEndpoint) ID() *stack.NetworkEndpointID { // pointer to the endpoint's own ID, not a copy
+	return &f.id
+}
+
+func (f *fakeNetworkEndpoint) HandlePacket(r *stack.Route, pkt *stack.PacketBuffer) { // counts pkt, then hands it to the dispatcher
+	// Count the packet in the protocol's slot for this endpoint's address.
+	f.proto.packetCount[int(f.id.LocalAddress[0])%len(f.proto.packetCount)]++
+
+	// Control packets carry a second fake header at the front of Data.
+	if pkt.NetworkHeader[protocolNumberOffset] == uint8(fakeControlProtocol) {
+		nb, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+		if !ok {
+			return // PullUp failed: no complete inner header to parse.
+		}
+		pkt.Data.TrimFront(fakeNetHeaderLen)
+		f.dispatcher.DeliverTransportControlPacket(
+			tcpip.Address(nb[srcAddrOffset:srcAddrOffset+1]),
+			tcpip.Address(nb[dstAddrOffset:dstAddrOffset+1]),
+			fakeNetNumber,
+			tcpip.TransportProtocolNumber(nb[protocolNumberOffset]),
+			stack.ControlPortUnreachable, 0, pkt)
+		return
+	}
+
+	// Everything else goes to the transport protocol named in the header.
+	f.dispatcher.DeliverTransportPacket(r, tcpip.TransportProtocolNumber(pkt.NetworkHeader[protocolNumberOffset]), pkt)
+}
+
+func (f *fakeNetworkEndpoint) MaxHeaderLength() uint16 { // link endpoint's header length plus the fake header
+	return f.ep.MaxHeaderLength() + fakeNetHeaderLen
+}
+
+func (f *fakeNetworkEndpoint) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, dstAddr tcpip.Address) uint16 { // always 0; arguments are ignored
+	return 0
+}
+
+func (f *fakeNetworkEndpoint) Capabilities() stack.LinkEndpointCapabilities { // whatever the link endpoint reports
+	return f.ep.Capabilities()
+}
+
+func (f *fakeNetworkEndpoint) NetworkProtocolNumber() tcpip.NetworkProtocolNumber { // number of the owning protocol
+	return f.proto.Number()
+}
+
+func (f *fakeNetworkEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, params stack.NetworkHeaderParams, pkt *stack.PacketBuffer) *tcpip.Error {
+	// Count the packet in the protocol's slot for the remote address.
+	f.proto.sendPacketCount[int(r.RemoteAddress[0])%len(f.proto.sendPacketCount)]++
+
+	// Prepend the fake header and fill in the three bytes that are used;
+	// of params, only the Protocol field is written to the header.
+	pkt.NetworkHeader = pkt.Header.Prepend(fakeNetHeaderLen)
+	pkt.NetworkHeader[dstAddrOffset] = r.RemoteAddress[0]
+	pkt.NetworkHeader[srcAddrOffset] = f.id.LocalAddress[0]
+	pkt.NetworkHeader[protocolNumberOffset] = byte(params.Protocol)
+
+	if r.Loop&stack.PacketLoop != 0 { // route asks for local delivery as well
+		f.HandlePacket(r, pkt)
+	}
+	if r.Loop&stack.PacketOut == 0 { // route does not send on the link
+		return nil
+	}
+
+	return f.ep.WritePacket(r, gso, fakeNetNumber, pkt)
+}
+
+// WritePackets implements stack.NetworkEndpoint.WritePackets. It always panics.
+func (f *fakeNetworkEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, params stack.NetworkHeaderParams) (int, *tcpip.Error) {
+	panic("not implemented")
+}
+
+func (*fakeNetworkEndpoint) WriteHeaderIncludedPacket(r *stack.Route, pkt *stack.PacketBuffer) *tcpip.Error { // always rejected
+	return tcpip.ErrNotSupported
+}
+
+func (*fakeNetworkEndpoint) Close() {} // no-op
+
+type fakeNetGoodOption bool // handled by both SetOption and Option
+
+type fakeNetBadOption bool // handled by neither SetOption nor Option
+
+type fakeNetInvalidValueOption int // SetOption rejects it with tcpip.ErrInvalidOptionValue
+
+type fakeNetOptions struct {
+	good bool // last value stored through fakeNetGoodOption
+}
+
+// fakeNetworkProtocol is a network-layer protocol descriptor. It aggregates the
+// number of packets sent and received via endpoints of this protocol. Received
+// packets are indexed by local address MOD 10, sent ones by remote address MOD 10.
+type fakeNetworkProtocol struct {
+	packetCount     [10]int
+	sendPacketCount [10]int
+	opts            fakeNetOptions
+}
+
+func (f *fakeNetworkProtocol) Number() tcpip.NetworkProtocolNumber { // always fakeNetNumber
+	return fakeNetNumber
+}
+
+func (f *fakeNetworkProtocol) MinimumPacketSize() int { // one full fake header
+	return fakeNetHeaderLen
+}
+
+func (f *fakeNetworkProtocol) DefaultPrefixLen() int { // always fakeDefaultPrefixLen
+	return fakeDefaultPrefixLen
+}
+
+func (f *fakeNetworkProtocol) PacketCount(intfAddr byte) int { // received count in intfAddr's slot (address MOD 10)
+	return f.packetCount[int(intfAddr)%len(f.packetCount)]
+}
+
+func (*fakeNetworkProtocol) ParseAddresses(v buffer.View) (src, dst tcpip.Address) { // one-byte addresses read from the header in v
+	return tcpip.Address(v[srcAddrOffset : srcAddrOffset+1]), tcpip.Address(v[dstAddrOffset : dstAddrOffset+1])
+}
+
+func (f *fakeNetworkProtocol) NewEndpoint(nicID tcpip.NICID, addrWithPrefix tcpip.AddressWithPrefix, linkAddrCache stack.LinkAddressCache, dispatcher stack.TransportDispatcher, ep stack.LinkEndpoint, _ *stack.Stack) (stack.NetworkEndpoint, *tcpip.Error) { // never fails; linkAddrCache is ignored
+	return &fakeNetworkEndpoint{
+		nicID:      nicID,
+		id:         stack.NetworkEndpointID{LocalAddress: addrWithPrefix.Address},
+		prefixLen:  addrWithPrefix.PrefixLen,
+		proto:      f,
+		dispatcher: dispatcher,
+		ep:         ep,
+	}, nil
+}
+
+func (f *fakeNetworkProtocol) SetOption(option interface{}) *tcpip.Error { // accepts only fakeNetGoodOption
+	switch v := option.(type) {
+	case fakeNetGoodOption:
+		f.opts.good = bool(v)
+		return nil
+	case fakeNetInvalidValueOption: // known type, but every value is rejected
+		return tcpip.ErrInvalidOptionValue
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+func (f *fakeNetworkProtocol) Option(option interface{}) *tcpip.Error { // reads back only *fakeNetGoodOption
+	switch v := option.(type) {
+	case *fakeNetGoodOption:
+		*v = fakeNetGoodOption(f.opts.good)
+		return nil
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// Close implements stack.NetworkProtocol.Close. It is a no-op.
+func (*fakeNetworkProtocol) Close() {}
+
+// Wait implements stack.NetworkProtocol.Wait. It is a no-op.
+func (*fakeNetworkProtocol) Wait() {}
+
+// Parse implements stack.NetworkProtocol.Parse.
+func (*fakeNetworkProtocol) Parse(pkt *stack.PacketBuffer) (tcpip.TransportProtocolNumber, bool, bool) {
+	hdr, ok := pkt.Data.PullUp(fakeNetHeaderLen)
+	if !ok {
+		return 0, false, false // no complete fake header available
+	}
+	pkt.NetworkHeader = hdr // move the header out of Data into NetworkHeader
+	pkt.Data.TrimFront(fakeNetHeaderLen)
+	return tcpip.TransportProtocolNumber(hdr[protocolNumberOffset]), true, true
+}
+
+func fakeNetFactory() stack.NetworkProtocol { // fresh protocol instance with zeroed counters
+	return &fakeNetworkProtocol{}
+}
+
+// linkEPWithMockedAttach is a stack.LinkEndpoint that tests can use to verify
+// that LinkEndpoint.Attach was called with a non-nil dispatcher.
+type linkEPWithMockedAttach struct {
+	stack.LinkEndpoint
+	attached bool
+}
+
+// Attach implements stack.LinkEndpoint.Attach.
+func (l *linkEPWithMockedAttach) Attach(d stack.NetworkDispatcher) {
+	l.LinkEndpoint.Attach(d)
+	l.attached = d != nil // attaching a nil dispatcher counts as a detach
+}
+
+func (l *linkEPWithMockedAttach) isAttached() bool { // result of the most recent Attach call
+	return l.attached
+}
+
+func TestNetworkReceive(t *testing.T) {
+	// Create a stack with the fake network protocol, one nic, and two
+	// addresses attached to it: 1 & 2.
+	ep := channel.New(10, defaultMTU, "")
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x02"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+
+	buf := buffer.NewView(30) // longer than fakeNetHeaderLen, so a full header is present
+
+	// Make sure a packet for an address the NIC does not own is not delivered.
+	buf[dstAddrOffset] = 3
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeNet.packetCount[1] != 0 {
+		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 0)
+	}
+	if fakeNet.packetCount[2] != 0 {
+		t.Errorf("packetCount[2] = %d, want %d", fakeNet.packetCount[2], 0)
+	}
+
+	// Make sure the packet is delivered to the first endpoint.
+	buf[dstAddrOffset] = 1
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeNet.packetCount[1] != 1 {
+		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
+	}
+	if fakeNet.packetCount[2] != 0 {
+		t.Errorf("packetCount[2] = %d, want %d", fakeNet.packetCount[2], 0)
+	}
+
+	// Make sure the packet is delivered to the second endpoint.
+	buf[dstAddrOffset] = 2
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeNet.packetCount[1] != 1 {
+		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
+	}
+	if fakeNet.packetCount[2] != 1 {
+		t.Errorf("packetCount[2] = %d, want %d", fakeNet.packetCount[2], 1)
+	}
+
+	// Make sure the packet is not delivered if the protocol number is wrong.
+	ep.InjectInbound(fakeNetNumber-1, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeNet.packetCount[1] != 1 {
+		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
+	}
+	if fakeNet.packetCount[2] != 1 {
+		t.Errorf("packetCount[2] = %d, want %d", fakeNet.packetCount[2], 1)
+	}
+
+	// Make sure a packet shorter than the fake header is dropped.
+	buf.CapLength(2)
+	ep.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeNet.packetCount[1] != 1 {
+		t.Errorf("packetCount[1] = %d, want %d", fakeNet.packetCount[1], 1)
+	}
+	if fakeNet.packetCount[2] != 1 {
+		t.Errorf("packetCount[2] = %d, want %d", fakeNet.packetCount[2], 1)
+	}
+}
+
+func sendTo(s *stack.Stack, addr tcpip.Address, payload buffer.View) *tcpip.Error { // finds a route to addr, then sends payload on it
+	r, err := s.FindRoute(0, "", addr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		return err
+	}
+	defer r.Release() // the route is only needed for this one send
+	return send(r, payload)
+}
+
+// send writes payload on r as a fakeTransNumber packet with TTL 123 and the
+// default TOS, reserving r.MaxHeaderLength() bytes of header space.
+func send(r stack.Route, payload buffer.View) *tcpip.Error {
+	params := stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}
+	pkt := stack.PacketBuffer{Header: buffer.NewPrependable(int(r.MaxHeaderLength())), Data: payload.ToVectorisedView()}
+	return r.WritePacket(nil /* gso */, params, &pkt)
+}
+
+func testSendTo(t *testing.T, s *stack.Stack, addr tcpip.Address, ep *channel.Endpoint, payload buffer.View) { // expects exactly one packet on ep
+	t.Helper()
+	ep.Drain() // discard earlier packets so only this send is counted
+	if err := sendTo(s, addr, payload); err != nil {
+		t.Error("sendTo failed:", err)
+	}
+	if got, want := ep.Drain(), 1; got != want {
+		t.Errorf("sendTo packet count: got = %d, want %d", got, want)
+	}
+}
+
+func testSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer.View) { // expects exactly one packet on ep
+	t.Helper()
+	ep.Drain() // discard earlier packets so only this send is counted
+	if err := send(r, payload); err != nil {
+		t.Error("send failed:", err)
+	}
+	if got, want := ep.Drain(), 1; got != want {
+		t.Errorf("send packet count: got = %d, want %d", got, want)
+	}
+}
+
+func testFailingSend(t *testing.T, r stack.Route, ep *channel.Endpoint, payload buffer.View, wantErr *tcpip.Error) { // expects send over r to return wantErr; ep is unused
+	t.Helper()
+	if gotErr := send(r, payload); gotErr != wantErr {
+		t.Errorf("send(_, _): got err = %v, want = %v", gotErr, wantErr)
+	}
+}
+
+func testFailingSendTo(t *testing.T, s *stack.Stack, addr tcpip.Address, ep *channel.Endpoint, payload buffer.View, wantErr *tcpip.Error) { // expects sendTo(addr) to return wantErr; ep is unused
+	t.Helper()
+	if gotErr := sendTo(s, addr, payload); gotErr != wantErr {
+		t.Errorf("sendTo(_, %s, _): got err = %v, want = %v", addr, gotErr, wantErr)
+	}
+}
+
+func testRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View) {
+	t.Helper()
+	// One packet is injected and must be counted, so the count for
+	// localAddrByte is expected to grow by exactly one.
+	testRecvInternal(t, fakeNet, localAddrByte, ep, buf, fakeNet.PacketCount(localAddrByte)+1)
+}
+
+func testFailingRecv(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View) {
+	t.Helper()
+	// The injected packet must NOT be counted, so the count for
+	// localAddrByte is expected to stay exactly where it is now.
+	testRecvInternal(t, fakeNet, localAddrByte, ep, buf, fakeNet.PacketCount(localAddrByte))
+}
+
+// testRecvInternal injects buf on ep as a fakeNetNumber packet and checks that
+// the received-packet count for localAddrByte then equals want.
+func testRecvInternal(t *testing.T, fakeNet *fakeNetworkProtocol, localAddrByte byte, ep *channel.Endpoint, buf buffer.View, want int) {
+	t.Helper()
+	pkt := stack.PacketBuffer{Data: buf.ToVectorisedView()}
+	ep.InjectInbound(fakeNetNumber, &pkt)
+	if got := fakeNet.PacketCount(localAddrByte); got != want {
+		t.Errorf("receive packet count: got = %d, want %d", got, want)
+	}
+}
+
+func TestNetworkSend(t *testing.T) {
+	// Create a stack with the fake network protocol, one nic, and one
+	// address: 1. The route table sends all packets through the only
+	// existing nic.
+	ep := channel.New(10, defaultMTU, "")
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	{ // single route: address \x00 with mask \x00, via NIC 1
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	// Make sure that the link-layer endpoint received the outbound packet.
+	testSendTo(t, s, "\x03", ep, nil)
+}
+
+func TestNetworkSendMultiRoute(t *testing.T) {
+	// Create a stack with the fake network protocol, two nics, and two
+	// addresses per nic: the first nic has odd addresses, the second one
+	// has even addresses.
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep1 := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x03"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	ep2 := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(2, ep2); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	if err := s.AddAddress(2, fakeNetNumber, "\x04"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	// Set a route table that sends all packets with odd destination
+	// addresses through the first NIC, and all even destination addresses
+	// through the second one.
+	{
+		subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{
+			{Destination: subnet1, Gateway: "\x00", NIC: 1},
+			{Destination: subnet0, Gateway: "\x00", NIC: 2},
+		})
+	}
+
+	// Send a packet to an odd destination; it must leave through ep1.
+	testSendTo(t, s, "\x05", ep1, nil)
+
+	// Send a packet to an even destination; it must leave through ep2.
+	testSendTo(t, s, "\x06", ep2, nil)
+}
+
+// testRoute checks that FindRoute yields a route from srcAddr to dstAddr via
+// nic whose local address is expectedSrcAddr and whose remote one is dstAddr.
+func testRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr, expectedSrcAddr tcpip.Address) {
+	t.Helper() // report failures at the caller's line, like the other helpers
+	r, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatal("FindRoute failed:", err)
+	}
+	defer r.Release()
+	if r.LocalAddress != expectedSrcAddr {
+		t.Fatalf("Bad source address: expected %v, got %v", expectedSrcAddr, r.LocalAddress)
+	}
+	if r.RemoteAddress != dstAddr {
+		t.Fatalf("Bad destination address: expected %v, got %v", dstAddr, r.RemoteAddress)
+	}
+}
+
+// testNoRoute checks that FindRoute from srcAddr to dstAddr via nic fails with tcpip.ErrNoRoute.
+func testNoRoute(t *testing.T, s *stack.Stack, nic tcpip.NICID, srcAddr, dstAddr tcpip.Address) {
+	t.Helper() // report failures at the caller's line, like the other helpers
+	if _, err := s.FindRoute(nic, srcAddr, dstAddr, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNoRoute {
+		t.Fatalf("FindRoute returned unexpected error, got = %v, want = %s", err, tcpip.ErrNoRoute)
+	}
+}
+
+// TestAttachToLinkEndpointImmediately tests that a LinkEndpoint is attached to
+// a NetworkDispatcher at NIC creation, whether the NIC starts enabled or disabled.
+func TestAttachToLinkEndpointImmediately(t *testing.T) {
+	const nicID = 1
+
+	tests := []struct {
+		name    string
+		nicOpts stack.NICOptions
+	}{
+		{
+			name:    "Create enabled NIC",
+			nicOpts: stack.NICOptions{Disabled: false},
+		},
+		{
+			name:    "Create disabled NIC",
+			nicOpts: stack.NICOptions{Disabled: true},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+
+			e := linkEPWithMockedAttach{
+				LinkEndpoint: loopback.New(),
+			}
+
+			if err := s.CreateNICWithOptions(nicID, &e, test.nicOpts); err != nil {
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, test.nicOpts, err)
+			}
+			if !e.isAttached() { // Attach must already have run with a non-nil dispatcher
+				t.Fatal("link endpoint not attached to a network dispatcher")
+			}
+		})
+	}
+}
+
+// TestDisableUnknownNIC checks that disabling a NIC ID that was never created
+// is rejected with tcpip.ErrUnknownNICID.
+func TestDisableUnknownNIC(t *testing.T) {
+	opts := stack.Options{NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}}
+	got := stack.New(opts).DisableNIC(1)
+	if want := tcpip.ErrUnknownNICID; got != want {
+		t.Fatalf("got s.DisableNIC(1) = %v, want = %s", got, want)
+	}
+}
+
+func TestDisabledNICsNICInfoAndCheckNIC(t *testing.T) { // NICInfo's Running flag and CheckNIC must track enable/disable
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	e := loopback.New()
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	checkNIC := func(enabled bool) { // verifies both views of the NIC agree with enabled
+		t.Helper()
+
+		allNICInfo := s.NICInfo()
+		nicInfo, ok := allNICInfo[nicID]
+		if !ok {
+			t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo)
+		} else if nicInfo.Flags.Running != enabled {
+			t.Errorf("got nicInfo.Flags.Running = %t, want = %t", nicInfo.Flags.Running, enabled)
+		}
+
+		if got := s.CheckNIC(nicID); got != enabled {
+			t.Errorf("got s.CheckNIC(%d) = %t, want = %t", nicID, got, enabled)
+		}
+	}
+
+	// The NIC was created disabled, so it should initially report that.
+	checkNIC(false)
+
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(true)
+
+	// If the NIC is not reporting a correct enabled status, we cannot trust the
+	// next check, so end the test here.
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	checkNIC(false)
+}
+
+// TestRemoveUnknownNIC checks that removing a NIC ID that was never created
+// is rejected with tcpip.ErrUnknownNICID.
+func TestRemoveUnknownNIC(t *testing.T) {
+	opts := stack.Options{NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()}}
+	got := stack.New(opts).RemoveNIC(1)
+	if want := tcpip.ErrUnknownNICID; got != want {
+		t.Fatalf("got s.RemoveNIC(1) = %v, want = %s", got, want)
+	}
+}
+
+func TestRemoveNIC(t *testing.T) { // RemoveNIC must drop the NICInfo entry and detach the link endpoint
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	e := linkEPWithMockedAttach{
+		LinkEndpoint: loopback.New(),
+	}
+	if err := s.CreateNIC(nicID, &e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	// The NIC should be present in NICInfo and attached to a NetworkDispatcher.
+	allNICInfo := s.NICInfo()
+	if _, ok := allNICInfo[nicID]; !ok {
+		t.Errorf("entry for %d missing from allNICInfo = %+v", nicID, allNICInfo)
+	}
+	if !e.isAttached() {
+		t.Fatal("link endpoint not attached to a network dispatcher")
+	}
+
+	// Removing a NIC should remove it from NICInfo, and e should be detached
+	// from the NetworkDispatcher.
+	if err := s.RemoveNIC(nicID); err != nil {
+		t.Fatalf("s.RemoveNIC(%d): %s", nicID, err)
+	}
+	if nicInfo, ok := s.NICInfo()[nicID]; ok {
+		t.Errorf("got unexpected NICInfo entry for deleted NIC %d = %+v", nicID, nicInfo)
+	}
+	if e.isAttached() {
+		t.Error("link endpoint for removed NIC still attached to a network dispatcher")
+	}
+}
+
+func TestRouteWithDownNIC(t *testing.T) {
+	tests := []struct {
+		name   string
+		downFn func(s *stack.Stack, nicID tcpip.NICID) *tcpip.Error
+		upFn   func(s *stack.Stack, nicID tcpip.NICID) *tcpip.Error // nil when the NIC cannot be brought back up
+	}{
+		{
+			name:   "Disabled NIC",
+			downFn: (*stack.Stack).DisableNIC,
+			upFn:   (*stack.Stack).EnableNIC,
+		},
+
+		// Once a NIC is removed, it cannot be brought up, so upFn is left nil.
+		{
+			name:   "Removed NIC",
+			downFn: (*stack.Stack).RemoveNIC,
+		},
+	}
+
+	const unspecifiedNIC = 0
+	const nicID1 = 1
+	const nicID2 = 2
+	const addr1 = tcpip.Address("\x01")
+	const addr2 = tcpip.Address("\x02")
+	const nic1Dst = tcpip.Address("\x05")
+	const nic2Dst = tcpip.Address("\x06")
+
+	setup := func(t *testing.T) (*stack.Stack, *channel.Endpoint, *channel.Endpoint) {
+		s := stack.New(stack.Options{
+			NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		})
+
+		ep1 := channel.New(1, defaultMTU, "")
+		if err := s.CreateNIC(nicID1, ep1); err != nil {
+			t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+		}
+
+		if err := s.AddAddress(nicID1, fakeNetNumber, addr1); err != nil {
+			t.Fatalf("AddAddress(%d, %d, %s): %s", nicID1, fakeNetNumber, addr1, err)
+		}
+
+		ep2 := channel.New(1, defaultMTU, "")
+		if err := s.CreateNIC(nicID2, ep2); err != nil {
+			t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+		}
+
+		if err := s.AddAddress(nicID2, fakeNetNumber, addr2); err != nil {
+			t.Fatalf("AddAddress(%d, %d, %s): %s", nicID2, fakeNetNumber, addr2, err)
+		}
+
+		// Set a route table that sends all packets with odd destination
+		// addresses through the first NIC, and all even destination addresses
+		// through the second one.
+		{
+			subnet0, err := tcpip.NewSubnet("\x00", "\x01")
+			if err != nil {
+				t.Fatal(err)
+			}
+			subnet1, err := tcpip.NewSubnet("\x01", "\x01")
+			if err != nil {
+				t.Fatal(err)
+			}
+			s.SetRouteTable([]tcpip.Route{
+				{Destination: subnet1, Gateway: "\x00", NIC: nicID1},
+				{Destination: subnet0, Gateway: "\x00", NIC: nicID2},
+			})
+		}
+
+		return s, ep1, ep2
+	}
+
+	// Tests that routes through a down NIC are not used when looking up a route
+	// for a destination.
+	t.Run("Find", func(t *testing.T) {
+		for _, test := range tests {
+			t.Run(test.name, func(t *testing.T) {
+				s, _, _ := setup(t)
+
+				// Test routes to an odd address (same value as nic1Dst).
+				testRoute(t, s, unspecifiedNIC, "", "\x05", addr1)
+				testRoute(t, s, unspecifiedNIC, addr1, "\x05", addr1)
+				testRoute(t, s, nicID1, addr1, "\x05", addr1)
+
+				// Test routes to an even address (same value as nic2Dst).
+				testRoute(t, s, unspecifiedNIC, "", "\x06", addr2)
+				testRoute(t, s, unspecifiedNIC, addr2, "\x06", addr2)
+				testRoute(t, s, nicID2, addr2, "\x06", addr2)
+
+				// Bringing NIC1 down should result in no routes to odd addresses. Routes to
+				// even addresses should continue to be available as NIC2 is still up.
+				if err := test.downFn(s, nicID1); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID1, err)
+				}
+				testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+				testNoRoute(t, s, nicID1, addr1, nic1Dst)
+				testRoute(t, s, unspecifiedNIC, "", nic2Dst, addr2)
+				testRoute(t, s, unspecifiedNIC, addr2, nic2Dst, addr2)
+				testRoute(t, s, nicID2, addr2, nic2Dst, addr2)
+
+				// Bringing NIC2 down should result in no routes to even addresses. No
+				// route should be available to any address as routes to odd addresses
+				// were made unavailable by bringing NIC1 down above.
+				if err := test.downFn(s, nicID2); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID2, err)
+				}
+				testNoRoute(t, s, unspecifiedNIC, "", nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr1, nic1Dst)
+				testNoRoute(t, s, nicID1, addr1, nic1Dst)
+				testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+				testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+				testNoRoute(t, s, nicID2, addr2, nic2Dst)
+
+				if upFn := test.upFn; upFn != nil {
+					// Bringing NIC1 up should make routes to odd addresses available
+					// again. Routes to even addresses should continue to be unavailable
+					// as NIC2 is still down.
+					if err := upFn(s, nicID1); err != nil {
+						t.Fatalf("test.upFn(_, %d): %s", nicID1, err)
+					}
+					testRoute(t, s, unspecifiedNIC, "", nic1Dst, addr1)
+					testRoute(t, s, unspecifiedNIC, addr1, nic1Dst, addr1)
+					testRoute(t, s, nicID1, addr1, nic1Dst, addr1)
+					testNoRoute(t, s, unspecifiedNIC, "", nic2Dst)
+					testNoRoute(t, s, unspecifiedNIC, addr2, nic2Dst)
+					testNoRoute(t, s, nicID2, addr2, nic2Dst)
+				}
+			})
+		}
+	})
+
+	// Tests that writing a packet using a Route through a down NIC fails.
+	t.Run("WritePacket", func(t *testing.T) {
+		for _, test := range tests {
+			t.Run(test.name, func(t *testing.T) {
+				s, ep1, ep2 := setup(t)
+
+				r1, err := s.FindRoute(nicID1, addr1, nic1Dst, fakeNetNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID1, addr1, nic1Dst, fakeNetNumber, err)
+				}
+				defer r1.Release()
+
+				r2, err := s.FindRoute(nicID2, addr2, nic2Dst, fakeNetNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Errorf("FindRoute(%d, %s, %s, %d, false): %s", nicID2, addr2, nic2Dst, fakeNetNumber, err)
+				}
+				defer r2.Release()
+
+				// If we failed to get routes r1 or r2, we cannot proceed with the test.
+				if t.Failed() {
+					t.FailNow()
+				}
+
+				buf := buffer.View([]byte{1})
+				testSend(t, r1, ep1, buf)
+				testSend(t, r2, ep2, buf)
+
+				// Writes with Routes that use NIC1 after being brought down should fail.
+				if err := test.downFn(s, nicID1); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID1, err)
+				}
+				testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+				testSend(t, r2, ep2, buf)
+
+				// Writes with Routes that use NIC2 after being brought down should fail.
+				if err := test.downFn(s, nicID2); err != nil {
+					t.Fatalf("test.downFn(_, %d): %s", nicID2, err)
+				}
+				testFailingSend(t, r1, ep1, buf, tcpip.ErrInvalidEndpointState)
+				testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+
+				if upFn := test.upFn; upFn != nil {
+					// Writes with Routes that use NIC1 after being brought up should
+					// succeed.
+					//
+					// TODO(b/147015577): Should we instead completely invalidate all
+					// Routes that were bound to a NIC that was brought down at some
+					// point?
+					if err := upFn(s, nicID1); err != nil {
+						t.Fatalf("test.upFn(_, %d): %s", nicID1, err)
+					}
+					testSend(t, r1, ep1, buf)
+					testFailingSend(t, r2, ep2, buf, tcpip.ErrInvalidEndpointState)
+				}
+			})
+		}
+	})
+}
+
+// TestRoutes checks FindRoute's NIC and source-address selection on a stack
+// with the fake network protocol and two NICs: NIC 1 owns the odd addresses 1
+// and 3, NIC 2 owns the even addresses 2 and 4.
+func TestRoutes(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	// addNIC creates NIC id on a fresh channel endpoint and assigns it addrs.
+	addNIC := func(id tcpip.NICID, addrs ...tcpip.Address) {
+		if err := s.CreateNIC(id, channel.New(10, defaultMTU, "")); err != nil {
+			t.Fatal("CreateNIC failed:", err)
+		}
+		for _, addr := range addrs {
+			if err := s.AddAddress(id, fakeNetNumber, addr); err != nil {
+				t.Fatal("AddAddress failed:", err)
+			}
+		}
+	}
+	addNIC(1, "\x01", "\x03")
+	addNIC(2, "\x02", "\x04")
+
+	// Install a route table that sends every odd destination address through
+	// NIC 1 and every even destination address through NIC 2.
+	evenDsts, err := tcpip.NewSubnet("\x00", "\x01")
+	if err != nil {
+		t.Fatal(err)
+	}
+	oddDsts, err := tcpip.NewSubnet("\x01", "\x01")
+	if err != nil {
+		t.Fatal(err)
+	}
+	s.SetRouteTable([]tcpip.Route{
+		{Destination: oddDsts, Gateway: "\x00", NIC: 1},
+		{Destination: evenDsts, Gateway: "\x00", NIC: 2},
+	})
+
+	// Lookups that must succeed, with the local address each one must yield.
+	for _, q := range []struct {
+		nic               tcpip.NICID
+		src, dst, wantSrc tcpip.Address
+	}{
+		// Odd destination.
+		{0, "", "\x05", "\x01"},
+		{0, "\x01", "\x05", "\x01"},
+		{1, "\x01", "\x05", "\x01"},
+		{0, "\x03", "\x05", "\x03"},
+		{1, "\x03", "\x05", "\x03"},
+		// Even destination.
+		{0, "", "\x06", "\x02"},
+		{0, "\x02", "\x06", "\x02"},
+		{2, "\x02", "\x06", "\x02"},
+		{0, "\x04", "\x06", "\x04"},
+		{2, "\x04", "\x06", "\x04"},
+	} {
+		testRoute(t, s, q.nic, q.src, q.dst, q.wantSrc)
+	}
+
+	// Lookups that must fail: the source address lives on one NIC while the
+	// destination is routed through the other.
+	for _, q := range []struct {
+		nic      tcpip.NICID
+		src, dst tcpip.Address
+	}{
+		// Even source, odd destination.
+		{0, "\x02", "\x05"},
+		{2, "\x02", "\x05"},
+		{0, "\x04", "\x05"},
+		{2, "\x04", "\x05"},
+		// Odd source, even destination.
+		{0, "\x01", "\x06"},
+		{1, "\x01", "\x06"},
+		{0, "\x03", "\x06"},
+		{1, "\x03", "\x06"},
+	} {
+		testNoRoute(t, s, q.nic, q.src, q.dst)
+	}
+}
+
+// TestAddressRemoval checks that packets are received and sent while a local
+// address is assigned, and that both stop working once it has been removed.
+func TestAddressRemoval(t *testing.T) {
+	const localAddrByte byte = 0x01
+	localAddr := tcpip.Address([]byte{localAddrByte})
+	remoteAddr := tcpip.Address("\x02")
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+	if err := s.AddAddress(1, fakeNetNumber, localAddr); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+	buf := buffer.NewView(30)
+
+	// With the address assigned, receiving and sending must both succeed.
+	buf[dstAddrOffset] = localAddrByte
+	testRecv(t, fakeNet, localAddrByte, ep, buf)
+	testSendTo(t, s, remoteAddr, ep, nil)
+
+	// Remove the address, then check that send/receive doesn't work anymore.
+	if err := s.RemoveAddress(1, localAddr); err != nil {
+		t.Fatal("RemoveAddress failed:", err)
+	}
+	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+	testFailingSendTo(t, s, remoteAddr, ep, nil, tcpip.ErrNoRoute)
+
+	// Removing the same address a second time must fail with ErrBadLocalAddress.
+	if err := s.RemoveAddress(1, localAddr); err != tcpip.ErrBadLocalAddress {
+		t.Fatalf("RemoveAddress returned unexpected error, got = %v, want = %s", err, tcpip.ErrBadLocalAddress)
+	}
+}
+
+// TestAddressRemovalWithRouteHeld checks that removing a local address also
+// makes sends fail on a Route that was obtained before the removal.
+func TestAddressRemovalWithRouteHeld(t *testing.T) {
+	const localAddrByte byte = 0x01
+	localAddr := tcpip.Address([]byte{localAddrByte})
+	remoteAddr := tcpip.Address("\x02")
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+	buf := buffer.NewView(30)
+	if err := s.AddAddress(1, fakeNetNumber, localAddr); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	r, err := s.FindRoute(0, "", remoteAddr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatal("FindRoute failed:", err)
+	}
+
+	// With the address assigned, receiving and sending (via the held route and
+	// via a fresh lookup) must all succeed.
+	buf[dstAddrOffset] = localAddrByte
+	testRecv(t, fakeNet, localAddrByte, ep, buf)
+	testSend(t, r, ep, nil)
+	testSendTo(t, s, remoteAddr, ep, nil)
+	// Remove the address, then check that send/receive doesn't work anymore.
+	if err := s.RemoveAddress(1, localAddr); err != nil {
+		t.Fatal("RemoveAddress failed:", err)
+	}
+	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+	testFailingSend(t, r, ep, nil, tcpip.ErrInvalidEndpointState)
+	testFailingSendTo(t, s, remoteAddr, ep, nil, tcpip.ErrNoRoute)
+
+	// Removing the same address a second time must fail with ErrBadLocalAddress.
+	if err := s.RemoveAddress(1, localAddr); err != tcpip.ErrBadLocalAddress {
+		t.Fatalf("RemoveAddress returned unexpected error, got = %v, want = %s", err, tcpip.ErrBadLocalAddress)
+	}
+}
+
+// verifyAddress checks the fakeNet addresses NICInfo reports for nicID: none
+// may be set when addr is empty, otherwise addr and nothing else must be.
+func verifyAddress(t *testing.T, s *stack.Stack, nicID tcpip.NICID, addr tcpip.Address) {
+	t.Helper()
+	nicInfo, present := s.NICInfo()[nicID]
+	if !present {
+		t.Fatalf("NICInfo() failed to find nicID=%d", nicID)
+	}
+
+	wantNone := len(addr) == 0
+	seen := false
+	for _, pa := range nicInfo.ProtocolAddresses {
+		if pa.Protocol != fakeNetNumber {
+			continue
+		}
+		switch got := pa.AddressWithPrefix; {
+		case wantNone:
+			// Only a zero-valued entry is tolerated when no address is expected.
+			if got != (tcpip.AddressWithPrefix{}) {
+				t.Errorf("verify no-address: got = %s, want = %s", got, (tcpip.AddressWithPrefix{}))
+			}
+		case got.Address == addr:
+			seen = true
+		default:
+			t.Errorf("verify address: got = %s, want = %s", got.Address, addr)
+		}
+	}
+	if !wantNone && !seen {
+		t.Errorf("verify address: couldn't find %s on the NIC", addr)
+	}
+}
+
+// TestEndpointExpiration adds and removes a local address, with and without a
+// Route obtained while the address was assigned, and checks after every step
+// whether receiving and sending work, for each promiscuous/spoofing setting.
+func TestEndpointExpiration(t *testing.T) {
+	const (
+		localAddrByte byte          = 0x01
+		remoteAddr    tcpip.Address = "\x03"
+		noAddr        tcpip.Address = ""
+		nicID         tcpip.NICID   = 1
+	)
+	localAddr := tcpip.Address([]byte{localAddrByte})
+
+	for _, promiscuous := range []bool{true, false} {
+		for _, spoofing := range []bool{true, false} {
+			t.Run(fmt.Sprintf("promiscuous=%t spoofing=%t", promiscuous, spoofing), func(t *testing.T) {
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+				})
+
+				ep := channel.New(10, defaultMTU, "")
+				if err := s.CreateNIC(nicID, ep); err != nil {
+					t.Fatal("CreateNIC failed:", err)
+				}
+
+				{
+					subnet, err := tcpip.NewSubnet("\x00", "\x00")
+					if err != nil {
+						t.Fatal(err)
+					}
+					s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+				}
+
+				fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+				buf := buffer.NewView(30)
+				buf[dstAddrOffset] = localAddrByte
+
+				if promiscuous {
+					if err := s.SetPromiscuousMode(nicID, true); err != nil {
+						t.Fatal("SetPromiscuousMode failed:", err)
+					}
+				}
+
+				if spoofing {
+					if err := s.SetSpoofing(nicID, true); err != nil {
+						t.Fatal("SetSpoofing failed:", err)
+					}
+				}
+
+				// 1. No address yet: send needs spoofing, receive needs promiscuous mode.
+				//-----------------------
+				verifyAddress(t, s, nicID, noAddr)
+				if promiscuous {
+					testRecv(t, fakeNet, localAddrByte, ep, buf)
+				} else {
+					testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+				}
+				if spoofing {
+					// FIXME(b/139841518): Spoofing doesn't work if there is no primary address.
+					// testSendTo(t, s, remoteAddr, ep, nil)
+				} else {
+					testFailingSendTo(t, s, remoteAddr, ep, nil, tcpip.ErrNoRoute)
+				}
+
+				// 2. Add Address, everything should work.
+				//-----------------------
+				if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil {
+					t.Fatal("AddAddress failed:", err)
+				}
+				verifyAddress(t, s, nicID, localAddr)
+				testRecv(t, fakeNet, localAddrByte, ep, buf)
+				testSendTo(t, s, remoteAddr, ep, nil)
+
+				// 3. Remove the address: send needs spoofing, receive promiscuous mode.
+				//-----------------------
+				if err := s.RemoveAddress(nicID, localAddr); err != nil {
+					t.Fatal("RemoveAddress failed:", err)
+				}
+				verifyAddress(t, s, nicID, noAddr)
+				if promiscuous {
+					testRecv(t, fakeNet, localAddrByte, ep, buf)
+				} else {
+					testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+				}
+				if spoofing {
+					// FIXME(b/139841518): Spoofing doesn't work if there is no primary address.
+					// testSendTo(t, s, remoteAddr, ep, nil)
+				} else {
+					testFailingSendTo(t, s, remoteAddr, ep, nil, tcpip.ErrNoRoute)
+				}
+
+				// 4. Add Address back, everything should work again.
+				//-----------------------
+				if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil {
+					t.Fatal("AddAddress failed:", err)
+				}
+				verifyAddress(t, s, nicID, localAddr)
+				testRecv(t, fakeNet, localAddrByte, ep, buf)
+				testSendTo(t, s, remoteAddr, ep, nil)
+
+				// 5. Take a reference to the endpoint by getting a route. Verify that
+				// we can still send/receive, including sending using the route.
+				//-----------------------
+				r, err := s.FindRoute(0, "", remoteAddr, fakeNetNumber, false /* multicastLoop */)
+				if err != nil {
+					t.Fatal("FindRoute failed:", err)
+				}
+				testRecv(t, fakeNet, localAddrByte, ep, buf)
+				testSendTo(t, s, remoteAddr, ep, nil)
+				testSend(t, r, ep, nil)
+
+				// 6. Remove the address while the route is still held. Send (also via
+				// the held route) needs spoofing, receive needs promiscuous mode.
+				//-----------------------
+				if err := s.RemoveAddress(nicID, localAddr); err != nil {
+					t.Fatal("RemoveAddress failed:", err)
+				}
+				verifyAddress(t, s, nicID, noAddr)
+				if promiscuous {
+					testRecv(t, fakeNet, localAddrByte, ep, buf)
+				} else {
+					testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+				}
+				if spoofing {
+					testSend(t, r, ep, nil)
+					testSendTo(t, s, remoteAddr, ep, nil)
+				} else {
+					testFailingSend(t, r, ep, nil, tcpip.ErrInvalidEndpointState)
+					testFailingSendTo(t, s, remoteAddr, ep, nil, tcpip.ErrNoRoute)
+				}
+
+				// 7. Add Address back, everything should work again.
+				//-----------------------
+				if err := s.AddAddress(nicID, fakeNetNumber, localAddr); err != nil {
+					t.Fatal("AddAddress failed:", err)
+				}
+				verifyAddress(t, s, nicID, localAddr)
+				testRecv(t, fakeNet, localAddrByte, ep, buf)
+				testSendTo(t, s, remoteAddr, ep, nil)
+				testSend(t, r, ep, nil)
+
+				// 8. Release the route, sendTo/recv should still work.
+				//-----------------------
+				r.Release()
+				verifyAddress(t, s, nicID, localAddr)
+				testRecv(t, fakeNet, localAddrByte, ep, buf)
+				testSendTo(t, s, remoteAddr, ep, nil)
+
+				// 9. Remove the address: send needs spoofing, receive promiscuous mode.
+				//-----------------------
+				if err := s.RemoveAddress(nicID, localAddr); err != nil {
+					t.Fatal("RemoveAddress failed:", err)
+				}
+				verifyAddress(t, s, nicID, noAddr)
+				if promiscuous {
+					testRecv(t, fakeNet, localAddrByte, ep, buf)
+				} else {
+					testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+				}
+				if spoofing {
+					// FIXME(b/139841518): Spoofing doesn't work if there is no primary address.
+					// testSendTo(t, s, remoteAddr, ep, nil)
+				} else {
+					testFailingSendTo(t, s, remoteAddr, ep, nil, tcpip.ErrNoRoute)
+				}
+			})
+		}
+	}
+}
+
+// TestPromiscuousMode checks that a NIC without any address only accepts
+// packets while promiscuous mode is enabled, and that no route can be found.
+func TestPromiscuousMode(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+
+	buf := buffer.NewView(30)
+
+	// No endpoint matches the destination, so the packet is not delivered.
+	const localAddrByte byte = 0x01
+	buf[dstAddrOffset] = localAddrByte
+	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+
+	// Set promiscuous mode, then check that packet is delivered.
+	if err := s.SetPromiscuousMode(1, true); err != nil {
+		t.Fatal("SetPromiscuousMode failed:", err)
+	}
+	testRecv(t, fakeNet, localAddrByte, ep, buf)
+
+	// Check that we can't get a route as there is no local address.
+	_, err := s.FindRoute(0, "", "\x02", fakeNetNumber, false /* multicastLoop */)
+	if err != tcpip.ErrNoRoute {
+		t.Fatalf("FindRoute returned unexpected error: got = %v, want = %s", err, tcpip.ErrNoRoute)
+	}
+
+	// Turn promiscuous mode off again; the packet is no longer delivered.
+	if err := s.SetPromiscuousMode(1, false); err != nil {
+		t.Fatal("SetPromiscuousMode failed:", err)
+	}
+	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+}
+
+// TestSpoofingWithAddress checks that FindRoute accepts a source address that
+// is not assigned to the NIC only when spoofing is enabled on that NIC.
+func TestSpoofingWithAddress(t *testing.T) {
+	localAddr := tcpip.Address("\x01")
+	nonExistentLocalAddr := tcpip.Address("\x02")
+	dstAddr := tcpip.Address("\x03")
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, localAddr); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	// Spoofing disabled: FindRoute rejects a source address not on the NIC.
+	r, err := s.FindRoute(0, nonExistentLocalAddr, dstAddr, fakeNetNumber, false /* multicastLoop */)
+	if err == nil {
+		t.Errorf("FindRoute succeeded with route %+v when it should have failed", r)
+	}
+
+	// Spoofing enabled: FindRoute permits any address to be used as the source.
+	if err := s.SetSpoofing(1, true); err != nil {
+		t.Fatal("SetSpoofing failed:", err)
+	}
+	r, err = s.FindRoute(0, nonExistentLocalAddr, dstAddr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatal("FindRoute failed:", err)
+	}
+	if r.LocalAddress != nonExistentLocalAddr {
+		t.Errorf("got Route.LocalAddress = %s, want = %s", r.LocalAddress, nonExistentLocalAddr)
+	}
+	if r.RemoteAddress != dstAddr {
+		t.Errorf("got Route.RemoteAddress = %s, want = %s", r.RemoteAddress, dstAddr)
+	}
+	// Sending a packet works.
+	testSendTo(t, s, dstAddr, ep, nil)
+	testSend(t, r, ep, nil)
+
+	// FindRoute should also work with a local address that exists on the NIC.
+	r, err = s.FindRoute(0, localAddr, dstAddr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatal("FindRoute failed:", err)
+	}
+	if r.LocalAddress != localAddr {
+		// Report the address actually compared against, i.e. localAddr.
+		t.Errorf("got Route.LocalAddress = %s, want = %s", r.LocalAddress, localAddr)
+	}
+	if r.RemoteAddress != dstAddr {
+		t.Errorf("got Route.RemoteAddress = %s, want = %s", r.RemoteAddress, dstAddr)
+	}
+	testSend(t, r, ep, nil)
+}
+
+// TestSpoofingNoAddress checks FindRoute on a NIC that has no address at all:
+// the lookup must fail with spoofing disabled and succeed once it is enabled.
+func TestSpoofingNoAddress(t *testing.T) {
+	nonExistentLocalAddr := tcpip.Address("\x01")
+	dstAddr := tcpip.Address("\x02")
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	// Spoofing disabled: FindRoute rejects a source address not on the NIC.
+	r, err := s.FindRoute(0, nonExistentLocalAddr, dstAddr, fakeNetNumber, false /* multicastLoop */)
+	if err == nil {
+		t.Errorf("FindRoute succeeded with route %+v when it should have failed", r)
+	}
+	// Sending a packet fails.
+	testFailingSendTo(t, s, dstAddr, ep, nil, tcpip.ErrNoRoute)
+
+	// Spoofing enabled: FindRoute permits any address to be used as the source.
+	if err := s.SetSpoofing(1, true); err != nil {
+		t.Fatal("SetSpoofing failed:", err)
+	}
+	r, err = s.FindRoute(0, nonExistentLocalAddr, dstAddr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatal("FindRoute failed:", err)
+	}
+	if r.LocalAddress != nonExistentLocalAddr {
+		t.Errorf("got Route.LocalAddress = %s, want = %s", r.LocalAddress, nonExistentLocalAddr)
+	}
+	if r.RemoteAddress != dstAddr {
+		t.Errorf("got Route.RemoteAddress = %s, want = %s", r.RemoteAddress, dstAddr)
+	}
+	// Sending a packet works.
+	// FIXME(b/139841518): Spoofing doesn't work if there is no primary address.
+	// testSendTo(t, s, dstAddr, ep, nil)
+}
+
+// verifyRoute compares the address fields of gotRoute with those of wantRoute
+// and returns an error describing the first mismatch, or nil if all match.
+func verifyRoute(gotRoute, wantRoute stack.Route) error {
+	switch {
+	case gotRoute.LocalAddress != wantRoute.LocalAddress:
+		return fmt.Errorf("bad local address: got %s, want = %s", gotRoute.LocalAddress, wantRoute.LocalAddress)
+	case gotRoute.RemoteAddress != wantRoute.RemoteAddress:
+		return fmt.Errorf("bad remote address: got %s, want = %s", gotRoute.RemoteAddress, wantRoute.RemoteAddress)
+	case gotRoute.RemoteLinkAddress != wantRoute.RemoteLinkAddress:
+		return fmt.Errorf("bad remote link address: got %s, want = %s", gotRoute.RemoteLinkAddress, wantRoute.RemoteLinkAddress)
+	case gotRoute.NextHop != wantRoute.NextHop:
+		return fmt.Errorf("bad next-hop address: got %s, want = %s", gotRoute.NextHop, wantRoute.NextHop)
+	}
+	return nil
+}
+
+// TestOutgoingBroadcastWithEmptyRouteTable checks that, with an empty route
+// table, a broadcast route is found only via an existing NIC with an address.
+func TestOutgoingBroadcastWithEmptyRouteTable(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	if err := s.CreateNIC(1, channel.New(10, defaultMTU, "")); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+	s.SetRouteTable([]tcpip.Route{})
+
+	// Without an address on the NIC there is no endpoint, so this must fail.
+	if _, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable {
+		t.Fatalf("got FindRoute(1, %s, %s, %d) = %s, want = %s", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable)
+	}
+
+	anyAddr := tcpip.ProtocolAddress{Protocol: fakeNetNumber, AddressWithPrefix: tcpip.AddressWithPrefix{header.IPv4Any, 0}}
+	if err := s.AddProtocolAddress(1, anyAddr); err != nil {
+		t.Fatalf("AddProtocolAddress(1, %v) failed: %v", anyAddr, err)
+	}
+	// With the address in place the lookup must succeed and yield this route.
+	if route, err := s.FindRoute(1, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != nil {
+		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
+	} else if err := verifyRoute(route, stack.Route{LocalAddress: header.IPv4Any, RemoteAddress: header.IPv4Broadcast}); err != nil {
+		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err)
+	}
+
+	// Looking the route up through a NIC that doesn't exist must fail too.
+	if _, err := s.FindRoute(2, header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */); err != tcpip.ErrNetworkUnreachable {
+		t.Fatalf("got FindRoute(2, %v, %v, %d) = %v want = %v", header.IPv4Any, header.IPv4Broadcast, fakeNetNumber, err, tcpip.ErrNetworkUnreachable)
+	}
+}
+
+// TestOutgoingBroadcastWithRouteTable checks which local address FindRoute
+// picks for a broadcast destination: that of the requested NIC when one is
+// given, otherwise the one selected through the route table.
+func TestOutgoingBroadcastWithRouteTable(t *testing.T) {
+	defaultAddr := tcpip.AddressWithPrefix{header.IPv4Any, 0}
+	// Local subnet on NIC1: 192.168.1.58/24, gateway 192.168.1.1.
+	nic1Addr := tcpip.AddressWithPrefix{"\xc0\xa8\x01\x3a", 24}
+	nic1Gateway := tcpip.Address("\xc0\xa8\x01\x01")
+	// Local subnet on NIC2: 10.10.10.5/24, gateway 10.10.10.1.
+	nic2Addr := tcpip.AddressWithPrefix{"\x0a\x0a\x0a\x05", 24}
+	nic2Gateway := tcpip.Address("\x0a\x0a\x0a\x01")
+
+	// Create a new stack with two NICs.
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatalf("CreateNIC failed: %s", err)
+	}
+	if err := s.CreateNIC(2, ep); err != nil {
+		t.Fatalf("CreateNIC failed: %s", err)
+	}
+	nic1ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic1Addr}
+	if err := s.AddProtocolAddress(1, nic1ProtoAddr); err != nil {
+		t.Fatalf("AddProtocolAddress(1, %v) failed: %v", nic1ProtoAddr, err)
+	}
+
+	nic2ProtoAddr := tcpip.ProtocolAddress{fakeNetNumber, nic2Addr}
+	if err := s.AddProtocolAddress(2, nic2ProtoAddr); err != nil {
+		t.Fatalf("AddProtocolAddress(2, %v) failed: %v", nic2ProtoAddr, err)
+	}
+
+	// Set the initial route table.
+	rt := []tcpip.Route{
+		{Destination: nic1Addr.Subnet(), NIC: 1},
+		{Destination: nic2Addr.Subnet(), NIC: 2},
+		{Destination: defaultAddr.Subnet(), Gateway: nic2Gateway, NIC: 2},
+		{Destination: defaultAddr.Subnet(), Gateway: nic1Gateway, NIC: 1},
+	}
+	s.SetRouteTable(rt)
+
+	// When an interface is given, the route for a broadcast goes through it.
+	r, err := s.FindRoute(1, nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("FindRoute(1, %v, %v, %d) failed: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
+	}
+	if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
+		t.Errorf("FindRoute(1, %v, %v, %d) returned unexpected Route: %v", nic1Addr.Address, header.IPv4Broadcast, fakeNetNumber, err)
+	}
+
+	// When an interface is not given, it consults the route table.
+	// 1. Case: Using the default route.
+	r, err = s.FindRoute(0, "", header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("FindRoute(0, \"\", %s, %d) failed: %s", header.IPv4Broadcast, fakeNetNumber, err)
+	}
+	if err := verifyRoute(r, stack.Route{LocalAddress: nic2Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
+		t.Errorf("FindRoute(0, \"\", %s, %d) returned unexpected Route: %s", header.IPv4Broadcast, fakeNetNumber, err)
+	}
+
+	// 2. Case: Having an explicit route for broadcast will select that one.
+	rt = append([]tcpip.Route{
+		{Destination: tcpip.AddressWithPrefix{header.IPv4Broadcast, 8 * header.IPv4AddressSize}.Subnet(), NIC: 1},
+	}, rt...)
+	s.SetRouteTable(rt)
+	r, err = s.FindRoute(0, "", header.IPv4Broadcast, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		t.Fatalf("FindRoute(0, \"\", %s, %d) failed: %s", header.IPv4Broadcast, fakeNetNumber, err)
+	}
+	if err := verifyRoute(r, stack.Route{LocalAddress: nic1Addr.Address, RemoteAddress: header.IPv4Broadcast}); err != nil {
+		t.Errorf("FindRoute(0, \"\", %s, %d) returned unexpected Route: %s", header.IPv4Broadcast, fakeNetNumber, err)
+	}
+}
+
+func TestMulticastOrIPv6LinkLocalNeedsNoRoute(t *testing.T) {
+	for _, tc := range []struct {
+		name        string
+		routeNeeded bool
+		address     tcpip.Address
+	}{
+		// IPv4 multicast address range: 224.0.0.0 - 239.255.255.255
+		//                <=>  0xe0.0x00.0x00.0x00 - 0xef.0xff.0xff.0xff
+		{"IPv4 Multicast 1", false, "\xe0\x00\x00\x00"},
+		{"IPv4 Multicast 2", false, "\xef\xff\xff\xff"},
+		{"IPv4 Unicast 1", true, "\xdf\xff\xff\xff"},
+		{"IPv4 Unicast 2", true, "\xf0\x00\x00\x00"},
+		{"IPv4 Unicast 3", true, "\x00\x00\x00\x00"},
+
+		// IPv6 multicast address is 0xff[8] + flags[4] + scope[4] + groupId[112]
+		{"IPv6 Multicast 1", false, "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Multicast 2", false, "\xff\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Multicast 3", false, "\xff\x0f\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"},
+
+		// IPv6 link-local address starts with fe80::/10.
+		{"IPv6 Unicast Link-Local 1", false, "\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Unicast Link-Local 2", false, "\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"},
+		{"IPv6 Unicast Link-Local 3", false, "\xfe\x80\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff"},
+		{"IPv6 Unicast Link-Local 4", false, "\xfe\xbf\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Unicast Link-Local 5", false, "\xfe\xbf\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"},
+
+		// IPv6 addresses that are neither multicast nor link-local.
+		{"IPv6 Unicast Not Link-Local 1", true, "\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Unicast Not Link-Local 2", true, "\xf0\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff"},
+		{"IPv6 Unicast Not Link-local 3", true, "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Unicast Not Link-Local 4", true, "\xfe\xc0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Unicast Not Link-Local 5", true, "\xfe\xdf\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Unicast Not Link-Local 6", true, "\xfd\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+		{"IPv6 Unicast Not Link-Local 7", true, "\xf0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+
+			ep := channel.New(10, defaultMTU, "")
+			if err := s.CreateNIC(1, ep); err != nil {
+				t.Fatal("CreateNIC failed:", err)
+			}
+
+			s.SetRouteTable([]tcpip.Route{})
+
+			var anyAddr tcpip.Address
+			if len(tc.address) == header.IPv4AddressSize {
+				anyAddr = header.IPv4Any
+			} else {
+				anyAddr = header.IPv6Any
+			}
+
+			want := tcpip.ErrNetworkUnreachable
+			if tc.routeNeeded {
+				want = tcpip.ErrNoRoute
+			}
+
+			// If there is no endpoint, it won't work.
+			if _, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); err != want {
+				t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", anyAddr, tc.address, fakeNetNumber, err, want)
+			}
+
+			if err := s.AddAddress(1, fakeNetNumber, anyAddr); err != nil {
+				t.Fatalf("AddAddress(%v, %v) failed: %v", fakeNetNumber, anyAddr, err)
+			}
+
+			if r, err := s.FindRoute(1, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); tc.routeNeeded {
+				// Route table is empty but we need a route, this should cause an error.
+				if err != tcpip.ErrNoRoute {
+					t.Fatalf("got FindRoute(1, %v, %v, %v) = %v, want = %v", anyAddr, tc.address, fakeNetNumber, err, tcpip.ErrNoRoute)
+				}
+			} else {
+				if err != nil {
+					t.Fatalf("FindRoute(1, %v, %v, %v) failed: %v", anyAddr, tc.address, fakeNetNumber, err)
+				}
+				if r.LocalAddress != anyAddr {
+					t.Errorf("Bad local address: got %v, want = %v", r.LocalAddress, anyAddr)
+				}
+				if r.RemoteAddress != tc.address {
+					t.Errorf("Bad remote address: got %v, want = %v", r.RemoteAddress, tc.address)
+				}
+			}
+			// If the NIC doesn't exist, it won't work.
+			if _, err := s.FindRoute(2, anyAddr, tc.address, fakeNetNumber, false /* multicastLoop */); err != want {
+				t.Fatalf("got FindRoute(2, %v, %v, %v) = %v want = %v", anyAddr, tc.address, fakeNetNumber, err, want)
+			}
+		})
+	}
+}
+
+// TestAddressRangeAcceptsMatchingPacket adds a range of addresses, then checks
+// that a packet destined to an address inside the range is delivered.
+func TestAddressRangeAcceptsMatchingPacket(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+	buf := buffer.NewView(30)
+
+	const localAddrByte byte = 0x01
+	buf[dstAddrOffset] = localAddrByte
+	subnet, err := tcpip.NewSubnet(tcpip.Address("\x00"), tcpip.AddressMask("\xF0"))
+	if err != nil {
+		t.Fatal("NewSubnet failed:", err)
+	}
+	if err := s.AddAddressRange(1, fakeNetNumber, subnet); err != nil {
+		t.Fatal("AddAddressRange failed:", err)
+	}
+
+	testRecv(t, fakeNet, localAddrByte, ep, buf)
+}
+
+func testNicForAddressRange(t *testing.T, nicID tcpip.NICID, s *stack.Stack, subnet tcpip.Subnet, rangeExists bool) {
+	t.Helper()
+
+	// Loop over all addresses and check them.
+	numOfAddresses := 1 << uint(8-subnet.Prefix())
+	if numOfAddresses < 1 || numOfAddresses > 255 {
+		t.Fatalf("got numOfAddresses = %d, want = [1 .. 255] (subnet=%s)", numOfAddresses, subnet)
+	}
+
+	addrBytes := []byte(subnet.ID())
+	for i := 0; i < numOfAddresses; i++ {
+		addr := tcpip.Address(addrBytes)
+		wantNicID := nicID
+		// The subnet and broadcast addresses are skipped.
+		if !rangeExists || addr == subnet.ID() || addr == subnet.Broadcast() {
+			wantNicID = 0
+		}
+		if gotNicID := s.CheckLocalAddress(0, fakeNetNumber, addr); gotNicID != wantNicID {
+			t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = %d", fakeNetNumber, addr, gotNicID, wantNicID)
+		}
+		addrBytes[0]++
+	}
+
+	// Trying the next address should always fail since it is outside the range.
+	if gotNicID := s.CheckLocalAddress(0, fakeNetNumber, tcpip.Address(addrBytes)); gotNicID != 0 {
+		t.Errorf("got CheckLocalAddress(0, %d, %s) = %d, want = %d", fakeNetNumber, tcpip.Address(addrBytes), gotNicID, 0)
+	}
+}
+
+// TestCheckLocalAddressForSubnet installs an address range on a NIC and removes
+// it again, checking before, in between and afterwards that CheckLocalAddress
+// returns the NIC for addresses of the range only while the range exists.
+func TestCheckLocalAddressForSubnet(t *testing.T) {
+	const nicID tcpip.NICID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	if err := s.CreateNIC(nicID, channel.New(10, defaultMTU, "")); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	// Route everything through the NIC.
+	everything, err := tcpip.NewSubnet("\x00", "\x00")
+	if err != nil {
+		t.Fatal(err)
+	}
+	s.SetRouteTable([]tcpip.Route{{Destination: everything, Gateway: "\x00", NIC: nicID}})
+
+	addrRange, err := tcpip.NewSubnet(tcpip.Address("\xa0"), tcpip.AddressMask("\xf0"))
+	if err != nil {
+		t.Fatal("NewSubnet failed:", err)
+	}
+
+	// Nothing has been added yet.
+	testNicForAddressRange(t, nicID, s, addrRange, false /* rangeExists */)
+
+	// Add the range and probe again.
+	if err := s.AddAddressRange(nicID, fakeNetNumber, addrRange); err != nil {
+		t.Fatal("AddAddressRange failed:", err)
+	}
+	testNicForAddressRange(t, nicID, s, addrRange, true /* rangeExists */)
+
+	// Remove the range and probe once more.
+	if err := s.RemoveAddressRange(nicID, addrRange); err != nil {
+		t.Fatal("RemoveAddressRange failed:", err)
+	}
+	testNicForAddressRange(t, nicID, s, addrRange, false /* rangeExists */)
+}
+
+// TestAddressRangeRejectsNonmatchingPacket adds a range of addresses, then
+// checks that a packet destined outside of that range is not delivered.
+func TestAddressRangeRejectsNonmatchingPacket(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+
+	buf := buffer.NewView(30)
+
+	const localAddrByte byte = 0x01
+	buf[dstAddrOffset] = localAddrByte
+	subnet, err := tcpip.NewSubnet(tcpip.Address("\x10"), tcpip.AddressMask("\xF0"))
+	if err != nil {
+		t.Fatal("NewSubnet failed:", err)
+	}
+	if err := s.AddAddressRange(1, fakeNetNumber, subnet); err != nil {
+		t.Fatal("AddAddressRange failed:", err)
+	}
+	testFailingRecv(t, fakeNet, localAddrByte, ep, buf)
+}
+
+// TestNetworkOptions exercises Stack.SetNetworkProtocolOption and
+// Stack.NetworkProtocolOption with the fake network protocol: an unknown
+// protocol number, an option that is accepted (and can be read back), an
+// option that is expected to yield ErrUnknownProtocolOption and an option that
+// is expected to yield ErrInvalidOptionValue.
+func TestNetworkOptions(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
+		TransportProtocols: []stack.TransportProtocol{},
+	})
+
+	// Try an unsupported network protocol.
+	if err := s.SetNetworkProtocolOption(tcpip.NetworkProtocolNumber(99999), fakeNetGoodOption(false)); err != tcpip.ErrUnknownProtocol {
+		t.Fatalf("SetNetworkProtocolOption(fakeNet2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err)
+	}
+
+	testCases := []struct {
+		// option is the value handed to SetNetworkProtocolOption.
+		option interface{}
+		// wantErr is the error SetNetworkProtocolOption must return.
+		wantErr *tcpip.Error
+		// verifier, when non-nil, inspects the protocol after the option
+		// has been set.
+		verifier func(t *testing.T, p stack.NetworkProtocol)
+	}{
+		{fakeNetGoodOption(true), nil, func(t *testing.T, p stack.NetworkProtocol) {
+			t.Helper()
+			fakeNet := p.(*fakeNetworkProtocol)
+			if !fakeNet.opts.good {
+				t.Fatalf("fakeNet.opts.good = false, want = true")
+			}
+			var v fakeNetGoodOption
+			if err := s.NetworkProtocolOption(fakeNetNumber, &v); err != nil {
+				// The error goes with the first verb and the option
+				// value with %T.
+				t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) = %v, want = nil, where v is option %T", err, v)
+			}
+			if !v {
+				t.Fatalf("s.NetworkProtocolOption(fakeNetNumber, &v) returned v = %v, want = true", v)
+			}
+		}},
+		{fakeNetBadOption(true), tcpip.ErrUnknownProtocolOption, nil},
+		{fakeNetInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil},
+	}
+	for _, tc := range testCases {
+		if got := s.SetNetworkProtocolOption(fakeNetNumber, tc.option); got != tc.wantErr {
+			t.Errorf("s.SetNetworkProtocolOption(fakeNet, %v) = %v, want = %v", tc.option, got, tc.wantErr)
+		}
+		if tc.verifier != nil {
+			tc.verifier(t, s.NetworkProtocolInstance(fakeNetNumber))
+		}
+	}
+}
+
+// stackContainsAddressRange reports whether addrRange is one of the address
+// ranges that s.NICAddressRanges() lists for the NIC id. It returns false when
+// the NIC has no entry at all.
+func stackContainsAddressRange(s *stack.Stack, id tcpip.NICID, addrRange tcpip.Subnet) bool {
+	// A missing key yields a nil slice, so the loop below is simply skipped.
+	nicRanges := s.NICAddressRanges()[id]
+	for i := range nicRanges {
+		if nicRanges[i] == addrRange {
+			return true
+		}
+	}
+	return false
+}
+
+func TestAddresRangeAddRemove(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	addr := tcpip.Address("\x01\x01\x01\x01")
+	mask := tcpip.AddressMask(strings.Repeat("\xff", len(addr)))
+	addrRange, err := tcpip.NewSubnet(addr, mask)
+	if err != nil {
+		t.Fatal("NewSubnet failed:", err)
+	}
+
+	if got, want := stackContainsAddressRange(s, 1, addrRange), false; got != want {
+		t.Fatalf("got stackContainsAddressRange(...) = %t, want = %t", got, want)
+	}
+
+	if err := s.AddAddressRange(1, fakeNetNumber, addrRange); err != nil {
+		t.Fatal("AddAddressRange failed:", err)
+	}
+
+	if got, want := stackContainsAddressRange(s, 1, addrRange), true; got != want {
+		t.Fatalf("got stackContainsAddressRange(...) = %t, want = %t", got, want)
+	}
+
+	if err := s.RemoveAddressRange(1, addrRange); err != nil {
+		t.Fatal("RemoveAddressRange failed:", err)
+	}
+
+	if got, want := stackContainsAddressRange(s, 1, addrRange), false; got != want {
+		t.Fatalf("got stackContainsAddressRange(...) = %t, want = %t", got, want)
+	}
+}
+
+// TestGetMainNICAddressAddPrimaryNonPrimary checks GetMainNICAddress for every
+// combination of 0-2 CanBePrimaryEndpoint addresses and 0-2
+// NeverPrimaryEndpoint addresses, using 4-byte and 16-byte addresses. With no
+// CanBePrimaryEndpoint address the zero AddressWithPrefix is expected;
+// otherwise the result must be one of the CanBePrimaryEndpoint addresses that
+// were added.
+func TestGetMainNICAddressAddPrimaryNonPrimary(t *testing.T) {
+	const nicID = 1
+
+	// run builds a fresh stack, adds canBe primary-capable addresses
+	// followed by never never-primary ones, then validates
+	// GetMainNICAddress. Address i consists of addrLen copies of byte(i).
+	run := func(t *testing.T, addrLen, canBe, never int) {
+		s := stack.New(stack.Options{
+			NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+		})
+		if err := s.CreateNIC(nicID, channel.New(10, defaultMTU, "")); err != nil {
+			t.Fatal("CreateNIC failed:", err)
+		}
+
+		// Primary-capable addresses are added together with a prefix
+		// length and remembered for the final check.
+		primaries := make(map[tcpip.AddressWithPrefix]struct{})
+		for i := 0; i < canBe; i++ {
+			addrWithPrefix := tcpip.AddressWithPrefix{
+				Address:   tcpip.Address(bytes.Repeat([]byte{byte(i)}, addrLen)),
+				PrefixLen: addrLen * 8,
+			}
+			protocolAddr := tcpip.ProtocolAddress{
+				Protocol:          fakeNetNumber,
+				AddressWithPrefix: addrWithPrefix,
+			}
+			if err := s.AddProtocolAddressWithOptions(nicID, protocolAddr, stack.CanBePrimaryEndpoint); err != nil {
+				t.Fatal("AddProtocolAddressWithOptions failed:", err)
+			}
+			primaries[addrWithPrefix] = struct{}{}
+		}
+
+		// Never-primary addresses continue the same numbering.
+		for i := canBe; i < canBe+never; i++ {
+			address := tcpip.Address(bytes.Repeat([]byte{byte(i)}, addrLen))
+			if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address, stack.NeverPrimaryEndpoint); err != nil {
+				t.Fatal("AddAddressWithOptions failed:", err)
+			}
+		}
+
+		gotAddr, err := s.GetMainNICAddress(nicID, fakeNetNumber)
+		if err != nil {
+			t.Fatal("GetMainNICAddress failed:", err)
+		}
+
+		if len(primaries) == 0 {
+			// No primary addresses present.
+			if wantAddr := (tcpip.AddressWithPrefix{}); gotAddr != wantAddr {
+				t.Fatalf("GetMainNICAddress: got addr = %s, want = %s", gotAddr, wantAddr)
+			}
+			return
+		}
+
+		// At least one primary address was added: the result must be one
+		// of them, prefix length included.
+		if _, ok := primaries[gotAddr]; !ok {
+			t.Fatalf("GetMainNICAddress: got = %s, want any in {%v}", gotAddr, primaries)
+		}
+	}
+
+	for _, addrLen := range []int{4, 16} {
+		t.Run(fmt.Sprintf("addrLen=%d", addrLen), func(t *testing.T) {
+			for canBe := 0; canBe < 3; canBe++ {
+				t.Run(fmt.Sprintf("canBe=%d", canBe), func(t *testing.T) {
+					for never := 0; never < 3; never++ {
+						t.Run(fmt.Sprintf("never=%d", never), func(t *testing.T) {
+							run(t, addrLen, canBe, never)
+						})
+					}
+				})
+			}
+		})
+	}
+}
+
+func TestGetMainNICAddressAddRemove(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	for _, tc := range []struct {
+		name      string
+		address   tcpip.Address
+		prefixLen int
+	}{
+		{"IPv4", "\x01\x01\x01\x01", 24},
+		{"IPv6", "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01", 116},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			protocolAddress := tcpip.ProtocolAddress{
+				Protocol: fakeNetNumber,
+				AddressWithPrefix: tcpip.AddressWithPrefix{
+					Address:   tc.address,
+					PrefixLen: tc.prefixLen,
+				},
+			}
+			if err := s.AddProtocolAddress(1, protocolAddress); err != nil {
+				t.Fatal("AddProtocolAddress failed:", err)
+			}
+
+			// Check that we get the right initial address and prefix length.
+			gotAddr, err := s.GetMainNICAddress(1, fakeNetNumber)
+			if err != nil {
+				t.Fatal("GetMainNICAddress failed:", err)
+			}
+			if wantAddr := protocolAddress.AddressWithPrefix; gotAddr != wantAddr {
+				t.Fatalf("got s.GetMainNICAddress(...) = %s, want = %s", gotAddr, wantAddr)
+			}
+
+			if err := s.RemoveAddress(1, protocolAddress.AddressWithPrefix.Address); err != nil {
+				t.Fatal("RemoveAddress failed:", err)
+			}
+
+			// Check that we get no address after removal.
+			gotAddr, err = s.GetMainNICAddress(1, fakeNetNumber)
+			if err != nil {
+				t.Fatal("GetMainNICAddress failed:", err)
+			}
+			if wantAddr := (tcpip.AddressWithPrefix{}); gotAddr != wantAddr {
+				t.Fatalf("got GetMainNICAddress(...) = %s, want = %s", gotAddr, wantAddr)
+			}
+		})
+	}
+}
+
+// addressGenerator is a simple network address generator driven by a one-byte
+// counter. Good for 255 addresses.
+type addressGenerator struct{ cnt byte }
+
+// next increments the counter and returns an address consisting of addrLen
+// copies of the new counter value.
+func (g *addressGenerator) next(addrLen int) tcpip.Address {
+	g.cnt++
+	raw := bytes.Repeat([]byte{g.cnt}, addrLen)
+	return tcpip.Address(raw)
+}
+
+// verifyAddresses aborts the test if the two slices differ in length.
+// Otherwise it sorts both slices in place by address and reports an error for
+// every position at which the sorted slices disagree.
+func verifyAddresses(t *testing.T, expectedAddresses, gotAddresses []tcpip.ProtocolAddress) {
+	t.Helper()
+
+	if gotLen, wantLen := len(gotAddresses), len(expectedAddresses); gotLen != wantLen {
+		t.Fatalf("got len(addresses) = %d, want = %d", gotLen, wantLen)
+	}
+
+	// Bring both slices into the same order so they can be compared
+	// pairwise. This reorders the slices passed in by the caller.
+	for _, addrs := range [][]tcpip.ProtocolAddress{gotAddresses, expectedAddresses} {
+		sort.Slice(addrs, func(i, j int) bool {
+			return addrs[i].AddressWithPrefix.Address < addrs[j].AddressWithPrefix.Address
+		})
+	}
+
+	for i := range gotAddresses {
+		if got, want := gotAddresses[i], expectedAddresses[i]; got != want {
+			t.Errorf("got address = %+v, wanted = %+v", got, want)
+		}
+	}
+}
+
+// TestAddAddress adds one 4-byte and one 16-byte address with AddAddress and
+// checks that AllAddresses lists both for the NIC, each paired with
+// fakeDefaultPrefixLen.
+func TestAddAddress(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	if err := s.CreateNIC(nicID, channel.New(10, defaultMTU, "")); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	addrLens := []int{4, 16}
+	var addrGen addressGenerator
+	expectedAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLens))
+	for _, addrLen := range addrLens {
+		address := addrGen.next(addrLen)
+		if err := s.AddAddress(nicID, fakeNetNumber, address); err != nil {
+			t.Fatalf("AddAddress(address=%s) failed: %s", address, err)
+		}
+		expectedAddresses = append(expectedAddresses, tcpip.ProtocolAddress{
+			Protocol: fakeNetNumber,
+			AddressWithPrefix: tcpip.AddressWithPrefix{
+				Address:   address,
+				PrefixLen: fakeDefaultPrefixLen,
+			},
+		})
+	}
+
+	verifyAddresses(t, expectedAddresses, s.AllAddresses()[nicID])
+}
+
+// TestAddProtocolAddress adds addresses of 4 and 16 bytes with several prefix
+// lengths via AddProtocolAddress and checks that AllAddresses lists exactly
+// those address/prefix pairs for the NIC.
+func TestAddProtocolAddress(t *testing.T) {
+	const nicID = 1
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	ep := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(nicID, ep); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	var addrGen addressGenerator
+	addrLenRange := []int{4, 16}
+	prefixLenRange := []int{8, 13, 20, 32}
+	expectedAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLenRange)*len(prefixLenRange))
+	for _, addrLen := range addrLenRange {
+		for _, prefixLen := range prefixLenRange {
+			protocolAddress := tcpip.ProtocolAddress{
+				Protocol: fakeNetNumber,
+				AddressWithPrefix: tcpip.AddressWithPrefix{
+					Address:   addrGen.next(addrLen),
+					PrefixLen: prefixLen,
+				},
+			}
+			// A failed add is fatal, as in the sibling Add* tests:
+			// continuing would still record the address as expected
+			// and only produce a misleading mismatch further down.
+			if err := s.AddProtocolAddress(nicID, protocolAddress); err != nil {
+				t.Fatalf("AddProtocolAddress(%+v) failed: %s", protocolAddress, err)
+			}
+			expectedAddresses = append(expectedAddresses, protocolAddress)
+		}
+	}
+
+	gotAddresses := s.AllAddresses()[nicID]
+	verifyAddresses(t, expectedAddresses, gotAddresses)
+}
+
+// TestAddAddressWithOptions adds 4-byte and 16-byte addresses with each
+// PrimaryEndpointBehavior via AddAddressWithOptions and checks that
+// AllAddresses lists all of them for the NIC, each paired with
+// fakeDefaultPrefixLen.
+func TestAddAddressWithOptions(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	if err := s.CreateNIC(nicID, channel.New(10, defaultMTU, "")); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	addrLens := []int{4, 16}
+	behaviors := []stack.PrimaryEndpointBehavior{
+		stack.CanBePrimaryEndpoint,
+		stack.FirstPrimaryEndpoint,
+		stack.NeverPrimaryEndpoint,
+	}
+
+	var addrGen addressGenerator
+	expectedAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLens)*len(behaviors))
+	for _, addrLen := range addrLens {
+		for _, peb := range behaviors {
+			address := addrGen.next(addrLen)
+			if err := s.AddAddressWithOptions(nicID, fakeNetNumber, address, peb); err != nil {
+				t.Fatalf("AddAddressWithOptions(address=%s, behavior=%d) failed: %s", address, peb, err)
+			}
+			expectedAddresses = append(expectedAddresses, tcpip.ProtocolAddress{
+				Protocol: fakeNetNumber,
+				AddressWithPrefix: tcpip.AddressWithPrefix{
+					Address:   address,
+					PrefixLen: fakeDefaultPrefixLen,
+				},
+			})
+		}
+	}
+
+	verifyAddresses(t, expectedAddresses, s.AllAddresses()[nicID])
+}
+
+// TestAddProtocolAddressWithOptions adds addresses for every combination of
+// address length, prefix length and PrimaryEndpointBehavior via
+// AddProtocolAddressWithOptions and checks that AllAddresses lists exactly
+// those address/prefix pairs for the NIC.
+func TestAddProtocolAddressWithOptions(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	if err := s.CreateNIC(nicID, channel.New(10, defaultMTU, "")); err != nil {
+		t.Fatal("CreateNIC failed:", err)
+	}
+
+	addrLens := []int{4, 16}
+	prefixLens := []int{8, 13, 20, 32}
+	behaviors := []stack.PrimaryEndpointBehavior{
+		stack.CanBePrimaryEndpoint,
+		stack.FirstPrimaryEndpoint,
+		stack.NeverPrimaryEndpoint,
+	}
+
+	var addrGen addressGenerator
+	expectedAddresses := make([]tcpip.ProtocolAddress, 0, len(addrLens)*len(prefixLens)*len(behaviors))
+	for _, addrLen := range addrLens {
+		for _, prefixLen := range prefixLens {
+			for _, peb := range behaviors {
+				addrWithPrefix := tcpip.AddressWithPrefix{
+					Address:   addrGen.next(addrLen),
+					PrefixLen: prefixLen,
+				}
+				protocolAddr := tcpip.ProtocolAddress{
+					Protocol:          fakeNetNumber,
+					AddressWithPrefix: addrWithPrefix,
+				}
+				if err := s.AddProtocolAddressWithOptions(nicID, protocolAddr, peb); err != nil {
+					t.Fatalf("AddProtocolAddressWithOptions(%+v, %d) failed: %s", protocolAddr, peb, err)
+				}
+				expectedAddresses = append(expectedAddresses, protocolAddr)
+			}
+		}
+	}
+
+	verifyAddresses(t, expectedAddresses, s.AllAddresses()[nicID])
+}
+
+// TestCreateNICWithOptions runs sequences of CreateNICWithOptions calls
+// against a fresh stack and compares the error of each call with the expected
+// one. The cases cover duplicate NIC IDs, duplicate names and unnamed NICs.
+func TestCreateNICWithOptions(t *testing.T) {
+	// createCall is one CreateNICWithOptions invocation together with the
+	// error it is expected to return (nil when left unset).
+	type createCall struct {
+		nicID   tcpip.NICID
+		opts    stack.NICOptions
+		wantErr *tcpip.Error
+	}
+
+	tests := []struct {
+		desc  string
+		calls []createCall
+	}{
+		{
+			desc: "DuplicateNICID",
+			calls: []createCall{
+				{nicID: 1, opts: stack.NICOptions{Name: "eth1"}},
+				{nicID: 1, opts: stack.NICOptions{Name: "eth2"}, wantErr: tcpip.ErrDuplicateNICID},
+			},
+		},
+		{
+			desc: "DuplicateName",
+			calls: []createCall{
+				{nicID: 1, opts: stack.NICOptions{Name: "lo"}},
+				{nicID: 2, opts: stack.NICOptions{Name: "lo"}, wantErr: tcpip.ErrDuplicateNICID},
+			},
+		},
+		{
+			desc: "Unnamed",
+			calls: []createCall{
+				{nicID: 1},
+				{nicID: 2},
+			},
+		},
+		{
+			desc: "UnnamedDuplicateNICID",
+			calls: []createCall{
+				{nicID: 1},
+				{nicID: 1, wantErr: tcpip.ErrDuplicateNICID},
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.desc, func(t *testing.T) {
+			s := stack.New(stack.Options{})
+			// The same link endpoint is handed to every call of a case.
+			ep := channel.New(0, 0, tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"))
+			for _, call := range test.calls {
+				got := s.CreateNICWithOptions(call.nicID, ep, call.opts)
+				if got != call.wantErr {
+					t.Fatalf("CreateNICWithOptions(%v, _, %+v) = %v, want %v", call.nicID, call.opts, got, call.wantErr)
+				}
+			}
+		})
+	}
+}
+
+func TestNICStats(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+	})
+	ep1 := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatal("CreateNIC failed: ", err)
+	}
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatal("AddAddress failed:", err)
+	}
+	// Route all packets for address \x01 to NIC 1.
+	{
+		subnet, err := tcpip.NewSubnet("\x01", "\xff")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	// Send a packet to address 1.
+	buf := buffer.NewView(30)
+	ep1.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if got, want := s.NICInfo()[1].Stats.Rx.Packets.Value(), uint64(1); got != want {
+		t.Errorf("got Rx.Packets.Value() = %d, want = %d", got, want)
+	}
+
+	if got, want := s.NICInfo()[1].Stats.Rx.Bytes.Value(), uint64(len(buf)); got != want {
+		t.Errorf("got Rx.Bytes.Value() = %d, want = %d", got, want)
+	}
+
+	payload := buffer.NewView(10)
+	// Write a packet out via the address for NIC 1
+	if err := sendTo(s, "\x01", payload); err != nil {
+		t.Fatal("sendTo failed: ", err)
+	}
+	want := uint64(ep1.Drain())
+	if got := s.NICInfo()[1].Stats.Tx.Packets.Value(); got != want {
+		t.Errorf("got Tx.Packets.Value() = %d, ep1.Drain() = %d", got, want)
+	}
+
+	if got, want := s.NICInfo()[1].Stats.Tx.Bytes.Value(), uint64(len(payload)); got != want {
+		t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+	}
+}
+
+// TestNICForwarding enables forwarding on a stack with two NICs, injects a
+// packet on the first NIC whose destination is routed via the second one, and
+// checks that the packet can be read from the second link endpoint, that the
+// available header length of the forwarded packet equals the header length
+// configured on that endpoint, and that the second NIC's Tx counters account
+// for the packet.
+func TestNICForwarding(t *testing.T) {
+	const (
+		nicID1  = 1
+		nicID2  = 2
+		dstAddr = tcpip.Address("\x03")
+	)
+
+	tests := []struct {
+		name      string
+		headerLen uint16
+	}{
+		{name: "Zero header length"},
+		{name: "Non-zero header length", headerLen: 16},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+			})
+			s.SetForwarding(true)
+
+			// Incoming side.
+			inLink := channel.New(10, defaultMTU, "")
+			if err := s.CreateNIC(nicID1, inLink); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID1, err)
+			}
+			if err := s.AddAddress(nicID1, fakeNetNumber, "\x01"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x01): %s", nicID1, fakeNetNumber, err)
+			}
+
+			// Outgoing side, wrapped to carry the header length of
+			// this test case.
+			outLink := &channelLinkWithHeaderLength{
+				Endpoint:     channel.New(10, defaultMTU, ""),
+				headerLength: test.headerLen,
+			}
+			if err := s.CreateNIC(nicID2, outLink); err != nil {
+				t.Fatalf("CreateNIC(%d, _): %s", nicID2, err)
+			}
+			if err := s.AddAddress(nicID2, fakeNetNumber, "\x02"); err != nil {
+				t.Fatalf("AddAddress(%d, %d, 0x02): %s", nicID2, fakeNetNumber, err)
+			}
+
+			// Route all packets to dstAddr to NIC 2.
+			dstSubnet, err := tcpip.NewSubnet(dstAddr, "\xff")
+			if err != nil {
+				t.Fatal(err)
+			}
+			s.SetRouteTable([]tcpip.Route{{Destination: dstSubnet, Gateway: "\x00", NIC: nicID2}})
+
+			// Send a packet to dstAddr.
+			payload := buffer.NewView(30)
+			payload[dstAddrOffset] = dstAddr[0]
+			inLink.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+				Data: payload.ToVectorisedView(),
+			})
+
+			forwarded, ok := outLink.Read()
+			if !ok {
+				t.Fatal("packet not forwarded")
+			}
+
+			// Test that the link's MaxHeaderLength is honoured.
+			if capacity, want := forwarded.Pkt.Header.AvailableLength(), int(test.headerLen); capacity != want {
+				t.Errorf("got Header.AvailableLength() = %d, want = %d", capacity, want)
+			}
+
+			// Test that forwarding increments Tx stats correctly.
+			txStats := s.NICInfo()[nicID2].Stats.Tx
+			if got, want := txStats.Packets.Value(), uint64(1); got != want {
+				t.Errorf("got Tx.Packets.Value() = %d, want = %d", got, want)
+			}
+			if got, want := txStats.Bytes.Value(), uint64(len(payload)); got != want {
+				t.Errorf("got Tx.Bytes.Value() = %d, want = %d", got, want)
+			}
+		})
+	}
+}
+
+// TestNICContextPreservation tests that the value passed as
+// NICOptions.Context to stack.CreateNICWithOptions can be read back unchanged
+// from the NIC's entry in stack.NICInfo, and that the context stays nil when
+// none was passed.
+func TestNICContextPreservation(t *testing.T) {
+	const id = tcpip.NICID(1)
+
+	// The context under test is a nil *int.
+	var ctx *int
+
+	tests := []struct {
+		name string
+		opts stack.NICOptions
+		want stack.NICContext
+	}{
+		{
+			name: "context_set",
+			opts: stack.NICOptions{Context: ctx},
+			want: ctx,
+		},
+		{
+			name: "context_not_set",
+			opts: stack.NICOptions{},
+			want: nil,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{})
+			ep := channel.New(0, 0, tcpip.LinkAddress("\x00\x00\x00\x00\x00\x00"))
+			if err := s.CreateNICWithOptions(id, ep, test.opts); err != nil {
+				t.Fatalf("got stack.CreateNICWithOptions(%d, %+v, %+v) = %s, want nil", id, ep, test.opts, err)
+			}
+
+			allInfos := s.NICInfo()
+			info, found := allInfos[id]
+			if !found {
+				t.Fatalf("got nicinfos[%d] = _, %t, want _, true; nicinfos = %+v", id, found, allInfos)
+			}
+
+			same := info.Context == test.want
+			if !same {
+				t.Fatalf("got nicinfo.Context == ctx = %t, want %t; nicinfo.Context = %p, ctx = %p", same, true, info.Context, test.want)
+			}
+		})
+	}
+}
+
+// TestNICAutoGenLinkLocalAddr tests the auto-generation of IPv6 link-local
+// addresses.
+//
+// Every case creates a disabled NIC, checks that it has no addresses, enables
+// it and then checks both the auto-generated-address event delivered to the
+// NDP dispatcher and the result of GetMainNICAddress. The cases cover
+// auto-generation being disabled, addresses derived from the link address
+// (EUI64) and addresses derived from opaque interface identifiers (OIID).
+func TestNICAutoGenLinkLocalAddr(t *testing.T) {
+	const nicID = 1
+
+	var secretKey [header.OpaqueIIDSecretKeyMinBytes]byte
+	n, err := rand.Read(secretKey[:])
+	if err != nil {
+		t.Fatalf("rand.Read(_): %s", err)
+	}
+	if n != header.OpaqueIIDSecretKeyMinBytes {
+		t.Fatalf("expected rand.Read to read %d bytes, read %d bytes", header.OpaqueIIDSecretKeyMinBytes, n)
+	}
+
+	// nicNameFunc returns the NIC's name unchanged, ignoring its ID.
+	nicNameFunc := func(_ tcpip.NICID, name string) string {
+		return name
+	}
+
+	tests := []struct {
+		name     string
+		nicName  string
+		autoGen  bool
+		linkAddr tcpip.LinkAddress
+		iidOpts  stack.OpaqueInterfaceIdentifierOptions
+		// shouldGen says whether an address is expected to be
+		// generated; expectedAddr is only meaningful when it is true.
+		shouldGen    bool
+		expectedAddr tcpip.Address
+	}{
+		{
+			name:      "Disabled",
+			nicName:   "nic1",
+			autoGen:   false,
+			linkAddr:  linkAddr1,
+			shouldGen: false,
+		},
+		{
+			name:     "Disabled without OIID options",
+			nicName:  "nic1",
+			autoGen:  false,
+			linkAddr: linkAddr1,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:],
+			},
+			shouldGen: false,
+		},
+
+		// Tests for EUI64 based addresses.
+		{
+			name:         "EUI64 Enabled",
+			autoGen:      true,
+			linkAddr:     linkAddr1,
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddr(linkAddr1),
+		},
+		{
+			name:      "EUI64 Empty MAC",
+			autoGen:   true,
+			shouldGen: false,
+		},
+		{
+			name:      "EUI64 Invalid MAC",
+			autoGen:   true,
+			linkAddr:  "\x01\x02\x03",
+			shouldGen: false,
+		},
+		{
+			name:      "EUI64 Multicast MAC",
+			autoGen:   true,
+			linkAddr:  "\x01\x02\x03\x04\x05\x06",
+			shouldGen: false,
+		},
+		{
+			name:      "EUI64 Unspecified MAC",
+			autoGen:   true,
+			linkAddr:  "\x00\x00\x00\x00\x00\x00",
+			shouldGen: false,
+		},
+
+		// Tests for Opaque IID based addresses.
+		{
+			name:     "OIID Enabled",
+			nicName:  "nic1",
+			autoGen:  true,
+			linkAddr: linkAddr1,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("nic1", 0, secretKey[:]),
+		},
+		// These are all cases where we would not have generated a
+		// link-local address if opaque IIDs were disabled.
+		{
+			name:    "OIID Empty MAC and empty nicName",
+			autoGen: true,
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:1],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("", 0, secretKey[:1]),
+		},
+		{
+			name:     "OIID Invalid MAC",
+			nicName:  "test",
+			autoGen:  true,
+			linkAddr: "\x01\x02\x03",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:2],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test", 0, secretKey[:2]),
+		},
+		{
+			name:     "OIID Multicast MAC",
+			nicName:  "test2",
+			autoGen:  true,
+			linkAddr: "\x01\x02\x03\x04\x05\x06",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+				SecretKey:     secretKey[:3],
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test2", 0, secretKey[:3]),
+		},
+		{
+			name:     "OIID Unspecified MAC and nil SecretKey",
+			nicName:  "test3",
+			autoGen:  true,
+			linkAddr: "\x00\x00\x00\x00\x00\x00",
+			iidOpts: stack.OpaqueInterfaceIdentifierOptions{
+				NICNameFromID: nicNameFunc,
+			},
+			shouldGen:    true,
+			expectedAddr: header.LinkLocalAddrWithOpaqueIID("test3", 0, nil),
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			// The event channel has room for one event so that the
+			// non-blocking receives below can pick it up afterwards.
+			ndpDisp := ndpDispatcher{
+				autoGenAddrC: make(chan ndpAutoGenAddrEvent, 1),
+			}
+			opts := stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: test.autoGen,
+				NDPDisp:              &ndpDisp,
+				OpaqueIIDOpts:        test.iidOpts,
+			}
+
+			e := channel.New(0, 1280, test.linkAddr)
+			s := stack.New(opts)
+			nicOpts := stack.NICOptions{Name: test.nicName, Disabled: true}
+			if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+				// Report the NIC options that were actually passed.
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+			}
+
+			// A new disabled NIC should not have any address, even if auto generation
+			// was enabled.
+			allStackAddrs := s.AllAddresses()
+			allNICAddrs, ok := allStackAddrs[nicID]
+			if !ok {
+				t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+			}
+			if l := len(allNICAddrs); l != 0 {
+				t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+			}
+
+			// Enabling the NIC should attempt auto-generation of a link-local
+			// address.
+			if err := s.EnableNIC(nicID); err != nil {
+				t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+			}
+
+			// Stays the zero value when no address is expected.
+			var expectedMainAddr tcpip.AddressWithPrefix
+			if test.shouldGen {
+				expectedMainAddr = tcpip.AddressWithPrefix{
+					Address:   test.expectedAddr,
+					PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen,
+				}
+
+				// Should have auto-generated an address and resolved immediately (DAD
+				// is disabled).
+				select {
+				case e := <-ndpDisp.autoGenAddrC:
+					if diff := checkAutoGenAddrEvent(e, expectedMainAddr, newAddr); diff != "" {
+						t.Errorf("auto-gen addr event mismatch (-want +got):\n%s", diff)
+					}
+				default:
+					t.Fatal("expected addr auto gen event")
+				}
+			} else {
+				// Should not have auto-generated an address.
+				select {
+				case <-ndpDisp.autoGenAddrC:
+					t.Fatal("unexpectedly auto-generated an address")
+				default:
+				}
+			}
+
+			gotMainAddr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(_, _) err = %s", err)
+			}
+			if gotMainAddr != expectedMainAddr {
+				t.Fatalf("got stack.GetMainNICAddress(_, _) = %s, want = %s", gotMainAddr, expectedMainAddr)
+			}
+		})
+	}
+}
+
+// TestNoLinkLocalAutoGenForLoopbackNIC tests that no IPv6 link-local address
+// is auto-generated for a loopback NIC, both without and with opaque interface
+// identifier options, even though AutoGenIPv6LinkLocal is enabled.
+func TestNoLinkLocalAutoGenForLoopbackNIC(t *testing.T) {
+	const (
+		nicID   = 1
+		nicName = "nicName"
+	)
+
+	// identityName returns the NIC's name unchanged, ignoring its ID.
+	identityName := func(_ tcpip.NICID, nicName string) string {
+		return nicName
+	}
+
+	tests := []struct {
+		name          string
+		opaqueIIDOpts stack.OpaqueInterfaceIdentifierOptions
+	}{
+		{
+			name: "IID From MAC",
+		},
+		{
+			name:          "Opaque IID",
+			opaqueIIDOpts: stack.OpaqueInterfaceIdentifierOptions{NICNameFromID: identityName},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+				AutoGenIPv6LinkLocal: true,
+				OpaqueIIDOpts:        test.opaqueIIDOpts,
+			})
+
+			nicOpts := stack.NICOptions{Name: nicName}
+			if err := s.CreateNICWithOptions(nicID, loopback.New(), nicOpts); err != nil {
+				t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+			}
+
+			// The NIC must not have a main IPv6 address.
+			mainAddr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+			if err != nil {
+				t.Fatalf("stack.GetMainNICAddress(%d, _) err = %s", nicID, err)
+			}
+			if none := (tcpip.AddressWithPrefix{}); mainAddr != none {
+				t.Errorf("got stack.GetMainNICAddress(%d, _) = %s, want = %s", nicID, mainAddr, none)
+			}
+		})
+	}
+}
+
+// TestNICAutoGenAddrDoesDAD tests that an auto-generated IPv6 link-local
+// address is only assigned to the NIC once the DAD process has resolved:
+// GetMainNICAddress must return the zero AddressWithPrefix right after the NIC
+// is created, and the link-local address after the DAD event has been
+// received.
+func TestNICAutoGenAddrDoesDAD(t *testing.T) {
+	const nicID = 1
+
+	// dadC is unbuffered; the select below is its only receiver.
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	ndpConfigs := stack.DefaultNDPConfigurations()
+	opts := stack.Options{
+		NetworkProtocols:     []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs:           ndpConfigs,
+		AutoGenIPv6LinkLocal: true,
+		NDPDisp:              &ndpDisp,
+	}
+
+	// The first argument of channel.New is set to the number of DAD
+	// transmissions configured by default.
+	e := channel.New(int(ndpConfigs.DupAddrDetectTransmits), 1280, linkAddr1)
+	s := stack.New(opts)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+	}
+
+	// Address should not be considered bound to the
+	// NIC yet (DAD ongoing).
+	addr, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+	}
+
+	// The address that is expected to be generated from the link address.
+	linkLocalAddr := header.LinkLocalAddr(linkAddr1)
+
+	// Wait for DAD to resolve.
+	select {
+	case <-time.After(time.Duration(ndpConfigs.DupAddrDetectTransmits)*ndpConfigs.RetransmitTimer + time.Second):
+		// We should get a resolution event after 1s (default time to
+		// resolve as per default NDP configurations). Waiting for that
+		// resolution time + an extra 1s without a resolution event
+		// means something is wrong.
+		t.Fatal("timed out waiting for DAD resolution")
+	case e := <-ndpDisp.dadC:
+		if diff := checkDADEvent(e, nicID, linkLocalAddr, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	}
+
+	// With the DAD event received, the link-local address must now be the
+	// NIC's main IPv6 address.
+	addr, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{Address: linkLocalAddr, PrefixLen: header.IPv6LinkLocalPrefix.PrefixLen}); addr != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, addr, want)
+	}
+}
+
+// TestNewPEBOnPromotionToPermanent tests that a new PrimaryEndpointBehavior (peb)
+// is respected when an address is "promoted" from permanentExpired to permanent.
+func TestNewPEBOnPromotionToPermanent(t *testing.T) {
+	pebs := []stack.PrimaryEndpointBehavior{
+		stack.NeverPrimaryEndpoint,
+		stack.CanBePrimaryEndpoint,
+		stack.FirstPrimaryEndpoint,
+	}
+
+	for _, pi := range pebs { // pi: peb used when the address is first added
+		for _, ps := range pebs { // ps: peb used when the address is added back
+			t.Run(fmt.Sprintf("%d-to-%d", pi, ps), func(t *testing.T) {
+				s := stack.New(stack.Options{
+					NetworkProtocols: []stack.NetworkProtocol{fakeNetFactory()},
+				})
+				ep1 := channel.New(10, defaultMTU, "")
+				if err := s.CreateNIC(1, ep1); err != nil {
+					t.Fatal("CreateNIC failed:", err)
+				}
+
+				// Add a permanent address with initial
+				// PrimaryEndpointBehavior (peb), pi. If pi is
+				// NeverPrimaryEndpoint, the address should not
+				// be returned by a call to GetMainNICAddress;
+				// else, it should.
+				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", pi); err != nil {
+					t.Fatal("AddAddressWithOptions failed:", err)
+				}
+				addr, err := s.GetMainNICAddress(1, fakeNetNumber)
+				if err != nil {
+					t.Fatal("s.GetMainNICAddress failed:", err)
+				}
+				if pi == stack.NeverPrimaryEndpoint {
+					if want := (tcpip.AddressWithPrefix{}); addr != want {
+						t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want)
+
+					}
+				} else if addr.Address != "\x01" {
+					t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address)
+				}
+
+				{
+					subnet, err := tcpip.NewSubnet("\x00", "\x00")
+					if err != nil {
+						t.Fatalf("NewSubnet failed: %v", err)
+					}
+					s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+				}
+
+				// Take a route through the address so its ref
+				// count gets incremented and the address is not
+				// actually deleted when RemoveAddress is called
+				// below. This is because we want to test that a
+				// new peb is respected when an address gets
+				// "promoted" to permanent from a
+				// permanentExpired kind.
+				r, err := s.FindRoute(1, "\x01", "\x02", fakeNetNumber, false)
+				if err != nil {
+					t.Fatalf("FindRoute failed: %v", err)
+				}
+				defer r.Release()
+				if err := s.RemoveAddress(1, "\x01"); err != nil {
+					t.Fatalf("RemoveAddress failed: %v", err)
+				}
+
+				//
+				// At this point, the address should still be
+				// known by the NIC, but have its
+				// kind = permanentExpired.
+				//
+
+				// Add some other address with peb set to
+				// FirstPrimaryEndpoint.
+				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x03", stack.FirstPrimaryEndpoint); err != nil {
+					t.Fatalf("AddAddressWithOptions failed: %v", err)
+
+				}
+
+				// Add back the address we removed earlier and
+				// make sure the new peb was respected.
+				// (The address should just be promoted now).
+				if err := s.AddAddressWithOptions(1, fakeNetNumber, "\x01", ps); err != nil {
+					t.Fatalf("AddAddressWithOptions failed: %v", err)
+				}
+				var primaryAddrs []tcpip.Address // addresses reported by NICInfo for NIC 1, in reported order
+				for _, pa := range s.NICInfo()[1].ProtocolAddresses {
+					primaryAddrs = append(primaryAddrs, pa.AddressWithPrefix.Address)
+				}
+				var expectedList []tcpip.Address
+				switch ps {
+				case stack.FirstPrimaryEndpoint: // re-added address is expected ahead of "\x03"
+					expectedList = []tcpip.Address{
+						"\x01",
+						"\x03",
+					}
+				case stack.CanBePrimaryEndpoint: // re-added address is expected after "\x03"
+					expectedList = []tcpip.Address{
+						"\x03",
+						"\x01",
+					}
+				case stack.NeverPrimaryEndpoint: // re-added address is expected to be absent
+					expectedList = []tcpip.Address{
+						"\x03",
+					}
+				}
+				if !cmp.Equal(primaryAddrs, expectedList) {
+					t.Fatalf("got NIC's primary addresses = %v, want = %v", primaryAddrs, expectedList)
+				}
+
+				// Once we remove the other address, if the new
+				// peb, ps, was NeverPrimaryEndpoint, no address
+				// should be returned by a call to
+				// GetMainNICAddress; else, our original address
+				// should be returned.
+				if err := s.RemoveAddress(1, "\x03"); err != nil {
+					t.Fatalf("RemoveAddress failed: %v", err)
+				}
+				addr, err = s.GetMainNICAddress(1, fakeNetNumber)
+				if err != nil {
+					t.Fatalf("s.GetMainNICAddress failed: %v", err)
+				}
+				if ps == stack.NeverPrimaryEndpoint {
+					if want := (tcpip.AddressWithPrefix{}); addr != want {
+						t.Fatalf("got GetMainNICAddress = %s, want = %s", addr, want)
+
+					}
+				} else {
+					if addr.Address != "\x01" {
+						t.Fatalf("got GetMainNICAddress = %s, want = 1", addr.Address)
+					}
+				}
+			})
+		}
+	}
+}
+
+func TestIPv6SourceAddressSelectionScopeAndSameAddress(t *testing.T) {
+	const (
+		linkLocalAddr1         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		linkLocalAddr2         = tcpip.Address("\xfe\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		linkLocalMulticastAddr = tcpip.Address("\xff\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr1       = tcpip.Address("\xfc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		uniqueLocalAddr2       = tcpip.Address("\xfd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		globalAddr1            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01")
+		globalAddr2            = tcpip.Address("\xa0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02")
+		nicID                  = 1
+		lifetimeSeconds        = 9999
+	)
+
+	prefix1, _, stableGlobalAddr1 := prefixSubnetAddr(0, linkAddr1)
+	prefix2, _, stableGlobalAddr2 := prefixSubnetAddr(1, linkAddr1)
+
+	var tempIIDHistory [header.IIDSize]byte
+	header.InitialTempIID(tempIIDHistory[:], nil, nicID)
+	tempGlobalAddr1 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableGlobalAddr1.Address).Address
+	tempGlobalAddr2 := header.GenerateTempIPv6SLAACAddr(tempIIDHistory[:], stableGlobalAddr2.Address).Address
+
+	// Rule 3 (avoid deprecated addresses) is not tested here; NDP's AutoGenAddr test covers it.
+	tests := []struct {
+		name                                   string
+		slaacPrefixForTempAddrBeforeNICAddrAdd tcpip.AddressWithPrefix // if non-zero, injected in an RA before nicAddrs are added
+		nicAddrs                               []tcpip.Address         // added to the NIC in this order
+		slaacPrefixForTempAddrAfterNICAddrAdd  tcpip.AddressWithPrefix // if non-zero, injected in an RA after nicAddrs are added
+		connectAddr                            tcpip.Address           // remote address passed to addrForNewConnectionTo
+		expectedLocalAddr                      tcpip.Address           // local address addrForNewConnectionTo must return
+	}{
+		// Test Rule 1 of RFC 6724 section 5 (prefer same address).
+		{
+			name:              "Same Global most preferred (last address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       globalAddr1,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Same Global most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       globalAddr1,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Same Link Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalAddr1,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Same Link Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalAddr1,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Same Unique Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
+			connectAddr:       uniqueLocalAddr1,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+		{
+			name:              "Same Unique Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       uniqueLocalAddr1,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+
+		// Test Rule 2 of RFC 6724 section 5 (prefer appropriate scope).
+		{
+			name:              "Global most preferred (last address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Global most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: globalAddr1,
+		},
+		{
+			name:              "Link Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local most preferred for link local multicast (last address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, uniqueLocalAddr1, linkLocalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local most preferred for link local multicast (first address)",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:       linkLocalMulticastAddr,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Unique Local most preferred (last address)",
+			nicAddrs:          []tcpip.Address{uniqueLocalAddr1, globalAddr1, linkLocalAddr1},
+			connectAddr:       uniqueLocalAddr2,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+		{
+			name:              "Unique Local most preferred (first address)",
+			nicAddrs:          []tcpip.Address{globalAddr1, linkLocalAddr1, uniqueLocalAddr1},
+			connectAddr:       uniqueLocalAddr2,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+
+		// Test Rule 7 of RFC 6724 section 5 (prefer temporary addresses).
+		{
+			name:                                   "Temp Global most preferred (last address)",
+			slaacPrefixForTempAddrBeforeNICAddrAdd: prefix1,
+			nicAddrs:                               []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			connectAddr:                            globalAddr2,
+			expectedLocalAddr:                      tempGlobalAddr1,
+		},
+		{
+			name:                                  "Temp Global most preferred (first address)",
+			nicAddrs:                              []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, globalAddr1},
+			slaacPrefixForTempAddrAfterNICAddrAdd: prefix1,
+			connectAddr:                           globalAddr2,
+			expectedLocalAddr:                     tempGlobalAddr1,
+		},
+
+		// Test that the endpoint closest to the front is returned when the
+		// candidate addresses are "equal" from the perspective of RFC 6724
+		// section 5.
+		{
+			name:              "Unique Local for Global",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, uniqueLocalAddr1, uniqueLocalAddr2},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: uniqueLocalAddr1,
+		},
+		{
+			name:              "Link Local for Global",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, linkLocalAddr2},
+			connectAddr:       globalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:              "Link Local for Unique Local",
+			nicAddrs:          []tcpip.Address{linkLocalAddr1, linkLocalAddr2},
+			connectAddr:       uniqueLocalAddr2,
+			expectedLocalAddr: linkLocalAddr1,
+		},
+		{
+			name:                                   "Temp Global for Global",
+			slaacPrefixForTempAddrBeforeNICAddrAdd: prefix1,
+			slaacPrefixForTempAddrAfterNICAddrAdd:  prefix2,
+			connectAddr:                            globalAddr1,
+			expectedLocalAddr:                      tempGlobalAddr2,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			e := channel.New(0, 1280, linkAddr1)
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv6.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				NDPConfigs: stack.NDPConfigurations{
+					HandleRAs:                  true,
+					AutoGenGlobalAddresses:     true,
+					AutoGenTempGlobalAddresses: true,
+				},
+				NDPDisp: &ndpDispatcher{},
+			})
+			if err := s.CreateNIC(nicID, e); err != nil {
+				t.Fatalf("CreateNIC(%d, _) = %s", nicID, err)
+			}
+			s.SetRouteTable([]tcpip.Route{{
+				Destination: header.IPv6EmptySubnet,
+				Gateway:     llAddr3,
+				NIC:         nicID,
+			}})
+			s.AddLinkAddress(nicID, llAddr3, linkAddr3)
+
+			if test.slaacPrefixForTempAddrBeforeNICAddrAdd != (tcpip.AddressWithPrefix{}) {
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, test.slaacPrefixForTempAddrBeforeNICAddrAdd, true, true, lifetimeSeconds, lifetimeSeconds))
+			}
+
+			for _, a := range test.nicAddrs {
+				if err := s.AddAddress(nicID, ipv6.ProtocolNumber, a); err != nil {
+					t.Errorf("s.AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, a, err)
+				}
+			}
+
+			if test.slaacPrefixForTempAddrAfterNICAddrAdd != (tcpip.AddressWithPrefix{}) {
+				e.InjectInbound(header.IPv6ProtocolNumber, raBufWithPI(llAddr3, 0, test.slaacPrefixForTempAddrAfterNICAddrAdd, true, true, lifetimeSeconds, lifetimeSeconds))
+			}
+
+			if t.Failed() { // stop here if any AddAddress call above reported an error
+				t.FailNow()
+			}
+
+			if got := addrForNewConnectionTo(t, s, tcpip.FullAddress{Addr: test.connectAddr, NIC: nicID, Port: 1234}); got != test.expectedLocalAddr {
+				t.Errorf("got local address = %s, want = %s", got, test.expectedLocalAddr)
+			}
+		})
+	}
+}
+
+// TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable tests that the IPv4
+// broadcast address is added to a NIC when the NIC is enabled and removed
+// again when the NIC is disabled.
+func TestAddRemoveIPv4BroadcastAddressOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	e := loopback.New()
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv4.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	allStackAddrs := s.AllAddresses()
+	allNICAddrs, ok := allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+
+	// Enabling the NIC should add the IPv4 broadcast address.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 1 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 1", l)
+	}
+	want := tcpip.ProtocolAddress{
+		Protocol:          header.IPv4ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{Address: header.IPv4Broadcast, PrefixLen: 32},
+	}
+	if allNICAddrs[0] != want {
+		t.Fatalf("got allNICAddrs[0] = %+v, want = %+v", allNICAddrs[0], want)
+	}
+
+	// Disabling the NIC should remove the IPv4 broadcast address.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	allStackAddrs = s.AllAddresses()
+	allNICAddrs, ok = allStackAddrs[nicID]
+	if !ok {
+		t.Fatalf("entry for %d missing from allStackAddrs = %+v", nicID, allStackAddrs)
+	}
+	if l := len(allNICAddrs); l != 0 {
+		t.Fatalf("got len(allNICAddrs) = %d, want = 0", l)
+	}
+}
+
+// TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval tests that removing an IPv6
+// address after explicitly leaving its solicited-node multicast group does not
+// result in an error.
+func TestLeaveIPv6SolicitedNodeAddrBeforeAddrRemoval(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+	e := channel.New(10, 1280, linkAddr1)
+	if err := s.CreateNIC(nicID, e); err != nil {
+		t.Fatalf("CreateNIC(%d, _): %s", nicID, err)
+	}
+
+	if err := s.AddAddress(nicID, ipv6.ProtocolNumber, addr1); err != nil {
+		t.Fatalf("AddAddress(%d, %d, %s): %s", nicID, ipv6.ProtocolNumber, addr1, err)
+	}
+
+	// The NIC should have joined addr1's solicited node multicast address.
+	snmc := header.SolicitedNodeAddr(addr1)
+	in, err := s.IsInGroup(nicID, snmc)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, snmc, err)
+	}
+	if !in {
+		t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, snmc)
+	}
+
+	if err := s.LeaveGroup(ipv6.ProtocolNumber, nicID, snmc); err != nil { // leave before the address is removed
+		t.Fatalf("LeaveGroup(%d, %d, %s): %s", ipv6.ProtocolNumber, nicID, snmc, err)
+	}
+	in, err = s.IsInGroup(nicID, snmc)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, snmc, err)
+	}
+	if in {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, snmc)
+	}
+
+	if err := s.RemoveAddress(nicID, addr1); err != nil { // must succeed even though the group was already left
+		t.Fatalf("RemoveAddress(%d, %s) = %s", nicID, addr1, err)
+	}
+}
+
+// TestJoinLeaveAllNodesMulticastOnNICEnableDisable tests that the IPv6 all-nodes
+// multicast group is joined when a NIC is enabled and left when it is disabled.
+func TestJoinLeaveAllNodesMulticastOnNICEnableDisable(t *testing.T) {
+	const nicID = 1
+
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+	})
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, loopback.New(), nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	// The NIC was created disabled, so it should not be in the all-nodes group yet.
+	isInGroup, err := s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be joined when the NIC is enabled.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if !isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = false, want = true", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+
+	// The all-nodes multicast group should be left when the NIC is disabled.
+	if err := s.DisableNIC(nicID); err != nil {
+		t.Fatalf("s.DisableNIC(%d): %s", nicID, err)
+	}
+	isInGroup, err = s.IsInGroup(nicID, header.IPv6AllNodesMulticastAddress)
+	if err != nil {
+		t.Fatalf("IsInGroup(%d, %s): %s", nicID, header.IPv6AllNodesMulticastAddress, err)
+	}
+	if isInGroup {
+		t.Fatalf("got IsInGroup(%d, %s) = true, want = false", nicID, header.IPv6AllNodesMulticastAddress)
+	}
+}
+
+// TestDoDADWhenNICEnabled tests that IPv6 endpoints that were added while a NIC
+// was disabled have DAD performed on them when the NIC is enabled.
+func TestDoDADWhenNICEnabled(t *testing.T) {
+	const dadTransmits = 1
+	const retransmitTimer = time.Second
+	const nicID = 1
+
+	ndpDisp := ndpDispatcher{
+		dadC: make(chan ndpDADEvent),
+	}
+	opts := stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{ipv6.NewProtocol()},
+		NDPConfigs: stack.NDPConfigurations{
+			DupAddrDetectTransmits: dadTransmits,
+			RetransmitTimer:        retransmitTimer,
+		},
+		NDPDisp: &ndpDisp,
+	}
+
+	e := channel.New(dadTransmits, 1280, linkAddr1)
+	s := stack.New(opts)
+	nicOpts := stack.NICOptions{Disabled: true}
+	if err := s.CreateNICWithOptions(nicID, e, nicOpts); err != nil {
+		t.Fatalf("CreateNICWithOptions(%d, _, %+v) = %s", nicID, nicOpts, err)
+	}
+
+	addr := tcpip.ProtocolAddress{
+		Protocol: header.IPv6ProtocolNumber,
+		AddressWithPrefix: tcpip.AddressWithPrefix{
+			Address:   llAddr1,
+			PrefixLen: 128,
+		},
+	}
+	if err := s.AddProtocolAddress(nicID, addr); err != nil {
+		t.Fatalf("AddProtocolAddress(%d, %+v): %s", nicID, addr, err)
+	}
+
+	// Address should be in the list of all addresses.
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+
+	// Address should be tentative so it should not be a main address.
+	got, err := s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+
+	// Enabling the NIC should start DAD for the address.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+
+	// Address should not be considered bound to the NIC yet (DAD ongoing).
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if want := (tcpip.AddressWithPrefix{}); got != want {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, want)
+	}
+
+	// Wait for DAD to resolve; the address should then become the main address.
+	select {
+	case <-time.After(dadTransmits*retransmitTimer + defaultAsyncPositiveEventTimeout):
+		t.Fatal("timed out waiting for DAD resolution")
+	case event := <-ndpDisp.dadC: // named event so it does not shadow the link endpoint e
+		if diff := checkDADEvent(event, nicID, addr.AddressWithPrefix.Address, true, nil); diff != "" {
+			t.Errorf("dad event mismatch (-want +got):\n%s", diff)
+		}
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if got != addr.AddressWithPrefix {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix)
+	}
+
+	// Enabling the NIC again should be a no-op.
+	if err := s.EnableNIC(nicID); err != nil {
+		t.Fatalf("s.EnableNIC(%d): %s", nicID, err)
+	}
+	if addrs := s.AllAddresses()[nicID]; !containsV6Addr(addrs, addr.AddressWithPrefix) {
+		t.Fatalf("got s.AllAddresses()[%d] = %+v, want = %+v", nicID, addrs, addr)
+	}
+	got, err = s.GetMainNICAddress(nicID, header.IPv6ProtocolNumber)
+	if err != nil {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (_, %v), want = (_, nil)", nicID, header.IPv6ProtocolNumber, err)
+	}
+	if got != addr.AddressWithPrefix {
+		t.Fatalf("got stack.GetMainNICAddress(%d, %d) = (%s, nil), want = (%s, nil)", nicID, header.IPv6ProtocolNumber, got, addr.AddressWithPrefix)
+	}
+}
+
+func TestStackReceiveBufferSizeOption(t *testing.T) {
+	const sMin = stack.MinBufferSize
+	tests := []struct {
+		name    string
+		opt     stack.ReceiveBufferSizeOption
+		wantErr *tcpip.Error
+	}{
+		// Options that SetOption is expected to reject.
+		{name: "min_below_zero", opt: stack.ReceiveBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, wantErr: tcpip.ErrInvalidOptionValue},
+		{name: "min_zero", opt: stack.ReceiveBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, wantErr: tcpip.ErrInvalidOptionValue},
+		{name: "default_below_min", opt: stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin - 1, Max: sMin - 1}, wantErr: tcpip.ErrInvalidOptionValue},
+		{name: "default_above_max", opt: stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin}, wantErr: tcpip.ErrInvalidOptionValue},
+		{name: "max_below_min", opt: stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, wantErr: tcpip.ErrInvalidOptionValue},
+
+		// Options that SetOption is expected to accept and Option to return unchanged.
+		{name: "in_ascending_order", opt: stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, wantErr: nil},
+		{name: "all_equal", opt: stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, wantErr: nil},
+		{name: "min_default_equal", opt: stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, wantErr: nil},
+		{name: "default_max_equal", opt: stack.ReceiveBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, wantErr: nil},
+	}
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{})
+			defer s.Close()
+			if err := s.SetOption(test.opt); err != test.wantErr {
+				t.Fatalf("s.SetOption(%#v) = %v, want: %v", test.opt, err, test.wantErr)
+			}
+			var got stack.ReceiveBufferSizeOption
+			if test.wantErr == nil { // read back only when the option was accepted
+				if err := s.Option(&got); err != nil {
+					t.Fatalf("s.Option(%#v) = %v, want: nil", got, err)
+				}
+				if want := test.opt; got != want {
+					t.Fatalf("s.Option(..) returned unexpected value got: %#v, want: %#v", got, want)
+				}
+			}
+		})
+	}
+}
+
+func TestStackSendBufferSizeOption(t *testing.T) {
+	const sMin = stack.MinBufferSize
+	testCases := []struct {
+		name string
+		ss   stack.SendBufferSizeOption
+		err  *tcpip.Error
+	}{
+		// Invalid configurations. Min is kept at sMin unless Min itself is under test.
+		{"min_below_zero", stack.SendBufferSizeOption{Min: -1, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue},
+		{"min_zero", stack.SendBufferSizeOption{Min: 0, Default: sMin, Max: sMin}, tcpip.ErrInvalidOptionValue},
+		{"default_below_min", stack.SendBufferSizeOption{Min: sMin, Default: sMin - 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue},
+		{"default_above_max", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin}, tcpip.ErrInvalidOptionValue},
+		{"max_below_min", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin - 1}, tcpip.ErrInvalidOptionValue},
+
+		// Valid configurations; the stack must report them back unchanged.
+		{"in_ascending_order", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 2}, nil},
+		{"all_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin}, nil},
+		{"min_default_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin, Max: sMin + 1}, nil},
+		{"default_max_equal", stack.SendBufferSizeOption{Min: sMin, Default: sMin + 1, Max: sMin + 1}, nil},
+	}
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			s := stack.New(stack.Options{})
+			defer s.Close()
+			if err := s.SetOption(tc.ss); err != tc.err {
+				t.Fatalf("s.SetOption(%+v) = %v, want: %v", tc.ss, err, tc.err)
+			}
+			var ss stack.SendBufferSizeOption
+			if tc.err == nil { // read back only when the option was accepted
+				if err := s.Option(&ss); err != nil {
+					t.Fatalf("s.Option(%+v) = %v, want: nil", ss, err)
+				}
+				if got, want := ss, tc.ss; got != want {
+					t.Fatalf("s.Option(..) returned unexpected value got: %#v, want: %#v", got, want)
+				}
+			}
+		})
+	}
+}
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
new file mode 100644
index 000000000..b902c6ca9
--- /dev/null
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -0,0 +1,686 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"math/rand"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+)
+
+// protocolIDs identifies a (network protocol, transport protocol) pair. It is
+// the key type of the transportDemuxer's per-protocol maps.
+type protocolIDs struct {
+	network   tcpip.NetworkProtocolNumber
+	transport tcpip.TransportProtocolNumber
+}
+
+// transportEndpoints manages all endpoints of a given protocol. It has its own
+// mutex so as to reduce interference between protocols.
+//
+// The endpoints field maps an endpoint ID to the per-NIC collection of
+// endpoints registered under that ID.
+type transportEndpoints struct {
+	// mu protects all fields of the transportEndpoints.
+	mu        sync.RWMutex
+	endpoints map[TransportEndpointID]*endpointsByNIC
+	// rawEndpoints contains endpoints for raw sockets, which receive all
+	// traffic of a given protocol regardless of port.
+	rawEndpoints []RawTransportEndpoint
+}
+
+// unregisterEndpoint removes ep, registered under id and bindToDevice, so that
+// it won't receive any more packets. The map entry for id is deleted once the
+// per-NIC collection reports that it has become empty. Unknown ids are
+// ignored.
+func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+
+	// byNIC.unregisterEndpoint reports whether byNIC itself must be removed.
+	if byNIC, found := eps.endpoints[id]; found && byNIC.unregisterEndpoint(bindToDevice, ep, flags) {
+		delete(eps.endpoints, id)
+	}
+}
+
+// transportEndpoints returns every TransportEndpoint currently registered in
+// eps, across all endpoint IDs. eps.mu is held for reading while the result
+// is assembled.
+func (eps *transportEndpoints) transportEndpoints() []TransportEndpoint {
+	eps.mu.RLock()
+	defer eps.mu.RUnlock()
+
+	// The capacity is only a hint; an ID may map to more than one endpoint.
+	all := make([]TransportEndpoint, 0, len(eps.endpoints))
+	for _, byNIC := range eps.endpoints {
+		all = append(all, byNIC.transportEndpoints()...)
+	}
+	return all
+}
+
+// iterEndpointsLocked yields all endpointsByNIC in eps that match id, in
+// descending order of match quality:
+//
+//  1. the id exactly as provided;
+//  2. the id without its local address;
+//  3. the id without its remote address and port;
+//  4. the id reduced to its local port only.
+//
+// If a call to yield returns false, iterEndpointsLocked stops iteration and
+// returns immediately.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) iterEndpointsLocked(id TransportEndpointID, yield func(*endpointsByNIC) bool) {
+	noLocalAddr := id
+	noLocalAddr.LocalAddress = ""
+
+	noRemote := id
+	noRemote.RemoteAddress = ""
+	noRemote.RemotePort = 0
+
+	localPortOnly := noRemote
+	localPortOnly.LocalAddress = ""
+
+	// Probe the candidates from most to least specific.
+	for _, candidate := range [...]TransportEndpointID{id, noLocalAddr, noRemote, localPortOnly} {
+		if byNIC, ok := eps.endpoints[candidate]; ok && !yield(byNIC) {
+			return
+		}
+	}
+}
+
+// findAllEndpointsLocked collects every endpointsByNIC in eps that matches id,
+// ordered from the best match to the worst. The result is nil when nothing
+// matches.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findAllEndpointsLocked(id TransportEndpointID) []*endpointsByNIC {
+	var matches []*endpointsByNIC
+	collect := func(byNIC *endpointsByNIC) bool {
+		matches = append(matches, byNIC)
+		return true // Keep iterating so that every match is gathered.
+	}
+	eps.iterEndpointsLocked(id, collect)
+	return matches
+}
+
+// findEndpointLocked returns the endpointsByNIC that most closely matches the
+// given id, or nil if there is no match at all.
+//
+// Preconditions: eps.mu must be locked.
+func (eps *transportEndpoints) findEndpointLocked(id TransportEndpointID) *endpointsByNIC {
+	var best *endpointsByNIC
+	takeFirst := func(byNIC *endpointsByNIC) bool {
+		best = byNIC
+		return false // The first yielded match is the best one; stop here.
+	}
+	eps.iterEndpointsLocked(id, takeFirst)
+	return best
+}
+
+// endpointsByNIC holds the endpoints registered under a single
+// TransportEndpointID, grouped by the NIC ID that was passed as bindToDevice
+// when they were registered. mu guards the endpoints map.
+type endpointsByNIC struct {
+	mu        sync.RWMutex
+	endpoints map[tcpip.NICID]*multiPortEndpoint
+	// seed is a random secret for a jenkins hash.
+	seed uint32
+}
+
+// transportEndpoints returns every TransportEndpoint held by epsByNIC,
+// regardless of which NIC ID it is registered under. The result is nil when
+// there are none.
+func (epsByNIC *endpointsByNIC) transportEndpoints() []TransportEndpoint {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
+
+	var all []TransportEndpoint
+	for _, mpep := range epsByNIC.endpoints {
+		all = append(all, mpep.transportEndpoints()...)
+	}
+	return all
+}
+
+// handlePacket is called by the stack when new packets arrive for the
+// endpoints held by epsByNIC.
+//
+// The target group is the one registered under the ID of the NIC referenced by
+// r; if there is none, the group registered under NIC ID 0 is used instead. If
+// neither exists the function returns without delivering pkt.
+//
+// Broadcast and multicast packets (judged by id.LocalAddress) go to every
+// endpoint in the group; all others go to the single endpoint chosen by
+// selectEndpoint. epsByNIC.mu is held for reading until delivery completes.
+func (epsByNIC *endpointsByNIC) handlePacket(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
+	epsByNIC.mu.RLock()
+
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
+	if !ok {
+		// Fall back to the group registered with bindToDevice == 0.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
+			return
+		}
+	}
+
+	// If this is a broadcast or multicast datagram, deliver the datagram to all
+	// endpoints bound to the right device.
+	if isMulticastOrBroadcast(id.LocalAddress) {
+		mpep.handlePacketAll(r, id, pkt)
+		epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
+		return
+	}
+	// multiPortEndpoints are guaranteed to have at least one element.
+	transEP := selectEndpoint(id, mpep, epsByNIC.seed)
+	// Protocols present in queuedProtocols receive the packet through
+	// QueuePacket rather than through the endpoint's HandlePacket.
+	if queuedProtocol, mustQueue := mpep.demux.queuedProtocols[protocolIDs{mpep.netProto, mpep.transProto}]; mustQueue {
+		queuedProtocol.QueuePacket(r, transEP, id, pkt)
+		epsByNIC.mu.RUnlock()
+		return
+	}
+
+	transEP.HandlePacket(r, id, pkt)
+	epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
+}
+
+// handleControlPacket forwards a control packet to one endpoint held by
+// epsByNIC by calling that endpoint's HandleControlPacket.
+//
+// The endpoint group registered under n's ID is preferred; the group
+// registered under NIC ID 0 is the fallback. If neither exists the packet is
+// not delivered.
+func (epsByNIC *endpointsByNIC) handleControlPacket(n *NIC, id TransportEndpointID, typ ControlType, extra uint32, pkt *PacketBuffer) {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
+
+	mpep, found := epsByNIC.endpoints[n.ID()]
+	if !found {
+		if mpep, found = epsByNIC.endpoints[0]; !found {
+			return
+		}
+	}
+
+	// TODO(eyalsoha): Why don't we look at id to see if this packet needs to
+	// broadcast like we are doing with handlePacket above?
+
+	// multiPortEndpoints are guaranteed to have at least one element.
+	target := selectEndpoint(id, mpep, epsByNIC.seed)
+	target.HandleControlPacket(id, typ, extra, pkt)
+}
+
+// registerEndpoint adds t to the group of endpoints registered under
+// bindToDevice, creating that group if it does not exist yet. It returns nil
+// on success, or the error reported by the group's singleRegisterEndpoint
+// when the endpoint cannot be added.
+func (epsByNIC *endpointsByNIC) registerEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, t TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
+
+	mpep, exists := epsByNIC.endpoints[bindToDevice]
+	if !exists {
+		// First endpoint for this device: create its group.
+		mpep = new(multiPortEndpoint)
+		mpep.demux = d
+		mpep.netProto = netProto
+		mpep.transProto = transProto
+		epsByNIC.endpoints[bindToDevice] = mpep
+	}
+	return mpep.singleRegisterEndpoint(t, flags)
+}
+
+// checkEndpoint reports whether an endpoint with the given flags could be
+// registered under bindToDevice, without registering anything. It returns nil
+// when no group exists for bindToDevice, and otherwise the result of the
+// group's singleCheckEndpoint. The d, netProto and transProto parameters are
+// not used.
+func (epsByNIC *endpointsByNIC) checkEndpoint(d *transportDemuxer, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	epsByNIC.mu.RLock()
+	defer epsByNIC.mu.RUnlock()
+
+	if mpep, exists := epsByNIC.endpoints[bindToDevice]; exists {
+		return mpep.singleCheckEndpoint(flags)
+	}
+	return nil
+}
+
+// unregisterEndpoint removes t from the group registered under bindToDevice
+// and drops the group once it is empty. It returns true if endpointsByNIC has
+// to be unregistered, i.e. it no longer holds any group. It returns false
+// when there is no group for bindToDevice.
+func (epsByNIC *endpointsByNIC) unregisterEndpoint(bindToDevice tcpip.NICID, t TransportEndpoint, flags ports.Flags) bool {
+	epsByNIC.mu.Lock()
+	defer epsByNIC.mu.Unlock()
+
+	mpep, exists := epsByNIC.endpoints[bindToDevice]
+	if !exists {
+		return false
+	}
+	if groupEmpty := mpep.unregisterEndpoint(t, flags); groupEmpty {
+		delete(epsByNIC.endpoints, bindToDevice)
+	}
+	return len(epsByNIC.endpoints) == 0
+}
+
+// transportDemuxer demultiplexes packets targeted at a transport endpoint
+// (i.e., after they've been parsed by the network layer). It does two levels
+// of demultiplexing: first based on the network and transport protocols, then
+// based on endpoints IDs. It should only be instantiated via
+// newTransportDemuxer.
+//
+// queuedProtocols holds, for each protocol pair whose transport protocol
+// implements queuedTransportProtocol, that implementation; packets for those
+// pairs are handed to QueuePacket instead of the endpoint's HandlePacket.
+type transportDemuxer struct {
+	// protocol is immutable.
+	protocol        map[protocolIDs]*transportEndpoints
+	queuedProtocols map[protocolIDs]queuedTransportProtocol
+}
+
+// queuedTransportProtocol, if implemented by a transport protocol, causes the
+// dispatcher to deliver packets to the QueuePacket method instead of calling
+// HandlePacket directly on the endpoint.
+type queuedTransportProtocol interface {
+	QueuePacket(r *Route, ep TransportEndpoint, id TransportEndpointID, pkt *PacketBuffer)
+}
+
+// newTransportDemuxer builds a transportDemuxer with one (initially empty)
+// transportEndpoints table for every combination of the network and transport
+// protocols configured on stack. Transport protocols that implement
+// queuedTransportProtocol are additionally recorded in queuedProtocols.
+func newTransportDemuxer(stack *Stack) *transportDemuxer {
+	d := new(transportDemuxer)
+	d.protocol = make(map[protocolIDs]*transportEndpoints)
+	d.queuedProtocols = make(map[protocolIDs]queuedTransportProtocol)
+
+	// Add each network and transport pair to the demuxer.
+	for netProto := range stack.networkProtocols {
+		for transProto, state := range stack.transportProtocols {
+			key := protocolIDs{network: netProto, transport: transProto}
+			d.protocol[key] = &transportEndpoints{
+				endpoints: make(map[TransportEndpointID]*endpointsByNIC),
+			}
+			if queued, ok := state.proto.(queuedTransportProtocol); ok {
+				d.queuedProtocols[key] = queued
+			}
+		}
+	}
+	return d
+}
+
+// registerEndpoint registers the given endpoint with the dispatcher, under
+// every network protocol in netProtos, such that packets that match the
+// endpoint ID are delivered to it. If registration fails for one protocol,
+// the registrations already made for the preceding protocols are undone and
+// the error is returned.
+func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	for registered, netProto := range netProtos {
+		err := d.singleRegisterEndpoint(netProto, protocol, id, ep, flags, bindToDevice)
+		if err == nil {
+			continue
+		}
+		// Roll back the protocols that were registered before the failure.
+		d.unregisterEndpoint(netProtos[:registered], protocol, id, ep, flags, bindToDevice)
+		return err
+	}
+	return nil
+}
+
+// checkEndpoint checks if an endpoint can be registered with the dispatcher
+// under every network protocol in netProtos. It returns the first error
+// encountered, or nil if all checks pass. Nothing is registered.
+func (d *transportDemuxer) checkEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	for i := range netProtos {
+		err := d.singleCheckEndpoint(netProtos[i], protocol, id, flags, bindToDevice)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// multiPortEndpoint is a container for TransportEndpoints which are bound to
+// the same pair of address and port. The endpoints slice always has at least
+// one element.
+//
+// demux, netProto and transProto are set when the multiPortEndpoint is created
+// and are used to look up the protocol's entry in demux.queuedProtocols.
+//
+// FIXME(gvisor.dev/issue/873): Restore this properly. Currently, we just save
+// this to ensure that the underlying endpoints get saved/restored, but do not
+// use the restored copy.
+//
+// +stateify savable
+type multiPortEndpoint struct {
+	mu         sync.RWMutex `state:"nosave"`
+	demux      *transportDemuxer
+	netProto   tcpip.NetworkProtocolNumber
+	transProto tcpip.TransportProtocolNumber
+
+	// endpoints stores the transport endpoints in the order in which they
+	// were bound. This is required for UDP SO_REUSEADDR.
+	endpoints []TransportEndpoint
+	flags     ports.FlagCounter
+}
+
+// transportEndpoints returns a copy of the endpoints currently held by ep, in
+// the order in which they are stored. The caller may modify the returned slice
+// freely.
+func (ep *multiPortEndpoint) transportEndpoints() []TransportEndpoint {
+	ep.mu.RLock()
+	defer ep.mu.RUnlock()
+	return append([]TransportEndpoint(nil), ep.endpoints...)
+}
+
+// reciprocalScale maps val, taken as a fraction of the full uint32 range, onto
+// the range [0, n). It returns 0 when n is 0.
+//
+// This is similar to val % n, but faster.
+// See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+func reciprocalScale(val, n uint32) uint32 {
+	// The 64-bit product cannot overflow; its upper 32 bits are the result.
+	product := uint64(val) * uint64(n)
+	return uint32(product >> 32)
+}
+
+// selectEndpoint picks the endpoint of mpep that should handle a packet with
+// the given id:
+//
+//   - a lone endpoint is always chosen;
+//   - if the MostRecent flag is in effect for the group, the last endpoint in
+//     mpep.endpoints is chosen;
+//   - otherwise a jenkins hash, seeded with seed, of the local and remote
+//     ports and addresses selects the endpoint, so packets carrying the same
+//     id are always sent to the same endpoint.
+//
+// mpep.endpoints must not be empty.
+func selectEndpoint(id TransportEndpointID, mpep *multiPortEndpoint, seed uint32) TransportEndpoint {
+	count := len(mpep.endpoints)
+	switch {
+	case count == 1:
+		return mpep.endpoints[0]
+	case mpep.flags.IntersectionRefs().ToFlags().Effective().MostRecent:
+		return mpep.endpoints[count-1]
+	}
+
+	// Ports are hashed low byte first, local port before remote port.
+	portBytes := [4]byte{
+		byte(id.LocalPort),
+		byte(id.LocalPort >> 8),
+		byte(id.RemotePort),
+		byte(id.RemotePort >> 8),
+	}
+
+	hasher := jenkins.Sum32(seed)
+	hasher.Write(portBytes[:])
+	hasher.Write([]byte(id.LocalAddress))
+	hasher.Write([]byte(id.RemoteAddress))
+
+	return mpep.endpoints[reciprocalScale(hasher.Sum32(), uint32(count))]
+}
+
+// handlePacketAll delivers pkt to every endpoint in ep. Each endpoint but the
+// last receives a clone of pkt; the last one receives pkt itself. Delivery
+// goes through the protocol's QueuePacket when the protocol is listed in
+// ep.demux.queuedProtocols, and through the endpoint's HandlePacket otherwise.
+//
+// ep.endpoints must not be empty. ep.mu is held for reading during delivery.
+func (ep *multiPortEndpoint) handlePacketAll(r *Route, id TransportEndpointID, pkt *PacketBuffer) {
+	ep.mu.RLock()
+	queuedProtocol, mustQueue := ep.demux.queuedProtocols[protocolIDs{ep.netProto, ep.transProto}]
+	// HandlePacket takes ownership of pkt, so each endpoint needs
+	// its own copy except for the final one.
+	for _, endpoint := range ep.endpoints[:len(ep.endpoints)-1] {
+		if mustQueue {
+			queuedProtocol.QueuePacket(r, endpoint, id, pkt.Clone())
+		} else {
+			endpoint.HandlePacket(r, id, pkt.Clone())
+		}
+	}
+	// The final endpoint takes the original packet, saving one clone.
+	if endpoint := ep.endpoints[len(ep.endpoints)-1]; mustQueue {
+		queuedProtocol.QueuePacket(r, endpoint, id, pkt)
+	} else {
+		endpoint.HandlePacket(r, id, pkt)
+	}
+	ep.mu.RUnlock() // Don't use defer for performance reasons.
+}
+
+// singleRegisterEndpoint tries to add an endpoint to the multiPortEndpoint
+// list. The list might be empty already.
+//
+// When the list is non-empty and holds flag references, the multi-bind bits of
+// flags must overlap with ep.flags.IntersectionRefs(); otherwise
+// tcpip.ErrPortInUse is returned and nothing is added.
+func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint, flags ports.Flags) *tcpip.Error {
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	bits := flags.Bits() & ports.MultiBindFlagMask
+
+	// If it was previously bound, we need to check if we can bind again.
+	previouslyBound := len(ep.endpoints) != 0 && ep.flags.TotalRefs() > 0
+	if previouslyBound && bits&ep.flags.IntersectionRefs() == 0 {
+		return tcpip.ErrPortInUse
+	}
+
+	ep.endpoints = append(ep.endpoints, t)
+	ep.flags.AddRef(bits)
+	return nil
+}
+
+// singleCheckEndpoint performs the same admission check as
+// singleRegisterEndpoint without modifying ep. It returns tcpip.ErrPortInUse
+// when the list is non-empty, holds flag references, and the multi-bind bits
+// of flags do not overlap with ep.flags.IntersectionRefs(); it returns nil
+// otherwise.
+func (ep *multiPortEndpoint) singleCheckEndpoint(flags ports.Flags) *tcpip.Error {
+	ep.mu.RLock()
+	defer ep.mu.RUnlock()
+
+	bits := flags.Bits() & ports.MultiBindFlagMask
+
+	// If it was previously bound, we need to check if we can bind again.
+	previouslyBound := len(ep.endpoints) != 0 && ep.flags.TotalRefs() > 0
+	if previouslyBound && bits&ep.flags.IntersectionRefs() == 0 {
+		return tcpip.ErrPortInUse
+	}
+	return nil
+}
+
+// unregisterEndpoint removes the first occurrence of t from ep, preserving the
+// order of the remaining endpoints, and drops the flag reference that was
+// taken for it. It returns true if multiPortEndpoint has to be unregistered,
+// i.e. no endpoints are left. If t is not present nothing is changed.
+func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint, flags ports.Flags) bool {
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	pos := -1
+	for i := range ep.endpoints {
+		if ep.endpoints[i] == t {
+			pos = i
+			break
+		}
+	}
+
+	if pos >= 0 {
+		// Shift the tail down by one and clear the vacated slot so that the
+		// removed endpoint is not kept alive by the backing array.
+		last := len(ep.endpoints) - 1
+		copy(ep.endpoints[pos:], ep.endpoints[pos+1:])
+		ep.endpoints[last] = nil
+		ep.endpoints = ep.endpoints[:last]
+
+		ep.flags.DropRef(flags.Bits() & ports.MultiBindFlagMask)
+	}
+	return len(ep.endpoints) == 0
+}
+
+// singleRegisterEndpoint registers ep under id for a single network protocol.
+// It returns tcpip.ErrUnknownProtocol when the (netProto, protocol) pair is
+// not known to the demuxer, and otherwise the result of registering ep in the
+// per-NIC collection for id, which is created (with a fresh random hash seed)
+// if it does not exist yet.
+func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	// SO_REUSEPORT only applies to bound/listening endpoints.
+	if id.RemotePort != 0 {
+		flags.LoadBalanced = false
+	}
+
+	eps, known := d.protocol[protocolIDs{network: netProto, transport: protocol}]
+	if !known {
+		return tcpip.ErrUnknownProtocol
+	}
+
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+
+	byNIC, exists := eps.endpoints[id]
+	if !exists {
+		byNIC = new(endpointsByNIC)
+		byNIC.endpoints = make(map[tcpip.NICID]*multiPortEndpoint)
+		byNIC.seed = rand.Uint32()
+		eps.endpoints[id] = byNIC
+	}
+	return byNIC.registerEndpoint(d, netProto, protocol, ep, flags, bindToDevice)
+}
+
+// singleCheckEndpoint checks, for a single network protocol, whether an
+// endpoint with the given id and flags could be registered. It returns
+// tcpip.ErrUnknownProtocol when the (netProto, protocol) pair is not known to
+// the demuxer, nil when nothing is registered under id yet, and otherwise the
+// result of the per-NIC collection's checkEndpoint. Nothing is modified.
+func (d *transportDemuxer) singleCheckEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, flags ports.Flags, bindToDevice tcpip.NICID) *tcpip.Error {
+	// SO_REUSEPORT only applies to bound/listening endpoints.
+	if id.RemotePort != 0 {
+		flags.LoadBalanced = false
+	}
+
+	eps, known := d.protocol[protocolIDs{network: netProto, transport: protocol}]
+	if !known {
+		return tcpip.ErrUnknownProtocol
+	}
+
+	eps.mu.RLock()
+	defer eps.mu.RUnlock()
+
+	if byNIC, exists := eps.endpoints[id]; exists {
+		return byNIC.checkEndpoint(d, netProto, protocol, flags, bindToDevice)
+	}
+	return nil
+}
+
+// unregisterEndpoint unregisters the endpoint with the given id, for every
+// network protocol in netProtos, such that it won't receive any more packets.
+// Protocol pairs unknown to the demuxer are skipped.
+func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, flags ports.Flags, bindToDevice tcpip.NICID) {
+	// SO_REUSEPORT only applies to bound/listening endpoints.
+	if id.RemotePort != 0 {
+		flags.LoadBalanced = false
+	}
+
+	for _, netProto := range netProtos {
+		eps, known := d.protocol[protocolIDs{network: netProto, transport: protocol}]
+		if !known {
+			continue
+		}
+		eps.unregisterEndpoint(id, ep, flags, bindToDevice)
+	}
+}
+
+// deliverPacket attempts to find one or more matching transport endpoints, and
+// then, if matches are found, delivers the packet to them. Returns true if
+// the packet no longer needs to be handled.
+//
+// Three cases are distinguished:
+//   - UDP packets whose local address is broadcast or multicast are delivered
+//     to every matching set of endpoints;
+//   - TCP packets whose route has a non-unicast local or remote address are
+//     counted as invalid segments and reported as handled;
+//   - everything else is delivered to the best-matching set of endpoints.
+//
+// For UDP, the UnknownPortErrors counter is incremented when no endpoint
+// matches.
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer, id TransportEndpointID) bool {
+	eps, known := d.protocol[protocolIDs{network: r.NetProto, transport: protocol}]
+	if !known {
+		return false
+	}
+
+	isUDP := protocol == header.UDPProtocolNumber
+
+	// If the packet is a UDP broadcast or multicast, then find all matching
+	// transport endpoints.
+	if isUDP && isMulticastOrBroadcast(id.LocalAddress) {
+		eps.mu.RLock()
+		matches := eps.findAllEndpointsLocked(id)
+		eps.mu.RUnlock()
+
+		// Fail if we didn't find at least one matching transport endpoint.
+		if len(matches) == 0 {
+			r.Stats().UDP.UnknownPortErrors.Increment()
+			return false
+		}
+
+		// handlePacket takes ownership of pkt, so each endpoint needs its own
+		// copy except for the final one.
+		last := len(matches) - 1
+		for _, byNIC := range matches[:last] {
+			byNIC.handlePacket(r, id, pkt.Clone())
+		}
+		matches[last].handlePacket(r, id, pkt)
+		return true
+	}
+
+	// TCP can only be used to communicate between a single source and a
+	// single destination; the addresses must be unicast. Otherwise do nothing
+	// further and instruct the caller to do the same.
+	if protocol == header.TCPProtocolNumber && !(isUnicast(r.LocalAddress) && isUnicast(r.RemoteAddress)) {
+		r.Stats().TCP.InvalidSegmentsReceived.Increment()
+		return true
+	}
+
+	eps.mu.RLock()
+	byNIC := eps.findEndpointLocked(id)
+	eps.mu.RUnlock()
+
+	if byNIC != nil {
+		byNIC.handlePacket(r, id, pkt)
+		return true
+	}
+	if isUDP {
+		r.Stats().UDP.UnknownPortErrors.Increment()
+	}
+	return false
+}
+
+// deliverRawPacket hands pkt to every raw endpoint registered for the
+// (r.NetProto, protocol) pair and returns whether at least one raw endpoint
+// received it. It returns false when the protocol pair is unknown to the
+// demuxer.
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, pkt *PacketBuffer) bool {
+	eps, known := d.protocol[protocolIDs{network: r.NetProto, transport: protocol}]
+	if !known {
+		return false
+	}
+
+	// As in net/ipv4/ip_input.c:ip_local_deliver, attempt to deliver via
+	// raw endpoint first. If there are multiple raw endpoints, they all
+	// receive the packet.
+	eps.mu.RLock()
+	delivered := len(eps.rawEndpoints) != 0
+	for _, rawEP := range eps.rawEndpoints {
+		// NOTE(review): the same pkt is handed to every raw endpoint; any
+		// per-endpoint copy (e.g. for save/restore) would have to be made
+		// inside HandlePacket. Confirm against the raw endpoint
+		// implementation.
+		rawEP.HandlePacket(r, pkt)
+	}
+	eps.mu.RUnlock()
+
+	return delivered
+}
+
+// deliverControlPacket attempts to deliver the given control packet to the
+// set of endpoints that best matches id for the (net, trans) protocol pair.
+// Returns true if it found such a set, false otherwise (including when the
+// protocol pair is unknown to the demuxer).
+func (d *transportDemuxer) deliverControlPacket(n *NIC, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, pkt *PacketBuffer, id TransportEndpointID) bool {
+	eps, known := d.protocol[protocolIDs{network: net, transport: trans}]
+	if !known {
+		return false
+	}
+
+	eps.mu.RLock()
+	byNIC := eps.findEndpointLocked(id)
+	eps.mu.RUnlock()
+
+	if byNIC != nil {
+		byNIC.handleControlPacket(n, id, typ, extra, pkt)
+		return true
+	}
+	return false
+}
+
+// findTransportEndpoint finds a single endpoint that most closely matches the
+// provided id. It returns nil when the protocol pair is unknown, when no
+// endpoint matches id, or when the match has no endpoint group for either the
+// NIC referenced by r or NIC ID 0.
+func (d *transportDemuxer) findTransportEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, id TransportEndpointID, r *Route) TransportEndpoint {
+	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
+	if !ok {
+		return nil
+	}
+
+	eps.mu.RLock()
+	epsByNIC := eps.findEndpointLocked(id)
+	if epsByNIC == nil {
+		eps.mu.RUnlock()
+		return nil
+	}
+
+	// Take the inner lock before releasing the outer one, so that the two
+	// critical sections overlap.
+	epsByNIC.mu.RLock()
+	eps.mu.RUnlock()
+
+	mpep, ok := epsByNIC.endpoints[r.ref.nic.ID()]
+	if !ok {
+		// Fall back to the group registered with bindToDevice == 0.
+		if mpep, ok = epsByNIC.endpoints[0]; !ok {
+			epsByNIC.mu.RUnlock() // Don't use defer for performance reasons.
+			return nil
+		}
+	}
+
+	ep := selectEndpoint(id, mpep, epsByNIC.seed)
+	epsByNIC.mu.RUnlock()
+	return ep
+}
+
+// registerRawEndpoint registers the given endpoint with the dispatcher such
+// that packets of the appropriate protocol are delivered to it. A single
+// packet can be sent to one or more raw endpoints along with a non-raw
+// endpoint.
+//
+// It returns tcpip.ErrNotSupported when the (netProto, transProto) pair is
+// unknown to the demuxer.
+func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
+	eps, known := d.protocol[protocolIDs{network: netProto, transport: transProto}]
+	if !known {
+		return tcpip.ErrNotSupported
+	}
+
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+	eps.rawEndpoints = append(eps.rawEndpoints, ep)
+	return nil
+}
+
+// unregisterRawEndpoint unregisters the raw endpoint for the given transport
+// protocol such that it won't receive any more packets. Only the first
+// occurrence of ep is removed; the order of the remaining raw endpoints is not
+// preserved. An endpoint that is not registered is ignored.
+//
+// It panics if the (netProto, transProto) pair is unknown to the demuxer.
+func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) {
+	eps, known := d.protocol[protocolIDs{network: netProto, transport: transProto}]
+	if !known {
+		panic(fmt.Errorf("tried to unregister endpoint with unsupported network and transport protocol pair: %d, %d", netProto, transProto))
+	}
+
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+
+	for i := range eps.rawEndpoints {
+		if eps.rawEndpoints[i] != ep {
+			continue
+		}
+		// Swap-remove: move the last element into the hole, then clear and
+		// drop the final slot.
+		last := len(eps.rawEndpoints) - 1
+		eps.rawEndpoints[i] = eps.rawEndpoints[last]
+		eps.rawEndpoints[last] = nil
+		eps.rawEndpoints = eps.rawEndpoints[:last]
+		return
+	}
+}
+
+// isMulticastOrBroadcast reports whether addr is the IPv4 broadcast address,
+// an IPv4 multicast address, or an IPv6 multicast address.
+func isMulticastOrBroadcast(addr tcpip.Address) bool {
+	switch {
+	case addr == header.IPv4Broadcast:
+		return true
+	case header.IsV4MulticastAddress(addr):
+		return true
+	default:
+		return header.IsV6MulticastAddress(addr)
+	}
+}
+
+// isUnicast reports whether addr is neither the IPv4 nor the IPv6 "any"
+// address and is not a multicast or broadcast address.
+func isUnicast(addr tcpip.Address) bool {
+	if addr == header.IPv4Any || addr == header.IPv6Any {
+		return false
+	}
+	return !isMulticastOrBroadcast(addr)
+}
diff --git a/pkg/tcpip/stack/transport_demuxer_test.go b/pkg/tcpip/stack/transport_demuxer_test.go
new file mode 100644
index 000000000..73dada928
--- /dev/null
+++ b/pkg/tcpip/stack/transport_demuxer_test.go
@@ -0,0 +1,390 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack_test
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// IPv6 source and destination addresses (::1 and ::2) placed in the
+	// packets built by sendV6Packet.
+	testSrcAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	testDstAddrV6 = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+
+	// IPv4 source and destination addresses (10.0.0.1 and 10.0.0.2) placed in
+	// the packets built by sendV4Packet.
+	testSrcAddrV4 = "\x0a\x00\x00\x01"
+	testDstAddrV4 = "\x0a\x00\x00\x02"
+
+	// UDP ports. testDstPort is the port the test endpoints bind to;
+	// testSrcPort is presumably the source port of injected packets (confirm
+	// against its callers).
+	testDstPort = 1234
+	testSrcPort = 4096
+)
+
+// testContext bundles the stack under test with the channel link endpoints of
+// its NICs, keyed by NIC ID, through which the tests inject packets.
+type testContext struct {
+	linkEps map[tcpip.NICID]*channel.Endpoint
+	s       *stack.Stack
+	wq      waiter.Queue
+}
+
+// newDualTestContextMultiNIC creates a testing context whose stack has IPv4,
+// IPv6 and UDP enabled, with one channel-backed NIC for every ID in linkEpIDs.
+// Every NIC is assigned both testDstAddrV4 and testDstAddrV6, and the route
+// table sends all IPv4 and IPv6 traffic through NIC 1. Any setup failure
+// aborts the test.
+func newDualTestContextMultiNIC(t *testing.T, mtu uint32, linkEpIDs []tcpip.NICID) *testContext {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+	})
+
+	linkEps := make(map[tcpip.NICID]*channel.Endpoint)
+	for _, nicID := range linkEpIDs {
+		linkEP := channel.New(256, mtu, "")
+		if err := s.CreateNIC(nicID, linkEP); err != nil {
+			t.Fatalf("CreateNIC failed: %s", err)
+		}
+		linkEps[nicID] = linkEP
+
+		if err := s.AddAddress(nicID, ipv4.ProtocolNumber, testDstAddrV4); err != nil {
+			t.Fatalf("AddAddress IPv4 failed: %s", err)
+		}
+		if err := s.AddAddress(nicID, ipv6.ProtocolNumber, testDstAddrV6); err != nil {
+			t.Fatalf("AddAddress IPv6 failed: %s", err)
+		}
+	}
+
+	routes := []tcpip.Route{
+		{Destination: header.IPv4EmptySubnet, NIC: 1},
+		{Destination: header.IPv6EmptySubnet, NIC: 1},
+	}
+	s.SetRouteTable(routes)
+
+	c := new(testContext)
+	c.s = s
+	c.linkEps = linkEps
+	return c
+}
+
+// headers carries the UDP source and destination ports that sendV4Packet and
+// sendV6Packet write into the packets they build.
+type headers struct {
+	srcPort, dstPort uint16
+}
+
+// newPayload returns a slice of random bytes whose length is chosen at random
+// in the range [30, 130).
+func newPayload() []byte {
+	size := 30 + rand.Intn(100)
+	payload := make([]byte, size)
+	for i := 0; i < size; i++ {
+		payload[i] = byte(rand.Intn(256))
+	}
+	return payload
+}
+
+// sendV4Packet builds an IPv4/UDP packet carrying payload from testSrcAddrV4
+// to testDstAddrV4, using the ports in h, fills in the IPv4 and UDP checksums,
+// and injects it as inbound traffic on the link endpoint of NIC linkEpID.
+func (c *testContext) sendV4Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
+	// Allocate a buffer for data and headers; the payload sits at the tail.
+	udpLen := header.UDPMinimumSize + len(payload)
+	frame := buffer.NewView(header.IPv4MinimumSize + udpLen)
+	copy(frame[len(frame)-len(payload):], payload)
+
+	// Initialize the IP header.
+	ipHdr := header.IPv4(frame)
+	ipHdr.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TOS:         0x80,
+		TotalLength: uint16(len(frame)),
+		TTL:         65,
+		Protocol:    uint8(udp.ProtocolNumber),
+		SrcAddr:     testSrcAddrV4,
+		DstAddr:     testDstAddrV4,
+	})
+	ipHdr.SetChecksum(^ipHdr.CalculateChecksum())
+
+	// Initialize the UDP header.
+	udpHdr := header.UDP(frame[header.IPv4MinimumSize:])
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcPort,
+		DstPort: h.dstPort,
+		Length:  uint16(udpLen),
+	})
+
+	// The UDP checksum covers the pseudo-header, the payload and the UDP
+	// header itself.
+	csum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV4, testDstAddrV4, uint16(len(udpHdr)))
+	csum = header.Checksum(payload, csum)
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(csum))
+
+	// Inject packet.
+	inbound := &stack.PacketBuffer{
+		Data:            frame.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ipHdr),
+		TransportHeader: buffer.View(udpHdr),
+	}
+	c.linkEps[linkEpID].InjectInbound(ipv4.ProtocolNumber, inbound)
+}
+
+// sendV6Packet builds an IPv6/UDP packet carrying payload from testSrcAddrV6
+// to testDstAddrV6, using the ports in h, fills in the UDP checksum, and
+// injects it as inbound traffic on the link endpoint of NIC linkEpID.
+func (c *testContext) sendV6Packet(payload []byte, h *headers, linkEpID tcpip.NICID) {
+	// Allocate a buffer for data and headers; the payload sits at the tail.
+	udpLen := header.UDPMinimumSize + len(payload)
+	frame := buffer.NewView(header.IPv6MinimumSize + udpLen)
+	copy(frame[len(frame)-len(payload):], payload)
+
+	// Initialize the IP header.
+	ipHdr := header.IPv6(frame)
+	ipHdr.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(udpLen),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       testSrcAddrV6,
+		DstAddr:       testDstAddrV6,
+	})
+
+	// Initialize the UDP header.
+	udpHdr := header.UDP(frame[header.IPv6MinimumSize:])
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcPort,
+		DstPort: h.dstPort,
+		Length:  uint16(udpLen),
+	})
+
+	// The UDP checksum covers the pseudo-header, the payload and the UDP
+	// header itself.
+	csum := header.PseudoHeaderChecksum(udp.ProtocolNumber, testSrcAddrV6, testDstAddrV6, uint16(len(udpHdr)))
+	csum = header.Checksum(payload, csum)
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(csum))
+
+	// Inject packet.
+	inbound := &stack.PacketBuffer{
+		Data:            frame.ToVectorisedView(),
+		NetworkHeader:   buffer.View(ipHdr),
+		TransportHeader: buffer.View(udpHdr),
+	}
+	c.linkEps[linkEpID].InjectInbound(ipv6.ProtocolNumber, inbound)
+}
+
+// TestTransportDemuxerRegister checks the result of registering a UDP
+// endpoint on a stack that is configured with IPv4 only: registration under
+// IPv4 must succeed, while registration under IPv6 must fail with
+// tcpip.ErrUnknownProtocol.
+func TestTransportDemuxerRegister(t *testing.T) {
+	tests := []struct {
+		name  string
+		proto tcpip.NetworkProtocolNumber
+		want  *tcpip.Error
+	}{
+		{"failure", ipv6.ProtocolNumber, tcpip.ErrUnknownProtocol},
+		{"success", ipv4.ProtocolNumber, nil},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			s := stack.New(stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+			})
+
+			var wq waiter.Queue
+			ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+			if err != nil {
+				t.Fatal(err)
+			}
+			tEP, ok := ep.(stack.TransportEndpoint)
+			if !ok {
+				t.Fatalf("%T does not implement stack.TransportEndpoint", ep)
+			}
+
+			netProtos := []tcpip.NetworkProtocolNumber{test.proto}
+			got := s.RegisterTransportEndpoint(0, netProtos, udp.ProtocolNumber, stack.TransportEndpointID{}, tEP, ports.Flags{}, 0)
+			if want := test.want; got != want {
+				t.Fatalf("s.RegisterTransportEndpoint(...) = %s, want %s", got, want)
+			}
+		})
+	}
+}
+
+// TestBindToDeviceDistribution injects varied packets on input devices and checks that
+// the distribution of packets received matches expectations.
+func TestBindToDeviceDistribution(t *testing.T) {
+	type endpointSockopts struct {
+		reuse        bool
+		bindToDevice tcpip.NICID
+	}
+	for _, test := range []struct {
+		name string
+		// endpoints will receive the injected packets.
+		endpoints []endpointSockopts
+		// wantDistributions is the expected ratio of packets received on each
+		// endpoint for each NIC on which packets are injected.
+		wantDistributions map[tcpip.NICID][]float64
+	}{
+		{
+			"BindPortReuse",
+			// 5 endpoints that all have reuse set.
+			[]endpointSockopts{
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+				{reuse: true, bindToDevice: 0},
+			},
+			map[tcpip.NICID][]float64{
+				// Injected packets on dev0 get distributed evenly.
+				1: {0.2, 0.2, 0.2, 0.2, 0.2},
+			},
+		},
+		{
+			"BindToDevice",
+			// 3 endpoints with various bindings.
+			[]endpointSockopts{
+				{reuse: false, bindToDevice: 1},
+				{reuse: false, bindToDevice: 2},
+				{reuse: false, bindToDevice: 3},
+			},
+			map[tcpip.NICID][]float64{
+				// Injected packets on dev0 go only to the endpoint bound to dev0.
+				1: {1, 0, 0},
+				// Injected packets on dev1 go only to the endpoint bound to dev1.
+				2: {0, 1, 0},
+				// Injected packets on dev2 go only to the endpoint bound to dev2.
+				3: {0, 0, 1},
+			},
+		},
+		{
+			"ReuseAndBindToDevice",
+			// 6 endpoints with various bindings.
+			[]endpointSockopts{
+				{reuse: true, bindToDevice: 1},
+				{reuse: true, bindToDevice: 1},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 2},
+				{reuse: true, bindToDevice: 0},
+			},
+			map[tcpip.NICID][]float64{
+				// Injected packets on dev0 get distributed among endpoints bound to
+				// dev0.
+				1: {0.5, 0.5, 0, 0, 0, 0},
+				// Injected packets on dev1 get distributed among endpoints bound to
+				// dev1; the unbound endpoint receives none of them.
+				2: {0, 0, 1. / 3, 1. / 3, 1. / 3, 0},
+				// Injected packets on dev999 go only to the unbound.
+				1000: {0, 0, 0, 0, 0, 1},
+			},
+		},
+	} {
+		for protoName, netProtoNum := range map[string]tcpip.NetworkProtocolNumber{
+			"IPv4": ipv4.ProtocolNumber,
+			"IPv6": ipv6.ProtocolNumber,
+		} {
+			for device, wantDistribution := range test.wantDistributions {
+				t.Run(test.name+protoName+string(device), func(t *testing.T) {
+					var devices []tcpip.NICID
+					for d := range test.wantDistributions {
+						devices = append(devices, d)
+					}
+					c := newDualTestContextMultiNIC(t, defaultMTU, devices)
+
+					eps := make(map[tcpip.Endpoint]int)
+
+					pollChannel := make(chan tcpip.Endpoint)
+					for i, endpoint := range test.endpoints {
+						// Try to receive the data.
+						wq := waiter.Queue{}
+						we, ch := waiter.NewChannelEntry(nil)
+						wq.EventRegister(&we, waiter.EventIn)
+						defer wq.EventUnregister(&we)
+						defer close(ch)
+
+						var err *tcpip.Error
+						ep, err := c.s.NewEndpoint(udp.ProtocolNumber, netProtoNum, &wq)
+						if err != nil {
+							t.Fatalf("NewEndpoint failed: %s", err)
+						}
+						eps[ep] = i
+
+						go func(ep tcpip.Endpoint) {
+							for range ch {
+								pollChannel <- ep
+							}
+						}(ep)
+
+						defer ep.Close()
+						if err := ep.SetSockOptBool(tcpip.ReusePortOption, endpoint.reuse); err != nil {
+							t.Fatalf("SetSockOptBool(ReusePortOption, %t) on endpoint %d failed: %s", endpoint.reuse, i, err)
+						}
+						bindToDeviceOption := tcpip.BindToDeviceOption(endpoint.bindToDevice)
+						if err := ep.SetSockOpt(bindToDeviceOption); err != nil {
+							t.Fatalf("SetSockOpt(%#v) on endpoint %d failed: %s", bindToDeviceOption, i, err)
+						}
+
+						var dstAddr tcpip.Address
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							dstAddr = testDstAddrV4
+						case ipv6.ProtocolNumber:
+							dstAddr = testDstAddrV6
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
+						}
+						if err := ep.Bind(tcpip.FullAddress{Addr: dstAddr, Port: testDstPort}); err != nil {
+							t.Fatalf("ep.Bind(...) on endpoint %d failed: %s", i, err)
+						}
+					}
+
+					npackets := 100000
+					nports := 10000
+					if got, want := len(test.endpoints), len(wantDistribution); got != want {
+						t.Fatalf("got len(test.endpoints) = %d, want %d", got, want)
+					}
+					ports := make(map[uint16]tcpip.Endpoint)
+					stats := make(map[tcpip.Endpoint]int)
+					for i := 0; i < npackets; i++ {
+						// Send a packet.
+						port := uint16(i % nports)
+						payload := newPayload()
+						hdrs := &headers{
+							srcPort: testSrcPort + port,
+							dstPort: testDstPort,
+						}
+						switch netProtoNum {
+						case ipv4.ProtocolNumber:
+							c.sendV4Packet(payload, hdrs, device)
+						case ipv6.ProtocolNumber:
+							c.sendV6Packet(payload, hdrs, device)
+						default:
+							t.Fatalf("unexpected protocol number: %d", netProtoNum)
+						}
+
+						ep := <-pollChannel
+						if _, _, err := ep.Read(nil); err != nil {
+							t.Fatalf("Read on endpoint %d failed: %s", eps[ep], err)
+						}
+						stats[ep]++
+						if i < nports {
+							ports[uint16(i)] = ep
+						} else {
+							// Check that all packets from one client are handled by the same
+							// socket.
+							if want, got := ports[port], ep; want != got {
+								t.Fatalf("Packet sent on port %d expected on endpoint %d but received on endpoint %d", port, eps[want], eps[got])
+							}
+						}
+					}
+
+					// Check that a packet distribution is as expected.
+					for ep, i := range eps {
+						wantRatio := wantDistribution[i]
+						wantRecv := wantRatio * float64(npackets)
+						actualRecv := stats[ep]
+						actualRatio := float64(stats[ep]) / float64(npackets)
+						// The deviation is less than 10%.
+						if math.Abs(actualRatio-wantRatio) > 0.05 {
+							t.Errorf("want about %.0f%% (%.0f of %d) packets to arrive on endpoint %d, got %.0f%% (%d of %d)", wantRatio*100, wantRecv, npackets, i, actualRatio*100, actualRecv, npackets)
+						}
+					}
+				})
+			}
+		}
+	}
+}
diff --git a/pkg/tcpip/stack/transport_test.go b/pkg/tcpip/stack/transport_test.go
new file mode 100644
index 000000000..7e8b84867
--- /dev/null
+++ b/pkg/tcpip/stack/transport_test.go
@@ -0,0 +1,664 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	fakeTransNumber    tcpip.TransportProtocolNumber = 1 // protocol number reported by fakeTransportProtocol.Number
+	fakeTransHeaderLen                               = 3 // header size in bytes; Parse strips this many
+)
+
+// fakeTransportEndpoint is a transport-layer protocol endpoint. It counts
+// received packets; the counts of all endpoints are aggregated in the protocol
+// descriptor.
+//
+// Headers of this protocol are fakeTransHeaderLen bytes, but their contents
+// are currently unused.
+type fakeTransportEndpoint struct {
+	stack.TransportEndpointInfo
+	stack    *stack.Stack
+	proto    *fakeTransportProtocol // shared descriptor holding the aggregated counters
+	peerAddr tcpip.Address          // remote address recorded by Connect or HandlePacket
+	route    stack.Route            // route used by Write; released by Close
+	uniqueID uint64
+
+	// acceptQueue is non-nil iff bound.
+	acceptQueue []fakeTransportEndpoint
+}
+
+func (f *fakeTransportEndpoint) Info() tcpip.EndpointInfo {
+	return &f.TransportEndpointInfo // pointer to the embedded info, not a copy
+}
+
+func (f *fakeTransportEndpoint) Stats() tcpip.EndpointStats {
+	return nil // this fake tracks no per-endpoint statistics
+}
+
+func (f *fakeTransportEndpoint) SetOwner(owner tcpip.PacketOwner) {} // no-op: the owner is ignored
+
+func newFakeTransportEndpoint(s *stack.Stack, proto *fakeTransportProtocol, netProto tcpip.NetworkProtocolNumber, uniqueID uint64) tcpip.Endpoint {
+	return &fakeTransportEndpoint{stack: s, TransportEndpointInfo: stack.TransportEndpointInfo{NetProto: netProto}, proto: proto, uniqueID: uniqueID} // acceptQueue stays nil, i.e. unbound
+}
+
+func (f *fakeTransportEndpoint) Abort() {
+	f.Close() // Abort is just Close for this fake
+}
+
+func (f *fakeTransportEndpoint) Close() {
+	f.route.Release() // releases the route held by this endpoint
+}
+
+func (*fakeTransportEndpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return mask // every requested event is reported as ready
+}
+
+func (*fakeTransportEndpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	return buffer.View{}, tcpip.ControlMessages{}, nil // always an empty read; HandlePacket stores no data
+}
+
+func (f *fakeTransportEndpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	if len(f.route.RemoteAddress) == 0 { // no route has been set on this endpoint yet
+		return 0, nil, tcpip.ErrNoRoute
+	}
+
+	hdr := buffer.NewPrependable(int(f.route.MaxHeaderLength()) + fakeTransHeaderLen)
+	hdr.Prepend(fakeTransHeaderLen) // reserve the transport header; its bytes are never written here
+	v, err := p.FullPayload()
+	if err != nil {
+		return 0, nil, err
+	}
+	if err := f.route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: fakeTransNumber, TTL: 123, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+		Header: hdr,
+		Data:   buffer.View(v).ToVectorisedView(),
+	}); err != nil {
+		return 0, nil, err
+	}
+
+	return int64(len(v)), nil, nil // the whole payload is reported as written; the channel is always nil
+}
+
+func (f *fakeTransportEndpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+	return 0, tcpip.ControlMessages{}, nil // always reports zero bytes; the buffers are left untouched
+}
+
+// SetSockOpt is unsupported and always fails with ErrInvalidEndpointState.
+func (*fakeTransportEndpoint) SetSockOpt(interface{}) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// SetSockOptBool is unsupported and always fails with ErrInvalidEndpointState.
+func (*fakeTransportEndpoint) SetSockOptBool(tcpip.SockOptBool, bool) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// SetSockOptInt is unsupported and always fails with ErrInvalidEndpointState.
+func (*fakeTransportEndpoint) SetSockOptInt(tcpip.SockOptInt, int) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (*fakeTransportEndpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrUnknownProtocolOption // no boolean options are supported
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
+func (*fakeTransportEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+	return -1, tcpip.ErrUnknownProtocolOption // no integer options are supported
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+func (*fakeTransportEndpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	switch opt.(type) {
+	case tcpip.ErrorOption: // the only option accepted; it succeeds without reporting an error
+		return nil
+	}
+	return tcpip.ErrInvalidEndpointState // every other option type is rejected
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect.
+func (*fakeTransportEndpoint) Disconnect() *tcpip.Error {
+	return tcpip.ErrNotSupported // this fake cannot be disconnected
+}
+
+func (f *fakeTransportEndpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	f.peerAddr = addr.Addr // recorded before any check, so it is kept even when Connect fails
+
+	// Find the route.
+	r, err := f.stack.FindRoute(addr.NIC, "", addr.Addr, fakeNetNumber, false /* multicastLoop */)
+	if err != nil {
+		return tcpip.ErrNoRoute // the underlying lookup error is replaced
+	}
+	defer r.Release() // on success f.route holds its own clone
+
+	// Try to register so that we can start receiving packets.
+	f.ID.RemoteAddress = addr.Addr
+	err = f.stack.RegisterTransportEndpoint(0, []tcpip.NetworkProtocolNumber{fakeNetNumber}, fakeTransNumber, f.ID, f, ports.Flags{}, 0 /* bindToDevice */)
+	if err != nil {
+		return err
+	}
+
+	f.route = r.Clone()
+
+	return nil
+}
+
+func (f *fakeTransportEndpoint) UniqueID() uint64 {
+	return f.uniqueID // set by newFakeTransportEndpoint; zero for endpoints queued by HandlePacket
+}
+
+func (f *fakeTransportEndpoint) ConnectEndpoint(e tcpip.Endpoint) *tcpip.Error {
+	return nil // no-op: the peer endpoint is ignored
+}
+
+func (*fakeTransportEndpoint) Shutdown(tcpip.ShutdownFlags) *tcpip.Error {
+	return nil // no-op: the flags are ignored
+}
+
+func (*fakeTransportEndpoint) Reset() { // no-op
+}
+
+func (*fakeTransportEndpoint) Listen(int) *tcpip.Error {
+	return nil // no-op: the backlog is ignored; queuing is enabled by Bind
+}
+
+func (f *fakeTransportEndpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+	if len(f.acceptQueue) == 0 { // nothing queued: all-nil result, not an error
+		return nil, nil, nil
+	}
+	a := f.acceptQueue[0] // copy the head so the returned pointer does not alias the queue
+	f.acceptQueue = f.acceptQueue[1:]
+	return &a, nil, nil
+}
+
+func (f *fakeTransportEndpoint) Bind(a tcpip.FullAddress) *tcpip.Error {
+	if err := f.stack.RegisterTransportEndpoint(
+		a.NIC,
+		[]tcpip.NetworkProtocolNumber{fakeNetNumber},
+		fakeTransNumber,
+		stack.TransportEndpointID{LocalAddress: a.Addr},
+		f,
+		ports.Flags{},
+		0, /* bindToDevice */
+	); err != nil {
+		return err
+	}
+	f.acceptQueue = []fakeTransportEndpoint{} // non-nil (though empty) marks the endpoint as bound
+	return nil
+}
+
+func (*fakeTransportEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{}, nil // always the zero address
+}
+
+func (*fakeTransportEndpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{}, nil // always the zero address, even after Connect
+}
+
+func (f *fakeTransportEndpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, _ *stack.PacketBuffer) {
+	// Increment the protocol-wide count of received packets.
+	f.proto.packetCount++
+	if f.acceptQueue != nil { // bound: queue a new endpoint for Accept, carrying a clone of the packet's route
+		f.acceptQueue = append(f.acceptQueue, fakeTransportEndpoint{
+			stack: f.stack,
+			TransportEndpointInfo: stack.TransportEndpointInfo{
+				ID:       f.ID,
+				NetProto: f.NetProto,
+			},
+			proto:    f.proto,
+			peerAddr: r.RemoteAddress,
+			route:    r.Clone(),
+		})
+	}
+}
+
+func (f *fakeTransportEndpoint) HandleControlPacket(stack.TransportEndpointID, stack.ControlType, uint32, *stack.PacketBuffer) {
+	// Increment the protocol-wide count of received control packets.
+	f.proto.controlCount++
+}
+
+func (f *fakeTransportEndpoint) State() uint32 {
+	return 0 // constant: this fake tracks no connection state
+}
+
+func (f *fakeTransportEndpoint) ModerateRecvBuf(copied int) {} // no-op
+
+func (f *fakeTransportEndpoint) IPTables() (stack.IPTables, error) {
+	return stack.IPTables{}, nil // always zero-valued tables
+}
+
+func (f *fakeTransportEndpoint) Resume(*stack.Stack) {} // no-op
+
+func (f *fakeTransportEndpoint) Wait() {} // no-op
+
+type fakeTransportGoodOption bool // accepted by SetOption and readable back through Option
+
+type fakeTransportBadOption bool // handled nowhere, so SetOption reports ErrUnknownProtocolOption
+
+type fakeTransportInvalidValueOption int // SetOption always rejects it with ErrInvalidOptionValue
+
+type fakeTransportProtocolOptions struct {
+	good bool // last value stored via fakeTransportGoodOption
+}
+
+// fakeTransportProtocol is a transport-layer protocol descriptor. It
+// aggregates the number of packets received via endpoints of this protocol.
+type fakeTransportProtocol struct {
+	packetCount  int // incremented by each endpoint's HandlePacket
+	controlCount int // incremented by each endpoint's HandleControlPacket
+	opts         fakeTransportProtocolOptions
+}
+
+func (*fakeTransportProtocol) Number() tcpip.TransportProtocolNumber {
+	return fakeTransNumber // constant for every instance
+}
+
+func (f *fakeTransportProtocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newFakeTransportEndpoint(stack, f, netProto, stack.UniqueID()), nil // the waiter queue is ignored; never fails
+}
+
+func (*fakeTransportProtocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, _ *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return nil, tcpip.ErrUnknownProtocol // raw endpoints are not provided by this fake
+}
+
+func (*fakeTransportProtocol) MinimumPacketSize() int {
+	return fakeTransHeaderLen // a packet needs at least the fake header
+}
+
+func (*fakeTransportProtocol) ParsePorts(buffer.View) (src, dst uint16, err *tcpip.Error) {
+	return 0, 0, nil // the input is ignored; both ports are always 0
+}
+
+func (*fakeTransportProtocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
+	return true // NOTE(review): presumably means "handled" — confirm against stack.TransportProtocol
+}
+
+func (f *fakeTransportProtocol) SetOption(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case fakeTransportGoodOption: // the only option that is stored
+		f.opts.good = bool(v)
+		return nil
+	case fakeTransportInvalidValueOption: // recognized, but every value is rejected
+		return tcpip.ErrInvalidOptionValue
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+func (f *fakeTransportProtocol) Option(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case *fakeTransportGoodOption: // must be a pointer: the stored value is written through it
+		*v = fakeTransportGoodOption(f.opts.good)
+		return nil
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// Abort implements TransportProtocol.Abort. It is a no-op.
+func (*fakeTransportProtocol) Abort() {}
+
+// Close implements TransportProtocol.Close. It is a no-op.
+func (*fakeTransportProtocol) Close() {}
+
+// Wait implements TransportProtocol.Wait. It is a no-op.
+func (*fakeTransportProtocol) Wait() {}
+
+// Parse implements TransportProtocol.Parse.
+func (*fakeTransportProtocol) Parse(pkt *stack.PacketBuffer) bool {
+	hdr, ok := pkt.Data.PullUp(fakeTransHeaderLen)
+	if !ok { // the header could not be pulled up from pkt.Data
+		return false
+	}
+	pkt.TransportHeader = hdr
+	pkt.Data.TrimFront(fakeTransHeaderLen) // drop the header bytes from pkt.Data
+	return true
+}
+
+func fakeTransFactory() stack.TransportProtocol {
+	return &fakeTransportProtocol{} // a fresh descriptor with zeroed counters on every call
+}
+
+func TestTransportReceive(t *testing.T) {
+	linkEP := channel.New(10, defaultMTU, "")
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
+		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+	})
+	if err := s.CreateNIC(1, linkEP); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatalf("AddAddress failed: %v", err)
+	}
+
+	// Create endpoint and connect to remote address.
+	wq := waiter.Queue{}
+	ep, err := s.NewEndpoint(fakeTransNumber, fakeNetNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	if err := ep.Connect(tcpip.FullAddress{Addr: "\x02"}); err != nil {
+		t.Fatalf("Connect failed: %v", err)
+	}
+
+	fakeTrans := s.TransportProtocolInstance(fakeTransNumber).(*fakeTransportProtocol)
+
+	// Create buffer that will hold the packet.
+	buf := buffer.NewView(30)
+
+	// Make sure packet with wrong protocol is not delivered.
+	buf[0] = 1 // destination address: ours
+	buf[2] = 0 // protocol byte: not fakeTransNumber
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeTrans.packetCount != 0 {
+		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0)
+	}
+
+	// Make sure packet from the wrong source is not delivered.
+	buf[0] = 1
+	buf[1] = 3 // source address: not the peer we connected to
+	buf[2] = byte(fakeTransNumber)
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeTrans.packetCount != 0 {
+		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 0)
+	}
+
+	// Make sure packet is delivered.
+	buf[0] = 1
+	buf[1] = 2 // source address: the connected peer
+	buf[2] = byte(fakeTransNumber)
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeTrans.packetCount != 1 {
+		t.Errorf("packetCount = %d, want %d", fakeTrans.packetCount, 1)
+	}
+}
+
+func TestTransportControlReceive(t *testing.T) {
+	linkEP := channel.New(10, defaultMTU, "")
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
+		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+	})
+	if err := s.CreateNIC(1, linkEP); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatalf("AddAddress failed: %v", err)
+	}
+
+	// Create endpoint and connect to remote address.
+	wq := waiter.Queue{}
+	ep, err := s.NewEndpoint(fakeTransNumber, fakeNetNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	if err := ep.Connect(tcpip.FullAddress{Addr: "\x02"}); err != nil {
+		t.Fatalf("Connect failed: %v", err)
+	}
+
+	fakeTrans := s.TransportProtocolInstance(fakeTransNumber).(*fakeTransportProtocol)
+
+	// Create buffer that will hold the control packet.
+	buf := buffer.NewView(2*fakeNetHeaderLen + 30)
+
+	// Outer packet contains the control protocol number.
+	buf[0] = 1
+	buf[1] = 0xfe
+	buf[2] = uint8(fakeControlProtocol)
+
+	// Make sure packet with wrong protocol is not delivered.
+	buf[fakeNetHeaderLen+0] = 0
+	buf[fakeNetHeaderLen+1] = 1
+	buf[fakeNetHeaderLen+2] = 0 // inner protocol byte: not fakeTransNumber
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeTrans.controlCount != 0 {
+		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0)
+	}
+
+	// Make sure packet from the wrong source is not delivered.
+	buf[fakeNetHeaderLen+0] = 3
+	buf[fakeNetHeaderLen+1] = 1
+	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeTrans.controlCount != 0 {
+		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 0)
+	}
+
+	// Make sure packet is delivered.
+	buf[fakeNetHeaderLen+0] = 2
+	buf[fakeNetHeaderLen+1] = 1
+	buf[fakeNetHeaderLen+2] = byte(fakeTransNumber)
+	linkEP.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+	if fakeTrans.controlCount != 1 {
+		t.Errorf("controlCount = %d, want %d", fakeTrans.controlCount, 1)
+	}
+}
+
+func TestTransportSend(t *testing.T) {
+	linkEP := channel.New(10, defaultMTU, "")
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
+		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+	})
+	if err := s.CreateNIC(1, linkEP); err != nil {
+		t.Fatalf("CreateNIC failed: %v", err)
+	}
+
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatalf("AddAddress failed: %v", err)
+	}
+
+	{
+		subnet, err := tcpip.NewSubnet("\x00", "\x00")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{{Destination: subnet, Gateway: "\x00", NIC: 1}})
+	}
+
+	// Create endpoint and connect to remote address.
+	wq := waiter.Queue{}
+	ep, err := s.NewEndpoint(fakeTransNumber, fakeNetNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	if err := ep.Connect(tcpip.FullAddress{Addr: "\x02"}); err != nil {
+		t.Fatalf("Connect failed: %v", err)
+	}
+
+	// Create buffer that will hold the payload.
+	view := buffer.NewView(30)
+	_, _, err = ep.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{})
+	if err != nil {
+		t.Fatalf("write failed: %v", err)
+	}
+
+	fakeNet := s.NetworkProtocolInstance(fakeNetNumber).(*fakeNetworkProtocol)
+
+	if fakeNet.sendPacketCount[2] != 1 {
+		t.Errorf("sendPacketCount = %d, want %d", fakeNet.sendPacketCount[2], 1)
+	}
+}
+
+func TestTransportOptions(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
+		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+	})
+
+	// Try an unsupported transport protocol.
+	if err := s.SetTransportProtocolOption(tcpip.TransportProtocolNumber(99999), fakeTransportGoodOption(false)); err != tcpip.ErrUnknownProtocol {
+		t.Fatalf("SetTransportProtocolOption(fakeTrans2, blah, false) = %v, want = tcpip.ErrUnknownProtocol", err)
+	}
+
+	testCases := []struct {
+		option   interface{}
+		wantErr  *tcpip.Error
+		verifier func(t *testing.T, p stack.TransportProtocol) // optional extra check run after the option is set
+	}{
+		{fakeTransportGoodOption(true), nil, func(t *testing.T, p stack.TransportProtocol) {
+			t.Helper()
+			fakeTrans := p.(*fakeTransportProtocol)
+			if fakeTrans.opts.good != true {
+				t.Fatalf("fakeTrans.opts.good = false, want = true")
+			}
+			var v fakeTransportGoodOption
+			if err := s.TransportProtocolOption(fakeTransNumber, &v); err != nil {
+				t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) = %v, want = nil, where v is option %T", err, v)
+			}
+			if v != true {
+				t.Fatalf("s.TransportProtocolOption(fakeTransNumber, &v) returned v = %v, want = true", v)
+			}
+
+		}},
+		{fakeTransportBadOption(true), tcpip.ErrUnknownProtocolOption, nil},
+		{fakeTransportInvalidValueOption(1), tcpip.ErrInvalidOptionValue, nil},
+	}
+	for _, tc := range testCases {
+		if got := s.SetTransportProtocolOption(fakeTransNumber, tc.option); got != tc.wantErr {
+			t.Errorf("s.SetTransportProtocolOption(fakeTrans, %v) = %v, want = %v", tc.option, got, tc.wantErr)
+		}
+		if tc.verifier != nil {
+			tc.verifier(t, s.TransportProtocolInstance(fakeTransNumber))
+		}
+	}
+}
+
+func TestTransportForwarding(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{fakeNetFactory()},
+		TransportProtocols: []stack.TransportProtocol{fakeTransFactory()},
+	})
+	s.SetForwarding(true)
+
+	// TODO(b/123449044): Change this to a channel NIC.
+	ep1 := loopback.New()
+	if err := s.CreateNIC(1, ep1); err != nil {
+		t.Fatalf("CreateNIC #1 failed: %v", err)
+	}
+	if err := s.AddAddress(1, fakeNetNumber, "\x01"); err != nil {
+		t.Fatalf("AddAddress #1 failed: %v", err)
+	}
+
+	ep2 := channel.New(10, defaultMTU, "")
+	if err := s.CreateNIC(2, ep2); err != nil {
+		t.Fatalf("CreateNIC #2 failed: %v", err)
+	}
+	if err := s.AddAddress(2, fakeNetNumber, "\x02"); err != nil {
+		t.Fatalf("AddAddress #2 failed: %v", err)
+	}
+
+	// Route all packets to address 3 to NIC 2 and all packets to address
+	// 1 to NIC 1.
+	{
+		subnet0, err := tcpip.NewSubnet("\x03", "\xff")
+		if err != nil {
+			t.Fatal(err)
+		}
+		subnet1, err := tcpip.NewSubnet("\x01", "\xff")
+		if err != nil {
+			t.Fatal(err)
+		}
+		s.SetRouteTable([]tcpip.Route{
+			{Destination: subnet0, Gateway: "\x00", NIC: 2},
+			{Destination: subnet1, Gateway: "\x00", NIC: 1},
+		})
+	}
+
+	wq := waiter.Queue{}
+	ep, err := s.NewEndpoint(fakeTransNumber, fakeNetNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	if err := ep.Bind(tcpip.FullAddress{Addr: "\x01", NIC: 1}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	// Send a packet to address 1 from address 3.
+	req := buffer.NewView(30)
+	req[0] = 1 // destination address
+	req[1] = 3 // source address
+	req[2] = byte(fakeTransNumber)
+	ep2.InjectInbound(fakeNetNumber, &stack.PacketBuffer{
+		Data: req.ToVectorisedView(),
+	})
+
+	aep, _, err := ep.Accept()
+	if err != nil || aep == nil {
+		t.Fatalf("Accept failed: %v, %v", aep, err)
+	}
+
+	resp := buffer.NewView(30)
+	if _, _, err := aep.Write(tcpip.SlicePayload(resp), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %v", err)
+	}
+
+	p, ok := ep2.Read()
+	if !ok {
+		t.Fatal("Response packet not forwarded")
+	}
+
+	if dst := p.Pkt.NetworkHeader[0]; dst != 3 {
+		t.Errorf("Response packet has incorrect destination address: got = %d, want = 3", dst)
+	}
+	if src := p.Pkt.NetworkHeader[1]; src != 1 {
+		t.Errorf("Response packet has incorrect source address: got = %d, want = 1", src)
+	}
+}
diff --git a/pkg/tcpip/tcpip.go b/pkg/tcpip/tcpip.go
new file mode 100644
index 000000000..25534a10d
--- /dev/null
+++ b/pkg/tcpip/tcpip.go
@@ -0,0 +1,1616 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tcpip provides the interfaces and related types that users of the
+// tcpip stack will use in order to create endpoints used to send and receive
+// data over the network stack.
+//
+// The starting point is the creation and configuration of a stack. A stack can
+// be created by calling the New() function of the tcpip/stack/stack package;
+// configuring a stack involves creating NICs (via calls to Stack.CreateNIC()),
+// adding network addresses (via calls to Stack.AddAddress()), and
+// setting a route table (via a call to Stack.SetRouteTable()).
+//
+// Once a stack is configured, endpoints can be created by calling
+// Stack.NewEndpoint(). Such endpoints can be used to send/receive data, connect
+// to peers, listen for connections, accept connections, etc., depending on the
+// transport protocol selected.
+package tcpip
+
+import (
+	"errors"
+	"fmt"
+	"math/bits"
+	"reflect"
+	"strconv"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Error represents an error in the netstack error space. Using a special type
+// ensures that errors outside of this space are not accidentally introduced.
+//
+// Note: to support save / restore, every tcpip error must have a distinct
+// error message; StringToError looks errors up by message.
+type Error struct {
+	msg string // msg is the text returned by String.
+
+	ignoreStats bool // ignoreStats is the value returned by IgnoreStats.
+}
+
+// String implements fmt.Stringer.String; a nil *Error formats as "<nil>".
+func (e *Error) String() string {
+	if e != nil {
+		return e.msg
+	}
+	return "<nil>"
+}
+
+// IgnoreStats returns the error's ignoreStats flag. When it is true the error
+// is meant to be left out of the failure counts in tcpip.Stats structs.
+func (e *Error) IgnoreStats() bool {
+	return e.ignoreStats
+}
+
+// Errors that can be returned by the network stack; StringToError lists them all.
+var (
+	ErrUnknownProtocol           = &Error{msg: "unknown protocol"}
+	ErrUnknownNICID              = &Error{msg: "unknown nic id"}
+	ErrUnknownDevice             = &Error{msg: "unknown device"}
+	ErrUnknownProtocolOption     = &Error{msg: "unknown option for protocol"}
+	ErrDuplicateNICID            = &Error{msg: "duplicate nic id"}
+	ErrDuplicateAddress          = &Error{msg: "duplicate address"}
+	ErrNoRoute                   = &Error{msg: "no route"}
+	ErrBadLinkEndpoint           = &Error{msg: "bad link layer endpoint"}
+	ErrAlreadyBound              = &Error{msg: "endpoint already bound", ignoreStats: true}
+	ErrInvalidEndpointState      = &Error{msg: "endpoint is in invalid state"}
+	ErrAlreadyConnecting         = &Error{msg: "endpoint is already connecting", ignoreStats: true}
+	ErrAlreadyConnected          = &Error{msg: "endpoint is already connected", ignoreStats: true}
+	ErrNoPortAvailable           = &Error{msg: "no ports are available"}
+	ErrPortInUse                 = &Error{msg: "port is in use"}
+	ErrBadLocalAddress           = &Error{msg: "bad local address"}
+	ErrClosedForSend             = &Error{msg: "endpoint is closed for send"}
+	ErrClosedForReceive          = &Error{msg: "endpoint is closed for receive"}
+	ErrWouldBlock                = &Error{msg: "operation would block", ignoreStats: true}
+	ErrConnectionRefused         = &Error{msg: "connection was refused"}
+	ErrTimeout                   = &Error{msg: "operation timed out"}
+	ErrAborted                   = &Error{msg: "operation aborted"}
+	ErrConnectStarted            = &Error{msg: "connection attempt started", ignoreStats: true}
+	ErrDestinationRequired       = &Error{msg: "destination address is required"}
+	ErrNotSupported              = &Error{msg: "operation not supported"}
+	ErrQueueSizeNotSupported     = &Error{msg: "queue size querying not supported"}
+	ErrNotConnected              = &Error{msg: "endpoint not connected"}
+	ErrConnectionReset           = &Error{msg: "connection reset by peer"}
+	ErrConnectionAborted         = &Error{msg: "connection aborted"}
+	ErrNoSuchFile                = &Error{msg: "no such file"}
+	ErrInvalidOptionValue        = &Error{msg: "invalid option value specified"}
+	ErrNoLinkAddress             = &Error{msg: "no remote link address"}
+	ErrBadAddress                = &Error{msg: "bad address"}
+	ErrNetworkUnreachable        = &Error{msg: "network is unreachable"}
+	ErrMessageTooLong            = &Error{msg: "message too long"}
+	ErrNoBufferSpace             = &Error{msg: "no buffer space available"}
+	ErrBroadcastDisabled         = &Error{msg: "broadcast socket option disabled"}
+	ErrNotPermitted              = &Error{msg: "operation not permitted"}
+	ErrAddressFamilyNotSupported = &Error{msg: "address family not supported by protocol"}
+)
+
+// messageToError maps an error message to its *Error; StringToError fills it.
+var messageToError map[string]*Error
+
+var populate sync.Once // guards the one-time construction of messageToError
+
+// StringToError returns the tcpip error whose message is s. It panics when s
+// matches no known error or, on first use, when two errors share a message.
+func StringToError(s string) *Error {
+	populate.Do(func() {
+		known := []*Error{
+			ErrUnknownProtocol,
+			ErrUnknownNICID,
+			ErrUnknownDevice,
+			ErrUnknownProtocolOption,
+			ErrDuplicateNICID,
+			ErrDuplicateAddress,
+			ErrNoRoute,
+			ErrBadLinkEndpoint,
+			ErrAlreadyBound,
+			ErrInvalidEndpointState,
+			ErrAlreadyConnecting,
+			ErrAlreadyConnected,
+			ErrNoPortAvailable,
+			ErrPortInUse,
+			ErrBadLocalAddress,
+			ErrClosedForSend,
+			ErrClosedForReceive,
+			ErrWouldBlock,
+			ErrConnectionRefused,
+			ErrTimeout,
+			ErrAborted,
+			ErrConnectStarted,
+			ErrDestinationRequired,
+			ErrNotSupported,
+			ErrQueueSizeNotSupported,
+			ErrNotConnected,
+			ErrConnectionReset,
+			ErrConnectionAborted,
+			ErrNoSuchFile,
+			ErrInvalidOptionValue,
+			ErrNoLinkAddress,
+			ErrBadAddress,
+			ErrNetworkUnreachable,
+			ErrMessageTooLong,
+			ErrNoBufferSpace,
+			ErrBroadcastDisabled,
+			ErrNotPermitted,
+			ErrAddressFamilyNotSupported,
+		}
+		messageToError = make(map[string]*Error)
+		for _, e := range known {
+			msg := e.String()
+			if messageToError[msg] != nil {
+				panic("tcpip errors with duplicated message: " + msg)
+			}
+			messageToError[msg] = e
+		}
+	})
+
+	if e, ok := messageToError[s]; ok {
+		return e
+	}
+	panic("unknown error message: " + s)
+}
+
+// Errors returned by NewSubnet when the address and mask are inconsistent.
+var (
+	errSubnetLengthMismatch = errors.New("subnet length of address and mask differ")
+	errSubnetAddressMasked  = errors.New("subnet address has bits set outside the mask")
+)
+
+// ErrSaveRejection indicates a failed save due to unsupported networking state.
+// This type of error is only used for save logic.
+type ErrSaveRejection struct {
+	Err error // Err is the underlying cause; Error includes its message.
+}
+
+// Error returns a description of the save rejection; Err must be non-nil.
+func (e ErrSaveRejection) Error() string {
+	return "save rejected due to unsupported networking state: " + e.Err.Error()
+}
+
+// A Clock provides the current time, as real time and as a monotonic reading.
+//
+// Times returned by a Clock should always be used for application-visible
+// time. Only monotonic times should be used for netstack internal timekeeping.
+type Clock interface {
+	// NowNanoseconds returns the current real time as a number of
+	// nanoseconds since the Unix epoch.
+	NowNanoseconds() int64
+
+	// NowMonotonic returns a monotonic time value (unit not stated here; presumably ns - confirm).
+	NowMonotonic() int64
+}
+
+// Address is a byte slice cast as a string that represents the address of a
+// network node. In the case of unix endpoints, it may instead represent a path.
+type Address string
+
+// AddressMask is a bitmask for an address, stored as a string of raw bytes.
+type AddressMask string
+
+// String implements Stringer by formatting the mask the same way as an Address.
+func (m AddressMask) String() string {
+	return Address(m).String()
+}
+
+// Prefix returns the number of leading one bits, i.e. the bits before the first host (zero) bit.
+func (m AddressMask) Prefix() int {
+	p := 0
+	for i := 0; i < len(m) && p == 8*i; i++ { // p != 8*i: an earlier byte held a host bit, so stop
+		p += bits.LeadingZeros8(^m[i])
+	}
+	return p
+}
+
+// Subnet is a subnet defined by its address and mask; build one with NewSubnet.
+type Subnet struct {
+	address Address     // subnet ID; NewSubnet rejects bits set outside mask
+	mask    AddressMask // same length as address when built by NewSubnet
+}
+
+// NewSubnet creates a Subnet after checking that a and m have equal length and that a has no bit set outside m.
+func NewSubnet(a Address, m AddressMask) (Subnet, error) {
+	if len(m) != len(a) {
+		return Subnet{}, errSubnetLengthMismatch
+	}
+	for i, b := range []byte(a) {
+		if host := b &^ m[i]; host != 0 {
+			return Subnet{}, errSubnetAddressMasked
+		}
+	}
+	return Subnet{address: a, mask: m}, nil
+}
+
+// String implements Stringer, formatting the subnet as "<ID>/<prefix length>".
+func (s Subnet) String() string {
+	return fmt.Sprintf("%s/%d", s.address, s.mask.Prefix())
+}
+
+// Contains reports whether a has the same length as the subnet address and,
+// once masked byte by byte, equals the subnet address.
+func (s *Subnet) Contains(a Address) bool {
+	if len(s.address) != len(a) {
+		return false
+	}
+	for i, b := range []byte(a) {
+		if b&s.mask[i] != s.address[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// ID returns the subnet ID, i.e. the address the Subnet was created with.
+func (s *Subnet) ID() Address {
+	return s.address
+}
+
+// Bits returns the mask's prefix length as ones (network bits) and the rest
+// of the mask's bits as zeros (host bits).
+func (s *Subnet) Bits() (ones int, zeros int) {
+	prefix := s.mask.Prefix()
+	return prefix, 8*len(s.mask) - prefix
+}
+
+// Prefix returns the number of bits before the first host bit of the mask.
+func (s *Subnet) Prefix() int {
+	return s.mask.Prefix()
+}
+
+// Mask returns the subnet mask the Subnet was created with.
+func (s *Subnet) Mask() AddressMask {
+	return s.mask
+}
+
+// Broadcast returns the subnet's broadcast address: the ID with every host bit set.
+func (s *Subnet) Broadcast() Address {
+	bcast := make([]byte, len(s.address))
+	for i := range bcast {
+		bcast[i] = s.address[i] | ^s.mask[i]
+	}
+	return Address(bcast)
+}
+
+// Equal reports whether s and o have the same address and mask.
+//
+// It exists so that cmp.Equal can compare Subnets despite the unexported fields.
+func (s Subnet) Equal(o Subnet) bool {
+	return s.address == o.address && s.mask == o.mask
+}
+
+// NICID is a number that uniquely identifies a NIC.
+type NICID int32
+
+// ShutdownFlags represents flags that can be passed to the Shutdown() method
+// of the Endpoint interface.
+type ShutdownFlags int
+
+// Values of the flags that can be passed to the Shutdown() method. They are
+// distinct bits (1 and 2) and can be OR'ed together.
+const (
+	ShutdownRead ShutdownFlags = 1 << iota
+	ShutdownWrite
+)
+
+// FullAddress represents a full transport node address, as required by the
+// Connect() and Bind() methods.
+//
+// +stateify savable
+type FullAddress struct {
+	// NIC is the ID of the NIC this address refers to.
+	//
+	// This may not be used by all endpoint types.
+	NIC NICID
+
+	// Addr is the network or link layer address; Endpoint.Connect treats an empty Addr as a disconnect.
+	Addr Address
+
+	// Port is the transport port.
+	//
+	// This may not be used by all endpoint types.
+	Port uint16
+}
+
+// Payloader is an interface that provides the data passed to Endpoint.Write.
+//
+// This interface allows the endpoint to request the amount of data it needs
+// based on internal buffers without exposing them.
+type Payloader interface {
+	// FullPayload returns all available bytes.
+	FullPayload() ([]byte, *Error)
+
+	// Payload returns a slice containing at most size bytes.
+	Payload(size int) ([]byte, *Error)
+}
+
+// SlicePayload implements Payloader on top of a plain byte slice.
+//
+// This is typically used for tests.
+type SlicePayload []byte
+
+// FullPayload implements Payloader.FullPayload; it returns the whole slice.
+func (s SlicePayload) FullPayload() ([]byte, *Error) {
+	return []byte(s), nil
+}
+
+// Payload implements Payloader.Payload; it returns the first size bytes, or all of s if shorter.
+func (s SlicePayload) Payload(size int) ([]byte, *Error) {
+	if size < len(s) {
+		return s[:size], nil
+	}
+	return s, nil
+}
+
+// A ControlMessages contains socket control messages for IP sockets.
+//
+// +stateify savable
+type ControlMessages struct {
+	// HasTimestamp indicates whether Timestamp is valid/set.
+	HasTimestamp bool
+
+	// Timestamp is the time (in ns) that the last packet used to create
+	// the read data was received.
+	Timestamp int64
+
+	// HasInq indicates whether Inq is valid/set.
+	HasInq bool
+
+	// Inq is the number of bytes ready to be received.
+	Inq int32
+
+	// HasTOS indicates whether TOS is valid/set.
+	HasTOS bool
+
+	// TOS is the IPv4 type of service of the associated packet.
+	TOS uint8
+
+	// HasTClass indicates whether TClass is valid/set.
+	HasTClass bool
+
+	// TClass is the IPv6 traffic class of the associated packet.
+	TClass uint32
+
+	// HasIPPacketInfo indicates whether PacketInfo is valid/set.
+	HasIPPacketInfo bool
+
+	// PacketInfo holds interface and address data on an incoming packet.
+	PacketInfo IPPacketInfo
+}
+
+// PacketOwner is used to get the UID and GID of the packet.
+type PacketOwner interface {
+	// UID returns the UID of the packet.
+	UID() uint32
+
+	// GID returns the GID of the packet.
+	GID() uint32
+}
+
+// Endpoint is the interface implemented by transport protocols (e.g., tcp, udp)
+// that exposes functionality like read, write, connect, etc. to users of the
+// networking stack.
+type Endpoint interface {
+	// Close puts the endpoint in a closed state and frees all resources
+	// associated with it. Close initiates the teardown process, the
+	// Endpoint may not be fully closed when Close returns.
+	Close()
+
+	// Abort initiates an expedited endpoint teardown. As compared to
+	// Close, Abort prioritizes closing the Endpoint quickly over cleanly.
+	// Abort is best effort; implementing Abort with Close is acceptable.
+	Abort()
+
+	// Read reads data from the endpoint and optionally returns the sender.
+	//
+	// This method does not block if there is no data pending. It will also
+	// either return an error or data, never both.
+	Read(*FullAddress) (buffer.View, ControlMessages, *Error)
+
+	// Write writes data to the endpoint's peer. This method does not block if
+	// the data cannot be written.
+	//
+	// Unlike io.Writer.Write, Endpoint.Write transfers ownership of any bytes
+	// successfully written to the Endpoint. That is, if a call to
+	// Write(SlicePayload{data}) returns (n, err), it may retain data[:n], and
+	// the caller should not use data[:n] after Write returns.
+	//
+	// Note that unlike io.Writer.Write, it is not an error for Write to
+	// perform a partial write (if n > 0, no error may be returned). Only
+	// stream (TCP) Endpoints may return partial writes, and even then only
+	// in the case where writing additional data would block. Other Endpoints
+	// will either write the entire message or return an error.
+	//
+	// For UDP and Ping sockets if address resolution is required,
+	// ErrNoLinkAddress and a notification channel are returned for the caller
+	// to block. Channel is closed once address resolution is complete (success
+	// or not). The channel is only non-nil in this case.
+	Write(Payloader, WriteOptions) (int64, <-chan struct{}, *Error)
+
+	// Peek reads data without consuming it from the endpoint.
+	//
+	// This method does not block if there is no data pending.
+	Peek([][]byte) (int64, ControlMessages, *Error)
+
+	// Connect connects the endpoint to its peer. Specifying a NIC is
+	// optional.
+	//
+	// There are three classes of return values:
+	//	nil -- the attempt to connect succeeded.
+	//	ErrConnectStarted/ErrAlreadyConnecting -- the connect attempt started
+	//		but hasn't completed yet. In this case, the caller must call Connect
+	//		or GetSockOpt(ErrorOption) when the endpoint becomes writable to
+	//		get the actual result. The first call to Connect after the socket has
+	//		connected returns nil. Calling connect again results in ErrAlreadyConnected.
+	//	Anything else -- the attempt to connect failed.
+	//
+	// If address.Addr is empty, this means that the Endpoint has to be
+	// disconnected if this is supported, otherwise
+	// ErrAddressFamilyNotSupported must be returned.
+	Connect(address FullAddress) *Error
+
+	// Disconnect disconnects the endpoint from its peer.
+	Disconnect() *Error
+
+	// Shutdown closes the read and/or write end of the endpoint connection
+	// to its peer.
+	Shutdown(flags ShutdownFlags) *Error
+
+	// Listen puts the endpoint in "listen" mode, which allows it to accept
+	// new connections.
+	Listen(backlog int) *Error
+
+	// Accept returns a new endpoint if a peer has established a connection
+	// to an endpoint previously set to listen mode. This method does not
+	// block if no new connections are available.
+	//
+	// The returned Queue is the wait queue for the newly created endpoint.
+	Accept() (Endpoint, *waiter.Queue, *Error)
+
+	// Bind binds the endpoint to a specific local address and port.
+	// Specifying a NIC is optional.
+	Bind(address FullAddress) *Error
+
+	// GetLocalAddress returns the address to which the endpoint is bound.
+	GetLocalAddress() (FullAddress, *Error)
+
+	// GetRemoteAddress returns the address to which the endpoint is
+	// connected.
+	GetRemoteAddress() (FullAddress, *Error)
+
+	// Readiness returns the current readiness of the endpoint. For example,
+	// if waiter.EventIn is set, the endpoint is immediately readable.
+	Readiness(mask waiter.EventMask) waiter.EventMask
+
+	// SetSockOpt sets a socket option. opt should be one of the *Option types.
+	SetSockOpt(opt interface{}) *Error
+
+	// SetSockOptBool sets a socket option, for simple cases where a value
+	// has the bool type.
+	SetSockOptBool(opt SockOptBool, v bool) *Error
+
+	// SetSockOptInt sets a socket option, for simple cases where a value
+	// has the int type.
+	SetSockOptInt(opt SockOptInt, v int) *Error
+
+	// GetSockOpt gets a socket option. opt should be a pointer to one of the
+	// *Option types.
+	GetSockOpt(opt interface{}) *Error
+
+	// GetSockOptBool gets a socket option for simple cases where a return
+	// value has the bool type.
+	GetSockOptBool(SockOptBool) (bool, *Error)
+
+	// GetSockOptInt gets a socket option for simple cases where a return
+	// value has the int type.
+	GetSockOptInt(SockOptInt) (int, *Error)
+
+	// State returns a socket's lifecycle state. The returned value is
+	// protocol-specific and is primarily used for diagnostics.
+	State() uint32
+
+	// ModerateRecvBuf should be called every time data is copied to user
+	// space. This allows for dynamic tuning of recv buffer space for a
+	// given socket.
+	//
+	// NOTE: This method is a no-op for sockets other than TCP.
+	ModerateRecvBuf(copied int)
+
+	// Info returns a copy of the transport endpoint info.
+	Info() EndpointInfo
+
+	// Stats returns a reference to the endpoint stats.
+	Stats() EndpointStats
+
+	// SetOwner sets the PacketOwner associated with the endpoint.
+	SetOwner(owner PacketOwner)
+}
+
+// EndpointInfo is the marker interface implemented by each endpoint info struct.
+type EndpointInfo interface {
+	// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
+	// marker interface.
+	IsEndpointInfo()
+}
+
+// EndpointStats is the marker interface implemented by each endpoint stats struct.
+type EndpointStats interface {
+	// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
+	// marker interface.
+	IsEndpointStats()
+}
+
+// WriteOptions contains options for Endpoint.Write.
+type WriteOptions struct {
+	// If To is not nil, write to the given address instead of the endpoint's
+	// peer.
+	To *FullAddress
+
+	// More has the same semantics as Linux's MSG_MORE.
+	More bool
+
+	// EndOfRecord has the same semantics as Linux's MSG_EOR.
+	EndOfRecord bool
+
+	// Atomic means that all data fetched from Payloader must be written to the
+	// endpoint. If Atomic is false, then data fetched from the Payloader may be
+	// discarded if available endpoint buffer space is insufficient.
+	Atomic bool
+}
+
+// SockOptBool represents socket options whose values have the bool type.
+type SockOptBool int
+
+const (
+	// BroadcastOption is used by SetSockOptBool/GetSockOptBool to specify
+	// whether datagram sockets are allowed to send packets to a broadcast
+	// address.
+	BroadcastOption SockOptBool = iota
+
+	// CorkOption is used by SetSockOptBool/GetSockOptBool to specify if
+	// data should be held until segments are full by the TCP transport
+	// protocol.
+	CorkOption
+
+	// DelayOption is used by SetSockOptBool/GetSockOptBool to specify if
+	// data should be sent out immediately by the transport protocol. For
+	// TCP, it determines if the Nagle algorithm is on or off.
+	DelayOption
+
+	// KeepaliveEnabledOption is used by SetSockOptBool/GetSockOptBool to
+	// specify whether TCP keepalive is enabled for this socket.
+	KeepaliveEnabledOption
+
+	// MulticastLoopOption is used by SetSockOptBool/GetSockOptBool to
+	// specify whether multicast packets sent over a non-loopback interface
+	// will be looped back.
+	MulticastLoopOption
+
+	// NoChecksumOption is used by SetSockOptBool/GetSockOptBool to specify
+	// whether UDP checksum is disabled for this socket.
+	NoChecksumOption
+
+	// PasscredOption is used by SetSockOptBool/GetSockOptBool to specify
+	// whether SCM_CREDENTIALS socket control messages are enabled.
+	//
+	// Only supported on Unix sockets.
+	PasscredOption
+
+	// QuickAckOption is stubbed out in SetSockOptBool/GetSockOptBool.
+	QuickAckOption
+
+	// ReceiveTClassOption is used by SetSockOptBool/GetSockOptBool to
+	// specify if the IPV6_TCLASS ancillary message is passed with incoming
+	// packets.
+	ReceiveTClassOption
+
+	// ReceiveTOSOption is used by SetSockOptBool/GetSockOptBool to specify
+	// if the TOS ancillary message is passed with incoming packets.
+	ReceiveTOSOption
+
+	// ReceiveIPPacketInfoOption is used by SetSockOptBool/GetSockOptBool to
+	// specify if more information is provided with incoming packets such as
+	// interface index and address.
+	ReceiveIPPacketInfoOption
+
+	// ReuseAddressOption is used by SetSockOptBool/GetSockOptBool to
+	// specify whether Bind() should allow reuse of local address.
+	ReuseAddressOption
+
+	// ReusePortOption is used by SetSockOptBool/GetSockOptBool to permit
+	// multiple sockets to be bound to an identical socket address.
+	ReusePortOption
+
+	// V6OnlyOption is used by SetSockOptBool/GetSockOptBool to specify
+	// whether an IPv6 socket is to be restricted to sending and receiving
+	// IPv6 packets only.
+	V6OnlyOption
+
+	// IPHdrIncludedOption is used by SetSockOptBool to indicate for a raw
+	// endpoint that all packets being written have an IP header and the
+	// endpoint should not attach an IP header.
+	IPHdrIncludedOption
+)
+
+// SockOptInt represents socket options whose values have the int type.
+type SockOptInt int
+
+const (
+	// KeepaliveCountOption is used by SetSockOptInt/GetSockOptInt to
+	// specify the number of un-ACKed TCP keepalives that will be sent
+	// before the connection is closed.
+	KeepaliveCountOption SockOptInt = iota
+
+	// IPv4TOSOption is used by SetSockOptInt/GetSockOptInt to specify TOS
+	// for all subsequent outgoing IPv4 packets from the endpoint.
+	IPv4TOSOption
+
+	// IPv6TrafficClassOption is used by SetSockOptInt/GetSockOptInt to
+	// specify TOS for all subsequent outgoing IPv6 packets from the
+	// endpoint.
+	IPv6TrafficClassOption
+
+	// MaxSegOption is used by SetSockOptInt/GetSockOptInt to set/get the
+	// current Maximum Segment Size(MSS) value as specified using the
+	// TCP_MAXSEG option.
+	MaxSegOption
+
+	// MTUDiscoverOption is used to set/get the path MTU discovery setting.
+	//
+	// NOTE: Setting this option to any other value than PMTUDiscoveryDont
+	// is not supported and will fail as such, and getting this option will
+	// always return PMTUDiscoveryDont.
+	MTUDiscoverOption
+
+	// MulticastTTLOption is used by SetSockOptInt/GetSockOptInt to control
+	// the default TTL value for multicast messages. The default is 1.
+	MulticastTTLOption
+
+	// ReceiveQueueSizeOption is used in GetSockOptInt to specify that the
+	// number of unread bytes in the input buffer should be returned.
+	ReceiveQueueSizeOption
+
+	// SendBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
+	// specify the send buffer size option.
+	SendBufferSizeOption
+
+	// ReceiveBufferSizeOption is used by SetSockOptInt/GetSockOptInt to
+	// specify the receive buffer size option.
+	ReceiveBufferSizeOption
+
+	// SendQueueSizeOption is used in GetSockOptInt to specify that the
+	// number of unread bytes in the output buffer should be returned.
+	SendQueueSizeOption
+
+	// TTLOption is used by SetSockOptInt/GetSockOptInt to control the
+	// default TTL/hop limit value for unicast messages. The default is
+	// protocol specific.
+	//
+	// A zero value indicates the default.
+	TTLOption
+
+	// TCPSynCountOption is used by SetSockOptInt/GetSockOptInt to specify
+	// the number of SYN retransmits that TCP should send before aborting
+	// the attempt to connect. It cannot exceed 255.
+	//
+	// NOTE: This option is currently only stubbed out and is a no-op.
+	TCPSynCountOption
+
+	// TCPWindowClampOption is used by SetSockOptInt/GetSockOptInt to bound
+	// the size of the advertised window to this value.
+	//
+	// NOTE: This option is currently only stubbed out and is a no-op.
+	TCPWindowClampOption
+)
+
+const (
+	// PMTUDiscoveryWant is a setting of the MTUDiscoverOption to use
+	// per-route settings.
+	PMTUDiscoveryWant int = iota
+
+	// PMTUDiscoveryDont is a setting of the MTUDiscoverOption to disable
+	// path MTU discovery; it is currently the only supported setting.
+	PMTUDiscoveryDont
+
+	// PMTUDiscoveryDo is a setting of the MTUDiscoverOption to always do
+	// path MTU discovery.
+	PMTUDiscoveryDo
+
+	// PMTUDiscoveryProbe is a setting of the MTUDiscoverOption to set DF
+	// but ignore path MTU.
+	PMTUDiscoveryProbe
+)
+
+// ErrorOption is used in GetSockOpt to specify that the last error reported by
+// the endpoint should be cleared and returned.
+type ErrorOption struct{}
+
+// BindToDeviceOption is used by SetSockOpt/GetSockOpt to specify that sockets
+// should bind only on a specific NIC.
+type BindToDeviceOption NICID
+
+// TCPInfoOption is used by GetSockOpt to expose TCP statistics.
+//
+// TODO(b/64800844): Add and populate stat fields.
+type TCPInfoOption struct {
+	RTT    time.Duration
+	RTTVar time.Duration
+}
+
+// KeepaliveIdleOption is used by SetSockOpt/GetSockOpt to specify the time a
+// connection must remain idle before the first TCP keepalive packet is sent.
+// Once this time is reached, KeepaliveIntervalOption is used instead.
+type KeepaliveIdleOption time.Duration
+
+// KeepaliveIntervalOption is used by SetSockOpt/GetSockOpt to specify the
+// interval between sending TCP keepalive packets.
+type KeepaliveIntervalOption time.Duration
+
+// TCPUserTimeoutOption is used by SetSockOpt/GetSockOpt to specify a user
+// specified timeout for a given TCP connection.
+// See: RFC5482 for details.
+type TCPUserTimeoutOption time.Duration
+
+// CongestionControlOption is used by SetSockOpt/GetSockOpt to set/get
+// the current congestion control algorithm.
+type CongestionControlOption string
+
+// AvailableCongestionControlOption is used to query the supported congestion
+// control algorithms.
+type AvailableCongestionControlOption string
+
+// ModerateReceiveBufferOption is a bool option related to receive buffer moderation.
+type ModerateReceiveBufferOption bool
+
+// TCPLingerTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum duration for which a socket lingers in the TCP_FIN_WAIT_2 state
+// before being marked closed.
+type TCPLingerTimeoutOption time.Duration
+
+// TCPTimeWaitTimeoutOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum duration for which a socket lingers in the TIME_WAIT state
+// before being marked closed.
+type TCPTimeWaitTimeoutOption time.Duration
+
+// TCPDeferAcceptOption is used by SetSockOpt/GetSockOpt to allow an
+// accept to return a completed connection only when there is data to be
+// read. This usually means the listening socket will drop the final ACK
+// for a handshake till the specified timeout until a segment with data arrives.
+type TCPDeferAcceptOption time.Duration
+
+// TCPMinRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
+// default MinRTO used by the Stack.
+type TCPMinRTOOption time.Duration
+
+// TCPMaxRTOOption is used by SetSockOpt/GetSockOpt to allow overriding
+// default MaxRTO used by the Stack.
+type TCPMaxRTOOption time.Duration
+
+// TCPMaxRetriesOption is used by SetSockOpt/GetSockOpt to set/get the
+// maximum number of retransmits after which we time out the connection.
+type TCPMaxRetriesOption uint64
+
+// TCPSynRcvdCountThresholdOption is used by SetSockOpt/GetSockOpt to specify
+// the number of endpoints that can be in SYN-RCVD state before the stack
+// switches to using SYN cookies.
+type TCPSynRcvdCountThresholdOption uint64
+
+// TCPSynRetriesOption is used by SetSockOpt/GetSockOpt to specify stack-wide
+// default for number of times SYN is retransmitted before aborting a connect.
+type TCPSynRetriesOption uint8
+
+// MulticastInterfaceOption is used by SetSockOpt/GetSockOpt to specify a
+// default interface for multicast.
+type MulticastInterfaceOption struct {
+	NIC           NICID
+	InterfaceAddr Address
+}
+
+// MembershipOption is used by SetSockOpt/GetSockOpt as an argument to
+// AddMembershipOption and RemoveMembershipOption.
+type MembershipOption struct {
+	NIC           NICID
+	InterfaceAddr Address
+	MulticastAddr Address
+}
+
+// AddMembershipOption is used by SetSockOpt/GetSockOpt to join a multicast
+// group identified by the given multicast address, on the interface matching
+// the given interface address.
+type AddMembershipOption MembershipOption
+
+// RemoveMembershipOption is used by SetSockOpt/GetSockOpt to leave a multicast
+// group identified by the given multicast address, on the interface matching
+// the given interface address.
+type RemoveMembershipOption MembershipOption
+
+// OutOfBandInlineOption is used by SetSockOpt/GetSockOpt to specify whether
+// TCP out-of-band data is delivered along with the normal in-band data.
+type OutOfBandInlineOption int
+
+// DefaultTTLOption is used by stack.(*Stack).NetworkProtocolOption to specify
+// a default TTL.
+type DefaultTTLOption uint8
+
+// IPPacketInfo is the message structure for IP_PKTINFO; ControlMessages carries
+// it in PacketInfo when HasIPPacketInfo is set.
+//
+// +stateify savable
+type IPPacketInfo struct {
+	// NIC is the ID of the NIC to be used.
+	NIC NICID
+
+	// LocalAddr is the local address.
+	LocalAddr Address
+
+	// DestinationAddr is the destination address.
+	DestinationAddr Address
+}
+
+// Route is a row in the routing table. It specifies through which NIC (and
+// gateway) sets of packets should be routed. A row is considered viable if the
+// masked target address matches the destination address in the row.
+type Route struct {
+	// Destination must contain the target address for this row to be viable.
+	Destination Subnet
+
+	// Gateway is the gateway to be used if this row is viable; it may be empty.
+	Gateway Address
+
+	// NIC is the ID of the NIC to be used if this row is viable.
+	NIC NICID
+}
+
+// String implements the fmt.Stringer interface: "<dest>[ via <gateway>] nic <id>".
+func (r Route) String() string {
+	var b strings.Builder
+	b.WriteString(r.Destination.String())
+	if r.Gateway != "" {
+		fmt.Fprintf(&b, " via %s", r.Gateway)
+	}
+	fmt.Fprintf(&b, " nic %d", r.NIC)
+	return b.String()
+}
+
+// TransportProtocolNumber is the number that identifies a transport protocol.
+type TransportProtocolNumber uint32
+
+// NetworkProtocolNumber is the number that identifies a network protocol.
+type NetworkProtocolNumber uint32
+
+// A StatCounter keeps track of a statistic; all accesses to it are atomic.
+type StatCounter struct {
+	count uint64
+}
+
+// Increment adds one to the counter.
+func (s *StatCounter) Increment() {
+	atomic.AddUint64(&s.count, 1)
+}
+
+// Decrement subtracts one from the counter (adding ^0 wraps around to -1).
+func (s *StatCounter) Decrement() {
+	atomic.AddUint64(&s.count, ^uint64(0))
+}
+
+// Value returns the current value of the counter.
+func (s *StatCounter) Value() uint64 {
+	return atomic.LoadUint64(&s.count)
+}
+
+// IncrementBy increments the counter by v.
+func (s *StatCounter) IncrementBy(v uint64) {
+	atomic.AddUint64(&s.count, v)
+}
+
+func (s *StatCounter) String() string {
+	return strconv.FormatUint(atomic.LoadUint64(&s.count), 10)
+}
+
+// ICMPv4PacketStats enumerates counts for all ICMPv4 packet types, one counter per type.
+type ICMPv4PacketStats struct {
+	// Echo is the total number of ICMPv4 echo packets counted.
+	Echo *StatCounter
+
+	// EchoReply is the total number of ICMPv4 echo reply packets counted.
+	EchoReply *StatCounter
+
+	// DstUnreachable is the total number of ICMPv4 destination unreachable
+	// packets counted.
+	DstUnreachable *StatCounter
+
+	// SrcQuench is the total number of ICMPv4 source quench packets
+	// counted.
+	SrcQuench *StatCounter
+
+	// Redirect is the total number of ICMPv4 redirect packets counted.
+	Redirect *StatCounter
+
+	// TimeExceeded is the total number of ICMPv4 time exceeded packets
+	// counted.
+	TimeExceeded *StatCounter
+
+	// ParamProblem is the total number of ICMPv4 parameter problem packets
+	// counted.
+	ParamProblem *StatCounter
+
+	// Timestamp is the total number of ICMPv4 timestamp packets counted.
+	Timestamp *StatCounter
+
+	// TimestampReply is the total number of ICMPv4 timestamp reply packets
+	// counted.
+	TimestampReply *StatCounter
+
+	// InfoRequest is the total number of ICMPv4 information request
+	// packets counted.
+	InfoRequest *StatCounter
+
+	// InfoReply is the total number of ICMPv4 information reply packets
+	// counted.
+	InfoReply *StatCounter
+}
+
+// ICMPv6PacketStats enumerates counts for all ICMPv6 packet types.
+type ICMPv6PacketStats struct {
+	// EchoRequest is the total number of ICMPv6 echo request packets
+	// counted.
+	EchoRequest *StatCounter
+
+	// EchoReply is the total number of ICMPv6 echo reply packets counted.
+	EchoReply *StatCounter
+
+	// DstUnreachable is the total number of ICMPv6 destination unreachable
+	// packets counted.
+	DstUnreachable *StatCounter
+
+	// PacketTooBig is the total number of ICMPv6 packet too big packets
+	// counted.
+	PacketTooBig *StatCounter
+
+	// TimeExceeded is the total number of ICMPv6 time exceeded packets
+	// counted.
+	TimeExceeded *StatCounter
+
+	// ParamProblem is the total number of ICMPv6 parameter problem packets
+	// counted.
+	ParamProblem *StatCounter
+
+	// RouterSolicit is the total number of ICMPv6 router solicit packets
+	// counted.
+	RouterSolicit *StatCounter
+
+	// RouterAdvert is the total number of ICMPv6 router advert packets
+	// counted.
+	RouterAdvert *StatCounter
+
+	// NeighborSolicit is the total number of ICMPv6 neighbor solicit
+	// packets counted.
+	NeighborSolicit *StatCounter
+
+	// NeighborAdvert is the total number of ICMPv6 neighbor advert packets
+	// counted.
+	NeighborAdvert *StatCounter
+
+	// RedirectMsg is the total number of ICMPv6 redirect message packets
+	// counted.
+	RedirectMsg *StatCounter
+}
+
+// ICMPv4SentPacketStats collects outbound ICMPv4-specific stats.
+type ICMPv4SentPacketStats struct {
+	ICMPv4PacketStats
+
+	// Dropped is the total number of ICMPv4 packets dropped due to link
+	// layer errors.
+	Dropped *StatCounter
+
+	// RateLimited is the total number of ICMPv4 packets dropped due to
+	// rate limit being exceeded.
+	RateLimited *StatCounter
+}
+
+// ICMPv4ReceivedPacketStats collects inbound ICMPv4-specific stats.
+type ICMPv4ReceivedPacketStats struct {
+	ICMPv4PacketStats
+
+	// Invalid is the total number of ICMPv4 packets received that the
+	// transport layer could not parse.
+	Invalid *StatCounter
+}
+
+// ICMPv6SentPacketStats collects outbound ICMPv6-specific stats.
+type ICMPv6SentPacketStats struct {
+	ICMPv6PacketStats
+
+	// Dropped is the total number of ICMPv6 packets dropped due to link
+	// layer errors.
+	Dropped *StatCounter
+
+	// RateLimited is the total number of ICMPv6 packets dropped due to
+	// rate limit being exceeded.
+	RateLimited *StatCounter
+}
+
+// ICMPv6ReceivedPacketStats collects inbound ICMPv6-specific stats.
+type ICMPv6ReceivedPacketStats struct {
+	ICMPv6PacketStats
+
+	// Invalid is the total number of ICMPv6 packets received that the
+	// transport layer could not parse.
+	Invalid *StatCounter
+}
+
+// ICMPStats collects ICMP-specific stats (both v4 and v6).
+type ICMPStats struct {
+	// V4PacketsSent contains counts of sent packets by ICMPv4 packet type
+	// and counts of packets dropped due to link layer errors or rate
+	// limiting.
+	V4PacketsSent ICMPv4SentPacketStats
+
+	// V4PacketsReceived contains counts of received packets by ICMPv4
+	// packet type and a single count of invalid packets received.
+	V4PacketsReceived ICMPv4ReceivedPacketStats
+
+	// V6PacketsSent contains counts of sent packets by ICMPv6 packet type
+	// and counts of packets dropped due to link layer errors or rate
+	// limiting.
+	V6PacketsSent ICMPv6SentPacketStats
+
+	// V6PacketsReceived contains counts of received packets by ICMPv6
+	// packet type and a single count of invalid packets received.
+	V6PacketsReceived ICMPv6ReceivedPacketStats
+}
+
+// IPStats collects IP-specific stats (both v4 and v6).
+type IPStats struct {
+	// PacketsReceived is the total number of IP packets received from the
+	// link layer in nic.DeliverNetworkPacket.
+	PacketsReceived *StatCounter
+
+	// InvalidDestinationAddressesReceived is the total number of IP packets
+	// received with an unknown or invalid destination address.
+	InvalidDestinationAddressesReceived *StatCounter
+
+	// InvalidSourceAddressesReceived is the total number of IP packets received
+	// with a source address that should never have been received on the wire.
+	InvalidSourceAddressesReceived *StatCounter
+
+	// PacketsDelivered is the total number of incoming IP packets that
+	// are successfully delivered to the transport layer via HandlePacket.
+	PacketsDelivered *StatCounter
+
+	// PacketsSent is the total number of IP packets sent via WritePacket.
+	PacketsSent *StatCounter
+
+	// OutgoingPacketErrors is the total number of IP packets which failed
+	// to write to a link-layer endpoint.
+	OutgoingPacketErrors *StatCounter
+
+	// MalformedPacketsReceived is the total number of IP Packets that were
+	// dropped due to the IP packet header failing validation checks.
+	MalformedPacketsReceived *StatCounter
+
+	// MalformedFragmentsReceived is the total number of IP Fragments that were
+	// dropped due to the fragment failing validation checks.
+	MalformedFragmentsReceived *StatCounter
+}
+
+// TCPStats collects TCP-specific stats.
+type TCPStats struct {
+	// ActiveConnectionOpenings is the number of connections opened
+	// successfully via Connect.
+	ActiveConnectionOpenings *StatCounter
+
+	// PassiveConnectionOpenings is the number of connections opened
+	// successfully via Listen.
+	PassiveConnectionOpenings *StatCounter
+
+	// CurrentEstablished is the number of TCP connections for which the
+	// current state is ESTABLISHED.
+	CurrentEstablished *StatCounter
+
+	// CurrentConnected is the number of TCP connections that
+	// are in connected state.
+	CurrentConnected *StatCounter
+
+	// EstablishedResets is the number of times TCP connections have made
+	// a direct transition to the CLOSED state from either the
+	// ESTABLISHED state or the CLOSE-WAIT state.
+	EstablishedResets *StatCounter
+
+	// EstablishedClosed is the number of times established TCP connections
+	// made a transition to CLOSED state.
+	EstablishedClosed *StatCounter
+
+	// EstablishedTimedout is the number of times an established connection
+	// was reset because of keep-alive time out.
+	EstablishedTimedout *StatCounter
+
+	// ListenOverflowSynDrop is the number of times the listen queue overflowed
+	// and a SYN was dropped.
+	ListenOverflowSynDrop *StatCounter
+
+	// ListenOverflowAckDrop is the number of times the final ACK
+	// in the handshake was dropped due to overflow.
+	ListenOverflowAckDrop *StatCounter
+
+	// ListenOverflowSynCookieSent is the number of times a SYN cookie was sent.
+	ListenOverflowSynCookieSent *StatCounter
+
+	// ListenOverflowSynCookieRcvd is the number of times a valid SYN
+	// cookie was received.
+	ListenOverflowSynCookieRcvd *StatCounter
+
+	// ListenOverflowInvalidSynCookieRcvd is the number of times an invalid SYN cookie
+	// was received.
+	ListenOverflowInvalidSynCookieRcvd *StatCounter
+
+	// FailedConnectionAttempts is the number of calls to Connect or Listen
+	// (active and passive openings, respectively) that end in an error.
+	FailedConnectionAttempts *StatCounter
+
+	// ValidSegmentsReceived is the number of TCP segments received that
+	// the transport layer successfully parsed.
+	ValidSegmentsReceived *StatCounter
+
+	// InvalidSegmentsReceived is the number of TCP segments received that
+	// the transport layer could not parse.
+	InvalidSegmentsReceived *StatCounter
+
+	// SegmentsSent is the number of TCP segments sent.
+	SegmentsSent *StatCounter
+
+	// SegmentSendErrors is the number of TCP segments that failed to be sent.
+	SegmentSendErrors *StatCounter
+
+	// ResetsSent is the number of TCP resets sent.
+	ResetsSent *StatCounter
+
+	// ResetsReceived is the number of TCP resets received.
+	ResetsReceived *StatCounter
+
+	// Retransmits is the number of TCP segments retransmitted.
+	Retransmits *StatCounter
+
+	// FastRecovery is the number of times Fast Recovery was used to
+	// recover from packet loss.
+	FastRecovery *StatCounter
+
+	// SACKRecovery is the number of times SACK Recovery was used to
+	// recover from packet loss.
+	SACKRecovery *StatCounter
+
+	// SlowStartRetransmits is the number of segments retransmitted in slow
+	// start.
+	SlowStartRetransmits *StatCounter
+
+	// FastRetransmit is the number of segments retransmitted in fast
+	// recovery.
+	FastRetransmit *StatCounter
+
+	// Timeouts is the number of times the RTO expired.
+	Timeouts *StatCounter
+
+	// ChecksumErrors is the number of segments dropped due to bad checksums.
+	ChecksumErrors *StatCounter
+}
+
+// UDPStats collects UDP-specific stats.
+type UDPStats struct {
+	// PacketsReceived is the number of UDP datagrams received via
+	// HandlePacket.
+	PacketsReceived *StatCounter
+
+	// UnknownPortErrors is the number of incoming UDP datagrams dropped
+	// because they did not have a known destination port.
+	UnknownPortErrors *StatCounter
+
+	// ReceiveBufferErrors is the number of incoming UDP datagrams dropped
+	// due to the receiving buffer being in an invalid state.
+	ReceiveBufferErrors *StatCounter
+
+	// MalformedPacketsReceived is the number of incoming UDP datagrams
+	// dropped due to the UDP header being in a malformed state.
+	MalformedPacketsReceived *StatCounter
+
+	// PacketsSent is the number of UDP datagrams sent via sendUDP.
+	PacketsSent *StatCounter
+
+	// PacketSendErrors is the number of datagrams that failed to be sent.
+	PacketSendErrors *StatCounter
+
+	// ChecksumErrors is the number of datagrams dropped due to bad checksums.
+	ChecksumErrors *StatCounter
+}
+
+// Stats holds statistics about the networking stack.
+//
+// All fields are optional.
+type Stats struct {
+	// UnknownProtocolRcvdPackets is the number of packets received by the
+	// stack that were for an unknown or unsupported protocol.
+	UnknownProtocolRcvdPackets *StatCounter
+
+	// MalformedRcvdPackets is the number of packets received by the stack
+	// that were deemed malformed.
+	MalformedRcvdPackets *StatCounter
+
+	// DroppedPackets is the number of packets dropped due to full queues.
+	DroppedPackets *StatCounter
+
+	// ICMP breaks out ICMP-specific stats (both v4 and v6).
+	ICMP ICMPStats
+
+	// IP breaks out IP-specific stats (both v4 and v6).
+	IP IPStats
+
+	// TCP breaks out TCP-specific stats.
+	TCP TCPStats
+
+	// UDP breaks out UDP-specific stats.
+	UDP UDPStats
+}
+
+// ReceiveErrors collects packet receive errors within transport endpoint.
+type ReceiveErrors struct {
+	// ReceiveBufferOverflow is the number of received packets dropped
+	// due to the receive buffer being full.
+	ReceiveBufferOverflow StatCounter
+
+	// MalformedPacketsReceived is the number of incoming packets
+	// dropped due to the packet header being in a malformed state.
+	MalformedPacketsReceived StatCounter
+
+	// ClosedReceiver is the number of received packets dropped because
+	// of receiving endpoint state being closed.
+	ClosedReceiver StatCounter
+
+	// ChecksumErrors is the number of packets dropped due to bad checksums.
+	ChecksumErrors StatCounter
+}
+
+// SendErrors collects packet send errors within the transport layer for
+// an endpoint.
+type SendErrors struct {
+	// SendToNetworkFailed is the number of packets failed to be written to
+	// the network endpoint.
+	SendToNetworkFailed StatCounter
+
+	// NoRoute is the number of times we failed to resolve IP route.
+	NoRoute StatCounter
+
+	// NoLinkAddr is the number of times we failed to resolve ARP.
+	NoLinkAddr StatCounter
+}
+
+// ReadErrors collects segment read errors from an endpoint read call.
+type ReadErrors struct {
+	// ReadClosed is the number of received packet drops because the endpoint
+	// was shutdown for read.
+	ReadClosed StatCounter
+
+	// InvalidEndpointState is the number of times we found the endpoint state
+	// to be unexpected.
+	InvalidEndpointState StatCounter
+
+	// NotConnected is the number of times we tried to read but found that the
+	// endpoint was not connected.
+	NotConnected StatCounter
+}
+
+// WriteErrors collects packet write errors from an endpoint write call.
+type WriteErrors struct {
+	// WriteClosed is the number of packet drops because the endpoint
+	// was shutdown for write.
+	WriteClosed StatCounter
+
+	// InvalidEndpointState is the number of times we found the endpoint state
+	// to be unexpected.
+	InvalidEndpointState StatCounter
+
+	// InvalidArgs is the number of times invalid input arguments were
+	// provided for endpoint Write call.
+	InvalidArgs StatCounter
+}
+
+// TransportEndpointStats collects statistics about the endpoint.
+type TransportEndpointStats struct {
+	// PacketsReceived is the number of successful packet receives.
+	PacketsReceived StatCounter
+
+	// PacketsSent is the number of successful packet sends.
+	PacketsSent StatCounter
+
+	// ReceiveErrors collects packet receive errors within transport layer.
+	ReceiveErrors ReceiveErrors
+
+	// ReadErrors collects packet read errors from an endpoint read call.
+	ReadErrors ReadErrors
+
+	// SendErrors collects packet send errors within the transport layer.
+	SendErrors SendErrors
+
+	// WriteErrors collects packet write errors from an endpoint write call.
+	WriteErrors WriteErrors
+}
+
+// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
+// marker interface.
+func (*TransportEndpointStats) IsEndpointStats() {}
+
+// InitStatCounters recursively walks the struct v (which must be addressable)
+// and sets every nil *StatCounter field it finds to a new StatCounter.
+func InitStatCounters(v reflect.Value) {
+	for i := 0; i < v.NumField(); i++ {
+		v := v.Field(i) // Shadows the parameter with the i-th field.
+		if s, ok := v.Addr().Interface().(**StatCounter); ok {
+			if *s == nil {
+				*s = new(StatCounter)
+			}
+		} else {
+			InitStatCounters(v) // Not a *StatCounter: descend into the field.
+		}
+	}
+}
+
+// FillIn returns a copy of s with nil fields initialized to new StatCounters.
+func (s Stats) FillIn() Stats {
+	InitStatCounters(reflect.ValueOf(&s).Elem())
+	return s
+}
+
+// Clone returns a copy of the TransportEndpointStats by atomically reading
+// each field.
+func (src *TransportEndpointStats) Clone() TransportEndpointStats {
+	var dst TransportEndpointStats
+	clone(reflect.ValueOf(&dst).Elem(), reflect.ValueOf(src).Elem())
+	return dst
+}
+
+func clone(dst reflect.Value, src reflect.Value) {
+	for i := 0; i < dst.NumField(); i++ {
+		d := dst.Field(i)
+		s := src.Field(i)
+		if c, ok := s.Addr().Interface().(*StatCounter); ok {
+			d.Addr().Interface().(*StatCounter).IncrementBy(c.Value())
+		} else {
+			clone(d, s)
+		}
+	}
+}
+
+// String implements fmt.Stringer, formatting 4-byte and 16-byte addresses as IPv4 and IPv6 text.
+func (a Address) String() string {
+	switch len(a) {
+	case 4:
+		return fmt.Sprintf("%d.%d.%d.%d", int(a[0]), int(a[1]), int(a[2]), int(a[3]))
+	case 16:
+		// Find the longest run of two or more all-zero 16-bit groups; ties go to the first.
+		start, end := -1, -1
+		for i := 0; i < len(a); i += 2 {
+			j := i
+			for j < len(a) && a[j] == 0 && a[j+1] == 0 {
+				j += 2
+			}
+			if j > i+2 && j-i > end-start {
+				start, end = i, j
+			}
+		}
+
+		var b strings.Builder
+		for i := 0; i < len(a); i += 2 {
+			if i == start {
+				b.WriteString("::")
+				i = end // Resume after the compressed run.
+				if end >= len(a) {
+					break
+				}
+			} else if i > 0 {
+				b.WriteByte(':')
+			}
+			v := uint16(a[i+0])<<8 | uint16(a[i+1])
+			if v == 0 {
+				b.WriteByte('0')
+			} else {
+				const digits = "0123456789abcdef"
+				for i := uint(3); i < 4; i-- { // i = 3..0; unsigned wraparound ends the loop.
+					if v := v >> (i * 4); v != 0 { // Skips leading zero nibbles.
+						b.WriteByte(digits[v&0xf])
+					}
+				}
+			}
+		}
+		return b.String()
+	default: // Any other length is printed as plain hex.
+		return fmt.Sprintf("%x", []byte(a))
+	}
+}
+
+// To4 converts the IPv4 address to a 4-byte representation.
+// If the address is not an IPv4 address, To4 returns "".
+func (a Address) To4() Address {
+	const (
+		ipv4len = 4
+		ipv6len = 16
+	)
+	if len(a) == ipv4len {
+		return a
+	}
+	if len(a) == ipv6len &&
+		isZeros(a[0:10]) &&
+		a[10] == 0xff &&
+		a[11] == 0xff {
+		return a[12:16]
+	}
+	return ""
+}
+
+// isZeros reports whether a is all zeros.
+func isZeros(a Address) bool {
+	for i := 0; i < len(a); i++ {
+		if a[i] != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+// LinkAddress is a byte slice cast as a string that represents a link address.
+// It is typically a 6-byte MAC address.
+type LinkAddress string
+
+// String implements the fmt.Stringer interface.
+func (a LinkAddress) String() string {
+	switch len(a) {
+	case 6:
+		return fmt.Sprintf("%02x:%02x:%02x:%02x:%02x:%02x", a[0], a[1], a[2], a[3], a[4], a[5])
+	default:
+		return fmt.Sprintf("%x", []byte(a))
+	}
+}
+
+// ParseMACAddress parses an IEEE 802 address.
+//
+// It must be in the format aa:bb:cc:dd:ee:ff or aa-bb-cc-dd-ee-ff.
+func ParseMACAddress(s string) (LinkAddress, error) {
+	parts := strings.FieldsFunc(s, func(c rune) bool {
+		return c == ':' || c == '-'
+	})
+	if len(parts) != 6 {
+		return "", fmt.Errorf("inconsistent parts: %s", s)
+	}
+	addr := make([]byte, 0, len(parts))
+	for _, part := range parts {
+		u, err := strconv.ParseUint(part, 16, 8)
+		if err != nil {
+			return "", fmt.Errorf("invalid hex digits: %s", s)
+		}
+		addr = append(addr, byte(u))
+	}
+	return LinkAddress(addr), nil
+}
+
+// AddressWithPrefix is an address with its subnet prefix length.
+type AddressWithPrefix struct {
+	// Address is a network address.
+	Address Address
+
+	// PrefixLen is the subnet prefix length.
+	PrefixLen int
+}
+
+// String implements the fmt.Stringer interface.
+func (a AddressWithPrefix) String() string {
+	return fmt.Sprintf("%s/%d", a.Address, a.PrefixLen)
+}
+
+// Subnet converts the address and prefix into a Subnet value and returns it.
+func (a AddressWithPrefix) Subnet() Subnet {
+	addrLen := len(a.Address)
+	if a.PrefixLen <= 0 {
+		return Subnet{
+			address: Address(strings.Repeat("\x00", addrLen)),
+			mask:    AddressMask(strings.Repeat("\x00", addrLen)),
+		}
+	}
+	if a.PrefixLen >= addrLen*8 {
+		return Subnet{
+			address: a.Address,
+			mask:    AddressMask(strings.Repeat("\xff", addrLen)),
+		}
+	}
+
+	sa := make([]byte, addrLen)
+	sm := make([]byte, addrLen)
+	n := uint(a.PrefixLen)
+	for i := 0; i < addrLen; i++ {
+		if n >= 8 {
+			sa[i] = a.Address[i]
+			sm[i] = 0xff
+			n -= 8
+			continue
+		}
+		sm[i] = ^byte(0xff >> n)
+		sa[i] = a.Address[i] & sm[i]
+		n = 0
+	}
+
+	// For extra caution, call NewSubnet rather than directly creating the Subnet
+	// value. If that fails it indicates a serious bug in this code, so panic is
+	// in order.
+	s, err := NewSubnet(Address(sa), AddressMask(sm))
+	if err != nil {
+		panic("invalid subnet: " + err.Error())
+	}
+	return s
+}
+
+// ProtocolAddress is an address and the network protocol it is associated
+// with.
+type ProtocolAddress struct {
+	// Protocol is the protocol of the address.
+	Protocol NetworkProtocolNumber
+
+	// AddressWithPrefix is a network address with its subnet prefix length.
+	AddressWithPrefix AddressWithPrefix
+}
+
+var (
+	// danglingEndpointsMu protects access to danglingEndpoints.
+	danglingEndpointsMu sync.Mutex
+
+	// danglingEndpoints tracks all dangling endpoints no longer owned by the app.
+	danglingEndpoints = make(map[Endpoint]struct{})
+)
+
+// GetDanglingEndpoints returns all dangling endpoints.
+func GetDanglingEndpoints() []Endpoint {
+	danglingEndpointsMu.Lock()
+	es := make([]Endpoint, 0, len(danglingEndpoints))
+	for e := range danglingEndpoints {
+		es = append(es, e)
+	}
+	danglingEndpointsMu.Unlock()
+	return es
+}
+
+// AddDanglingEndpoint adds a dangling endpoint.
+func AddDanglingEndpoint(e Endpoint) {
+	danglingEndpointsMu.Lock()
+	danglingEndpoints[e] = struct{}{}
+	danglingEndpointsMu.Unlock()
+}
+
+// DeleteDanglingEndpoint removes a dangling endpoint.
+func DeleteDanglingEndpoint(e Endpoint) {
+	danglingEndpointsMu.Lock()
+	delete(danglingEndpoints, e)
+	danglingEndpointsMu.Unlock()
+}
+
+// AsyncLoading is the global barrier for asynchronous endpoint loading
+// activities.
+var AsyncLoading sync.WaitGroup
diff --git a/pkg/tcpip/tcpip_test.go b/pkg/tcpip/tcpip_test.go
new file mode 100644
index 000000000..1c8e2bc34
--- /dev/null
+++ b/pkg/tcpip/tcpip_test.go
@@ -0,0 +1,228 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import (
+	"fmt"
+	"net"
+	"strings"
+	"testing"
+)
+
+func TestSubnetContains(t *testing.T) {
+	tests := []struct {
+		s    Address
+		m    AddressMask
+		a    Address
+		want bool
+	}{
+		{"\xa0", "\xf0", "\x90", false},
+		{"\xa0", "\xf0", "\xa0", true},
+		{"\xa0", "\xf0", "\xa5", true},
+		{"\xa0", "\xf0", "\xaf", true},
+		{"\xa0", "\xf0", "\xb0", false},
+		{"\xa0", "\xf0", "", false},
+		{"\xa0", "\xf0", "\xa0\x00", false},
+		{"\xc2\x80", "\xff\xf0", "\xc2\x80", true},
+		{"\xc2\x80", "\xff\xf0", "\xc2\x00", false},
+		{"\xc2\x00", "\xff\xf0", "\xc2\x00", true},
+		{"\xc2\x00", "\xff\xf0", "\xc2\x80", false},
+	}
+	for _, tt := range tests {
+		s, err := NewSubnet(tt.s, tt.m)
+		if err != nil {
+			t.Errorf("NewSubnet(%v, %v) = %v", tt.s, tt.m, err)
+			continue
+		}
+		if got := s.Contains(tt.a); got != tt.want {
+			t.Errorf("Subnet(%v).Contains(%v) = %v, want %v", s, tt.a, got, tt.want)
+		}
+	}
+}
+
+func TestSubnetBits(t *testing.T) {
+	tests := []struct {
+		a     AddressMask
+		want1 int
+		want0 int
+	}{
+		{"\x00", 0, 8},
+		{"\x00\x00", 0, 16},
+		{"\x36", 0, 8},
+		{"\x5c", 0, 8},
+		{"\x5c\x5c", 0, 16},
+		{"\x5c\x36", 0, 16},
+		{"\x36\x5c", 0, 16},
+		{"\x36\x36", 0, 16},
+		{"\xff", 8, 0},
+		{"\xff\xff", 16, 0},
+	}
+	for _, tt := range tests {
+		s := &Subnet{mask: tt.a}
+		got1, got0 := s.Bits()
+		if got1 != tt.want1 || got0 != tt.want0 {
+			t.Errorf("Subnet{mask: %x}.Bits() = %d, %d, want %d, %d", tt.a, got1, got0, tt.want1, tt.want0)
+		}
+	}
+}
+
+// TestSubnetPrefix exercises Subnet.Prefix against a table of masks and expected results.
+func TestSubnetPrefix(t *testing.T) {
+	tests := []struct {
+		a    AddressMask
+		want int
+	}{
+		{"\x00", 0},
+		{"\x00\x00", 0},
+		{"\x36", 0},
+		{"\x86", 1},
+		{"\xc5", 2},
+		{"\xff\x00", 8},
+		{"\xff\x36", 8},
+		{"\xff\x8c", 9},
+		{"\xff\xc8", 10},
+		{"\xff", 8},
+		{"\xff\xff", 16},
+	}
+	for _, tt := range tests {
+		s := &Subnet{mask: tt.a}
+		if got := s.Prefix(); got != tt.want {
+			t.Errorf("Subnet{mask: %x}.Prefix() = %d, want %d", tt.a, got, tt.want)
+		}
+	}
+}
+
+func TestSubnetCreation(t *testing.T) {
+	tests := []struct {
+		a    Address
+		m    AddressMask
+		want error
+	}{
+		{"\xa0", "\xf0", nil},
+		{"\xa0\xa0", "\xf0", errSubnetLengthMismatch},
+		{"\xaa", "\xf0", errSubnetAddressMasked},
+		{"", "", nil},
+	}
+	for _, tt := range tests {
+		if _, err := NewSubnet(tt.a, tt.m); err != tt.want {
+			t.Errorf("NewSubnet(%v, %v) = %v, want %v", tt.a, tt.m, err, tt.want)
+		}
+	}
+}
+
+func TestAddressString(t *testing.T) {
+	for _, want := range []string{
+		// Taken from stdlib.
+		"2001:db8::123:12:1",
+		"2001:db8::1",
+		"2001:db8:0:1:0:1:0:1",
+		"2001:db8:1:0:1:0:1:0",
+		"2001::1:0:0:1",
+		"2001:db8:0:0:1::",
+		"2001:db8::1:0:0:1",
+		"2001:db8::a:b:c:d",
+
+		// Leading zeros.
+		"::1",
+		// Trailing zeros.
+		"8::",
+		// No zeros.
+		"1:1:1:1:1:1:1:1",
+		// Longer sequence is after other zeros, but not at the end.
+		"1:0:0:1::1",
+		// Longer sequence is at the beginning, shorter sequence is at
+		// the end.
+		"::1:1:1:0:0",
+		// Longer sequence is not at the beginning, shorter sequence is
+		// at the end.
+		"1::1:1:0:0",
+		// Longer sequence is at the beginning, shorter sequence is not
+		// at the end.
+		"::1:1:0:0:1",
+		// Neither sequence is at an end, longer is after shorter.
+		"1:0:0:1::1",
+		// Shorter sequence is at the beginning, longer sequence is not
+		// at the end.
+		"0:0:1:1::1",
+		// Shorter sequence is at the beginning, longer sequence is at
+		// the end.
+		"0:0:1:1:1::",
+		// Short sequences at both ends, longer one in the middle.
+		"0:1:1::1:1:0",
+		// Short sequences at both ends, longer one in the middle.
+		"0:1::1:0:0",
+		// Short sequences at both ends, longer one in the middle.
+		"0:0:1::1:0",
+		// Longer sequence surrounded by shorter sequences, but none at
+		// the end.
+		"1:0:1::1:0:1",
+	} {
+		addr := Address(net.ParseIP(want))
+		if got := addr.String(); got != want {
+			t.Errorf("Address(%x).String() = '%s', want = '%s'", addr, got, want)
+		}
+	}
+}
+
+// TestStatsString checks that formatting a filled-in Stats with %+v prints counters by value.
+func TestStatsString(t *testing.T) {
+	got := fmt.Sprintf("%+v", Stats{}.FillIn())
+	matchers := []string{
+		// Print root-level stats correctly.
+		"UnknownProtocolRcvdPackets:0",
+		// Print protocol-specific stats correctly.
+		"TCP:{ActiveConnectionOpenings:0",
+	}
+
+	for _, m := range matchers {
+		if !strings.Contains(got, m) {
+			t.Errorf("strings.Contains(got, %q) = false", m)
+		}
+	}
+	if t.Failed() {
+		t.Logf(`got = fmt.Sprintf("%%+v", Stats{}.FillIn()) = %q`, got)
+	}
+}
+
+func TestAddressWithPrefixSubnet(t *testing.T) {
+	tests := []struct {
+		addr       Address
+		prefixLen  int
+		subnetAddr Address
+		subnetMask AddressMask
+	}{
+		{"\xaa\x55\x33\x42", -1, "\x00\x00\x00\x00", "\x00\x00\x00\x00"},
+		{"\xaa\x55\x33\x42", 0, "\x00\x00\x00\x00", "\x00\x00\x00\x00"},
+		{"\xaa\x55\x33\x42", 1, "\x80\x00\x00\x00", "\x80\x00\x00\x00"},
+		{"\xaa\x55\x33\x42", 7, "\xaa\x00\x00\x00", "\xfe\x00\x00\x00"},
+		{"\xaa\x55\x33\x42", 8, "\xaa\x00\x00\x00", "\xff\x00\x00\x00"},
+		{"\xaa\x55\x33\x42", 24, "\xaa\x55\x33\x00", "\xff\xff\xff\x00"},
+		{"\xaa\x55\x33\x42", 31, "\xaa\x55\x33\x42", "\xff\xff\xff\xfe"},
+		{"\xaa\x55\x33\x42", 32, "\xaa\x55\x33\x42", "\xff\xff\xff\xff"},
+		{"\xaa\x55\x33\x42", 33, "\xaa\x55\x33\x42", "\xff\xff\xff\xff"},
+	}
+	for _, tt := range tests {
+		ap := AddressWithPrefix{Address: tt.addr, PrefixLen: tt.prefixLen}
+		gotSubnet := ap.Subnet()
+		wantSubnet, err := NewSubnet(tt.subnetAddr, tt.subnetMask)
+		if err != nil {
+			t.Errorf("NewSubnet(%q, %q) failed: %s", tt.subnetAddr, tt.subnetMask, err)
+			continue
+		}
+		if gotSubnet != wantSubnet {
+			t.Errorf("got subnet = %q, want = %q", gotSubnet, wantSubnet)
+		}
+	}
+}
diff --git a/pkg/tcpip/time.s b/pkg/tcpip/time.s
new file mode 100644
index 000000000..fb37360ac
--- /dev/null
+++ b/pkg/tcpip/time.s
@@ -0,0 +1,15 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Empty assembly file so empty func definitions work.
diff --git a/pkg/tcpip/time_unsafe.go b/pkg/tcpip/time_unsafe.go
new file mode 100644
index 000000000..7f172f978
--- /dev/null
+++ b/pkg/tcpip/time_unsafe.go
@@ -0,0 +1,47 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build go1.9
+// +build !go1.16
+
+// Check go:linkname function signatures when updating Go version.
+
+package tcpip
+
+import (
+	_ "time"   // Used with go:linkname.
+	_ "unsafe" // Required for go:linkname.
+)
+
+// StdClock implements Clock with the time package.
+//
+// +stateify savable
+type StdClock struct{}
+
+var _ Clock = (*StdClock)(nil)
+
+//go:linkname now time.now
+func now() (sec int64, nsec int32, mono int64)
+
+// NowNanoseconds implements Clock.NowNanoseconds.
+func (*StdClock) NowNanoseconds() int64 {
+	sec, nsec, _ := now()
+	return sec*1e9 + int64(nsec)
+}
+
+// NowMonotonic implements Clock.NowMonotonic.
+func (*StdClock) NowMonotonic() int64 {
+	_, _, mono := now()
+	return mono
+}
diff --git a/pkg/tcpip/timer.go b/pkg/tcpip/timer.go
new file mode 100644
index 000000000..59f3b391f
--- /dev/null
+++ b/pkg/tcpip/timer.go
@@ -0,0 +1,184 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip
+
+import (
+	"sync"
+	"time"
+)
+
+// cancellableTimerInstance is a specific instance of CancellableTimer.
+//
+// Different instances are created each time CancellableTimer is Reset so each
+// timer has its own earlyReturn signal. This is to address a bug when a
+// CancellableTimer is stopped and reset in quick succession resulting in a
+// timer instance's earlyReturn signal being affected or seen by another timer
+// instance.
+//
+// Consider the following scenario where timer instances share a common
+// earlyReturn signal (T1 creates, stops and resets a CancellableTimer under a
+// lock L; T2, T3, T4 and T5 are goroutines that handle the first (A), second
+// (B), third (C), and fourth (D) instance of the timer firing, respectively):
+//   T1: Obtain L
+//   T1: Create a new CancellableTimer w/ lock L (create instance A)
+//   T2: instance A fires, blocked trying to obtain L.
+//   T1: Attempt to stop instance A (set earlyReturn = true)
+//   T1: Reset timer (create instance B)
+//   T3: instance B fires, blocked trying to obtain L.
+//   T1: Attempt to stop instance B (set earlyReturn = true)
+//   T1: Reset timer (create instance C)
+//   T4: instance C fires, blocked trying to obtain L.
+//   T1: Attempt to stop instance C (set earlyReturn = true)
+//   T1: Reset timer (create instance D)
+//   T5: instance D fires, blocked trying to obtain L.
+//   T1: Release L
+//
+// Now that T1 has released L, any of the 4 timer instances can take L and check
+// earlyReturn. If the timers simply check earlyReturn and then do nothing
+// further, then instance D will also early return even though it was never
+// requested to stop. If the timers reset earlyReturn before early returning,
+// then all but one of the timers will do work when only one was expected to.
+// If CancellableTimer resets earlyReturn when resetting, then all the timers
+// will fire (again, when only one was expected to).
+//
+// To address the above concerns the simplest solution was to give each timer
+// its own earlyReturn signal.
+type cancellableTimerInstance struct {
+	timer *time.Timer // nil in the zero-value instance; stop() is then a no-op.
+
+	// Used to inform the timer to early return when it gets stopped while the
+	// lock the timer tries to obtain when fired is held (T1 is a goroutine that
+	// tries to cancel the timer and T2 is the goroutine that handles the timer
+	// firing):
+	//   T1: Obtain the lock, then call StopLocked()
+	//   T2: timer fires, and gets blocked on obtaining the lock
+	//   T1: Releases lock
+	//   T2: Obtains lock, does unintended work
+	//
+	// To resolve this, T1 will check to see if the timer already fired, and
+	// inform the timer using earlyReturn to return early so that once T2 obtains
+	// the lock, it will see that it is set to true and do nothing further.
+	earlyReturn *bool
+}
+
+// stop cancels the pending firing of t and marks t for early return, so a
+// firing that is already blocked on the lock does nothing once it gets it.
+func (t *cancellableTimerInstance) stop() {
+	if t.timer == nil {
+		return
+	}
+	t.timer.Stop()
+	*t.earlyReturn = true
+}
+
+// CancellableTimer is a timer that does some work and can be safely cancelled
+// when it fires at the same time some "related work" is being done.
+//
+// The term "related work" is defined as some work that needs to be done while
+// holding some lock that the timer must also hold while doing some work.
+//
+// Note, it is not safe to copy a CancellableTimer as its timer instance creates
+// a closure over the address of the CancellableTimer.
+type CancellableTimer struct {
+	// The active instance of the timer; the zero value when nothing is scheduled.
+	instance cancellableTimerInstance
+
+	// locker is the lock taken by the timer immediately after it fires and must
+	// be held when attempting to stop the timer.
+	//
+	// Must never change after being assigned.
+	locker sync.Locker
+
+	// fn is the function that will be called when a timer fires and has not been
+	// signaled to early return. It is called while locker is held by the timer.
+	//
+	// fn MUST NOT attempt to lock locker.
+	//
+	// Must never change after being assigned.
+	fn func()
+}
+
+// StopLocked prevents the timer from firing if it has not fired already.
+//
+// If the timer is blocked on obtaining the t.locker lock when StopLocked is
+// called, it will early return instead of calling t.fn.
+//
+// Note, t will be modified: its active instance is replaced by the zero value.
+//
+// t.locker MUST be locked.
+func (t *CancellableTimer) StopLocked() {
+	t.instance.stop()
+
+	// Nothing to do with the stopped instance anymore.
+	t.instance = cancellableTimerInstance{}
+}
+
+// Reset changes the timer to expire after duration d.
+//
+// Note, t will be modified: a fresh timer instance replaces the current one.
+//
+// Reset should only be called on stopped or expired timers. To be safe, callers
+// should always call StopLocked before calling Reset.
+func (t *CancellableTimer) Reset(d time.Duration) {
+	// Create a new instance.
+	earlyReturn := false // Per-instance flag, shared only with the closure below.
+
+	// Capture the locker so that updating the timer does not cause a data race
+	// when a timer fires and tries to obtain the lock (read the timer's locker).
+	locker := t.locker
+	t.instance = cancellableTimerInstance{
+		timer: time.AfterFunc(d, func() {
+			locker.Lock()
+			defer locker.Unlock()
+
+			if earlyReturn {
+				// If we reach this point, it means that the timer fired while another
+				// goroutine called StopLocked while it had the lock. Simply return
+				// here and do nothing further.
+				earlyReturn = false
+				return
+			}
+
+			t.fn() // Runs with locker held, which is why fn must not lock it.
+		}),
+		earlyReturn: &earlyReturn,
+	}
+}
+
+// Lock is a no-op that exists only for the copylocks checker from go vet.
+//
+// See CancellableTimer for details about why it shouldn't be copied.
+//
+// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
+// details about the copylocks checker.
+func (*CancellableTimer) Lock() {}
+
+// Unlock is a no-op that exists only for the copylocks checker from go vet.
+//
+// See CancellableTimer for details about why it shouldn't be copied.
+//
+// See https://github.com/golang/go/issues/8005#issuecomment-190753527 for more
+// details about the copylocks checker.
+func (*CancellableTimer) Unlock() {}
+
+// NewCancellableTimer returns an unscheduled CancellableTimer with the given
+// locker and fn.
+//
+// fn MUST NOT attempt to lock locker.
+//
+// Callers must call Reset to schedule the timer; no timer is created here.
+func NewCancellableTimer(locker sync.Locker, fn func()) *CancellableTimer {
+	return &CancellableTimer{locker: locker, fn: fn}
+}
diff --git a/pkg/tcpip/timer_test.go b/pkg/tcpip/timer_test.go
new file mode 100644
index 000000000..b4940e397
--- /dev/null
+++ b/pkg/tcpip/timer_test.go
@@ -0,0 +1,261 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpip_test
+
+import (
+	"sync"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+	shortDuration  = 1 * time.Nanosecond    // Timer delay that expires almost immediately.
+	middleDuration = 100 * time.Millisecond // Used both as a timer delay and as a wait bound.
+	longDuration   = 1 * time.Second        // NOTE(review): appears unused in this file.
+)
+
+func TestCancellableTimerReassignment(t *testing.T) {
+	var timer tcpip.CancellableTimer
+	var wg sync.WaitGroup
+	var lock sync.Mutex
+
+	for i := 0; i < 2; i++ {
+		wg.Add(1)
+
+		go func() {
+			lock.Lock()
+			// Assigning a new timer value updates the timer's locker and function.
+			// This test makes sure there is no data race when reassigning a timer
+			// that has an active timer (even if it has been stopped as a stopped
+			// timer may be blocked on a lock before it can check if it has been
+			// stopped while another goroutine holds the same lock).
+			timer = *tcpip.NewCancellableTimer(&lock, func() {
+				wg.Done()
+			})
+			timer.Reset(shortDuration)
+			lock.Unlock()
+		}()
+	}
+	wg.Wait() // Blocks until wg.Done has been called once per goroutine above.
+}
+
+func TestCancellableTimerFire(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{}) // Unbuffered: each send pairs with one receive below.
+	var lock sync.Mutex
+
+	timer := tcpip.NewCancellableTimer(&lock, func() {
+		ch <- struct{}{}
+	})
+	timer.Reset(shortDuration)
+
+	// Wait up to middleDuration for the timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once; allow middleDuration for a stray one.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerResetFromLongDuration(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(middleDuration) // Presumably still pending when stopped just below.
+
+	lock.Lock()
+	timer.StopLocked()
+	lock.Unlock()
+
+	timer.Reset(shortDuration)
+
+	// Wait up to middleDuration for the rescheduled timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once; the stopped instance must stay silent.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerResetFromShortDuration(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	timer.StopLocked() // Stopped while the lock is still held by this goroutine.
+	lock.Unlock()
+
+	// Wait for timer to fire if it wasn't correctly stopped.
+	select {
+	case <-ch:
+		t.Fatal("timer fired after being stopped")
+	case <-time.After(middleDuration):
+	}
+
+	timer.Reset(shortDuration)
+
+	// Wait up to middleDuration for the rescheduled timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerImmediatelyStop(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	for i := 0; i < 1000; i++ {
+		lock.Lock()
+		timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
+		timer.Reset(shortDuration)
+		timer.StopLocked() // Every one of the 1000 timers is stopped under the lock.
+		lock.Unlock()
+	}
+
+	// Wait for any timer to fire if it wasn't correctly stopped.
+	select {
+	case <-ch:
+		t.Fatal("timer fired after being stopped")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestCancellableTimerStoppedResetWithoutLock(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	timer.StopLocked()
+	lock.Unlock()
+
+	for i := 0; i < 10; i++ {
+		timer.Reset(middleDuration) // Reset is called here without holding lock.
+
+		lock.Lock()
+		// Sleep until the timer fires and gets blocked trying to take the lock.
+		time.Sleep(middleDuration * 2)
+		timer.StopLocked()
+		lock.Unlock()
+	}
+
+	// Wait for double the duration so timers that weren't correctly stopped can
+	// fire.
+	select {
+	case <-ch:
+		t.Fatal("timer fired after being stopped")
+	case <-time.After(middleDuration * 2):
+	}
+}
+
+func TestManyCancellableTimerResetAfterBlockedOnLock(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	for i := 0; i < 10; i++ {
+		// Sleep until the timer fires and gets blocked trying to take the lock.
+		time.Sleep(middleDuration)
+		timer.StopLocked()
+		timer.Reset(shortDuration)
+	}
+	lock.Unlock()
+
+	// Wait up to middleDuration for the last (never stopped) timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
+
+func TestManyCancellableTimerResetUnderLock(t *testing.T) {
+	t.Parallel()
+
+	ch := make(chan struct{})
+	var lock sync.Mutex
+
+	lock.Lock()
+	timer := tcpip.NewCancellableTimer(&lock, func() { ch <- struct{}{} })
+	timer.Reset(shortDuration)
+	for i := 0; i < 10; i++ {
+		timer.StopLocked() // Stop and reschedule back-to-back, with no sleep between.
+		timer.Reset(shortDuration)
+	}
+	lock.Unlock()
+
+	// Wait up to middleDuration for the last (never stopped) timer to fire.
+	select {
+	case <-ch:
+	case <-time.After(middleDuration):
+		t.Fatal("timed out waiting for timer to fire")
+	}
+
+	// The timer should have fired only once.
+	select {
+	case <-ch:
+		t.Fatal("no other timers should have fired")
+	case <-time.After(middleDuration):
+	}
+}
diff --git a/pkg/tcpip/transport/icmp/BUILD b/pkg/tcpip/transport/icmp/BUILD
new file mode 100644
index 000000000..7e5c79776
--- /dev/null
+++ b/pkg/tcpip/transport/icmp/BUILD
@@ -0,0 +1,40 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "icmp_packet_list",
+    out = "icmp_packet_list.go",
+    package = "icmp",
+    prefix = "icmpPacket",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*icmpPacket",
+        "Linker": "*icmpPacket",
+    },
+)
+
+go_library(
+    name = "icmp",
+    srcs = [
+        "endpoint.go",
+        "endpoint_state.go",
+        "icmp_packet_list.go",
+        "protocol.go",
+    ],
+    imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/ports",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/raw",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/transport/icmp/endpoint.go b/pkg/tcpip/transport/icmp/endpoint.go
new file mode 100644
index 000000000..62d1acad4
--- /dev/null
+++ b/pkg/tcpip/transport/icmp/endpoint.go
@@ -0,0 +1,831 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package icmp
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// +stateify savable
+type icmpPacket struct {
+	icmpPacketEntry
+	senderAddress tcpip.FullAddress // Copied into Read's addr out-parameter.
+	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"` // Returned by Read via ToView.
+	timestamp     int64 // Returned by Read as ControlMessages.Timestamp.
+}
+
+type endpointState int // Lifecycle state stored in endpoint.state.
+
+const (
+	stateInitial endpointState = iota // Set by newEndpoint.
+	stateBound // Set by bindLocked.
+	stateConnected // Set by Connect.
+	stateClosed // Set by Close.
+)
+
+// endpoint represents an ICMP endpoint. This struct serves as the interface
+// between users of the endpoint and the protocol implementation; it is legal to
+// have concurrent goroutines make calls into the endpoint, they are properly
+// synchronized.
+//
+// +stateify savable
+type endpoint struct {
+	stack.TransportEndpointInfo
+
+	// The following fields are initialized at creation time and are
+	// immutable.
+	stack       *stack.Stack `state:"manual"`
+	waiterQueue *waiter.Queue
+	uniqueID    uint64
+
+	// The following fields are used to manage the receive queue, and are
+	// protected by rcvMu.
+	rcvMu         sync.Mutex `state:"nosave"`
+	rcvReady      bool
+	rcvList       icmpPacketList
+	rcvBufSizeMax int `state:".(int)"`
+	rcvBufSize    int
+	rcvClosed     bool
+
+	// The following fields are protected by the mu mutex.
+	mu         sync.RWMutex `state:"nosave"`
+	sndBufSize int
+	// shutdownFlags represent the current shutdown state of the endpoint.
+	shutdownFlags tcpip.ShutdownFlags
+	state         endpointState
+	route         stack.Route `state:"manual"`
+	ttl           uint8 // Zero makes send4/send6 fall back to the route's default TTL.
+	stats         tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet. It is set by SetOwner.
+	owner tcpip.PacketOwner
+}
+
+func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return &endpoint{ // Always succeeds; the error result is nil.
+		stack: s,
+		TransportEndpointInfo: stack.TransportEndpointInfo{
+			NetProto:   netProto,
+			TransProto: transProto,
+		},
+		waiterQueue:   waiterQueue,
+		rcvBufSizeMax: 32 * 1024, // Initial receive buffer limit: 32 KiB.
+		sndBufSize:    32 * 1024, // Initial send buffer size: 32 KiB.
+		state:         stateInitial,
+		uniqueID:      s.UniqueID(),
+	}, nil
+}
+
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID // Assigned once in newEndpoint from the stack.
+}
+
+// Abort implements stack.TransportEndpoint.Abort by closing the endpoint.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
+// Close puts the endpoint in a closed state and frees all resources
+// associated with it.
+func (e *endpoint) Close() {
+	e.mu.Lock()
+	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite
+	switch e.state {
+	case stateBound, stateConnected: // Only these states were registered with the stack.
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, []tcpip.NetworkProtocolNumber{e.NetProto}, e.TransProto, e.ID, e, ports.Flags{}, 0 /* bindToDevice */)
+	}
+
+	// Close the receive list and drain it.
+	e.rcvMu.Lock()
+	e.rcvClosed = true
+	e.rcvBufSize = 0
+	for !e.rcvList.Empty() {
+		p := e.rcvList.Front()
+		e.rcvList.Remove(p)
+	}
+	e.rcvMu.Unlock()
+
+	e.route.Release()
+
+	// Update the state.
+	e.state = stateClosed
+
+	e.mu.Unlock()
+
+	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut) // Waiters are woken after mu is released.
+}
+
+// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. It is a no-op.
+func (e *endpoint) ModerateRecvBuf(copied int) {}
+
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner // Later handed to send4 by write. NOTE(review): stored without taking e.mu.
+}
+
+// Read reads data from the endpoint. This method does not block if
+// there is no data pending.
+func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	e.rcvMu.Lock()
+
+	if e.rcvList.Empty() {
+		err := tcpip.ErrWouldBlock // Nothing queued; reported unless receive is closed.
+		if e.rcvClosed {
+			e.stats.ReadErrors.ReadClosed.Increment()
+			err = tcpip.ErrClosedForReceive
+		}
+		e.rcvMu.Unlock()
+		return buffer.View{}, tcpip.ControlMessages{}, err
+	}
+
+	p := e.rcvList.Front() // Dequeue the oldest packet and release its buffer share.
+	e.rcvList.Remove(p)
+	e.rcvBufSize -= p.data.Size()
+
+	e.rcvMu.Unlock()
+
+	if addr != nil { // The sender address is only reported when the caller asks.
+		*addr = p.senderAddress
+	}
+
+	return p.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: p.timestamp}, nil
+}
+
+// prepareForWrite prepares the endpoint for sending data. In particular, it
+// binds it if it's still in the initial state. To do so, it must first
+// reacquire the mutex in exclusive mode.
+//
+// Returns true for retry if preparation should be retried.
+func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) {
+	switch e.state {
+	case stateInitial: // Falls out of the switch to the bind path below.
+	case stateConnected:
+		return false, nil
+
+	case stateBound:
+		if to == nil { // Bound but not connected: a destination must be supplied.
+			return false, tcpip.ErrDestinationRequired
+		}
+		return false, nil
+	default:
+		return false, tcpip.ErrInvalidEndpointState
+	}
+
+	e.mu.RUnlock() // The caller (write) holds the read lock; drop it to take the write lock.
+	defer e.mu.RLock() // Deferred first, so it runs last: restores the caller's read lock.
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// The state changed when we released the shared locked and re-acquired
+	// it in exclusive mode. Try again.
+	if e.state != stateInitial {
+		return true, nil
+	}
+
+	// The state is still 'initial', so try to bind the endpoint.
+	if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
+		return false, err
+	}
+
+	return true, nil // Bound now; the caller re-runs the state check.
+}
+
+// Write writes data to the endpoint's peer. This method does not block
+// if the data cannot be written.
+func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	n, ch, err := e.write(p, opts) // Do the send, then map err onto one stats counter.
+	switch err {
+	case nil:
+		e.stats.PacketsSent.Increment()
+	case tcpip.ErrMessageTooLong, tcpip.ErrInvalidOptionValue:
+		e.stats.WriteErrors.InvalidArgs.Increment()
+	case tcpip.ErrClosedForSend:
+		e.stats.WriteErrors.WriteClosed.Increment()
+	case tcpip.ErrInvalidEndpointState:
+		e.stats.WriteErrors.InvalidEndpointState.Increment()
+	case tcpip.ErrNoLinkAddress:
+		e.stats.SendErrors.NoLinkAddr.Increment()
+	case tcpip.ErrNoRoute, tcpip.ErrBroadcastDisabled, tcpip.ErrNetworkUnreachable:
+		// Errors indicating any problem with IP routing of the packet.
+		e.stats.SendErrors.NoRoute.Increment()
+	default:
+		// For all other errors when writing to the network layer.
+		e.stats.SendErrors.SendToNetworkFailed.Increment()
+	}
+	return n, ch, err // The results of write are passed through unchanged.
+}
+
+func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	// MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
+	if opts.More {
+		return 0, nil, tcpip.ErrInvalidOptionValue
+	}
+
+	to := opts.To // nil means "send to the connected peer".
+
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+
+	// If we've shutdown with SHUT_WR we are in an invalid state for sending.
+	if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
+		return 0, nil, tcpip.ErrClosedForSend
+	}
+
+	// Prepare for write; loop while prepareForWrite asks for a retry.
+	for {
+		retry, err := e.prepareForWrite(to)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		if !retry {
+			break
+		}
+	}
+
+	var route *stack.Route
+	if to == nil {
+		route = &e.route // Use the route cached by Connect.
+
+		if route.IsResolutionRequired() {
+			// Promote lock to exclusive if using a shared route,
+			// given that it may need to change in Route.Resolve()
+			// call below.
+			e.mu.RUnlock()
+			defer e.mu.RLock() // Runs after the Unlock below, before the outer RUnlock.
+
+			e.mu.Lock()
+			defer e.mu.Unlock()
+
+			// Recheck state after lock was re-acquired.
+			if e.state != stateConnected {
+				return 0, nil, tcpip.ErrInvalidEndpointState
+			}
+		}
+	} else {
+		// Reject destination address if it goes through a different
+		// NIC than the endpoint was bound to.
+		nicID := to.NIC
+		if e.BindNICID != 0 {
+			if nicID != 0 && nicID != e.BindNICID {
+				return 0, nil, tcpip.ErrNoRoute
+			}
+
+			nicID = e.BindNICID
+		}
+
+		dst, netProto, err := e.checkV4MappedLocked(*to)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		// Find a route to the explicit destination.
+		r, err := e.stack.FindRoute(nicID, e.BindAddr, dst.Addr, netProto, false /* multicastLoop */)
+		if err != nil {
+			return 0, nil, err
+		}
+		defer r.Release() // This route is temporary and released when write returns.
+
+		route = &r
+	}
+
+	if route.IsResolutionRequired() {
+		if ch, err := route.Resolve(nil); err != nil {
+			if err == tcpip.ErrWouldBlock { // Resolution pending: hand ch back with ErrNoLinkAddress.
+				return 0, ch, tcpip.ErrNoLinkAddress
+			}
+			return 0, nil, err
+		}
+	}
+
+	v, err := p.FullPayload()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	switch e.NetProto { // Any other protocol number sends nothing and err stays nil.
+	case header.IPv4ProtocolNumber:
+		err = send4(route, e.ID.LocalPort, v, e.ttl, e.owner)
+
+	case header.IPv6ProtocolNumber:
+		err = send6(route, e.ID.LocalPort, v, e.ttl)
+	}
+
+	if err != nil {
+		return 0, nil, err
+	}
+
+	return int64(len(v)), nil, nil // Reports the full payload length as written.
+}
+
+// Peek is a no-op for this endpoint: it reports zero bytes and no error.
+func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+	return 0, tcpip.ControlMessages{}, nil
+}
+
+// SetSockOpt accepts any option, ignores it, and always returns nil.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	return nil
+}
+
+// SetSockOptBool is currently not supported: opt and v are ignored, nil returned.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	return nil
+}
+
+// SetSockOptInt sets an integer socket option. Only TTLOption is handled.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.TTLOption:
+		e.mu.Lock()
+		e.ttl = uint8(v) // Conversion keeps only the low 8 bits; no range check.
+		e.mu.Unlock()
+
+	}
+	return nil // Other options are silently accepted.
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+//
+// KeepaliveEnabledOption always reads as false; every other option yields
+// tcpip.ErrUnknownProtocolOption.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	if opt == tcpip.KeepaliveEnabledOption {
+		return false, nil
+	}
+	return false, tcpip.ErrUnknownProtocolOption
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt; unknown options yield -1.
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+	switch opt {
+	case tcpip.ReceiveQueueSizeOption:
+		v := 0 // Size of the packet at the head of the queue, or 0 when empty.
+		e.rcvMu.Lock()
+		if !e.rcvList.Empty() {
+			p := e.rcvList.Front()
+			v = p.data.Size()
+		}
+		e.rcvMu.Unlock()
+		return v, nil
+	case tcpip.SendBufferSizeOption:
+		e.mu.RLock() // sndBufSize is guarded by mu; a shared lock suffices to read.
+		v := e.sndBufSize
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		e.rcvMu.Lock()
+		v := e.rcvBufSizeMax
+		e.rcvMu.Unlock()
+		return v, nil
+
+	case tcpip.TTLOption:
+		e.mu.RLock() // ttl is written under mu in SetSockOptInt, so read it under mu too.
+		v := int(e.ttl)
+		e.mu.RUnlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+//
+// A tcpip.ErrorOption value is accepted (nil is returned); any other option
+// type yields tcpip.ErrUnknownProtocolOption.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	if _, isErrOpt := opt.(tcpip.ErrorOption); isErrOpt {
+		return nil
+	}
+	return tcpip.ErrUnknownProtocolOption
+}
+
+func send4(r *stack.Route, ident uint16, data buffer.View, ttl uint8, owner tcpip.PacketOwner) *tcpip.Error {
+	if len(data) < header.ICMPv4MinimumSize { // Payload must hold at least an ICMPv4 header.
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	hdr := buffer.NewPrependable(header.ICMPv4MinimumSize + int(r.MaxHeaderLength()))
+
+	icmpv4 := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize))
+	copy(icmpv4, data) // Fills only the header bytes; icmpv4 is ICMPv4MinimumSize long.
+	// Set the ident to the user-specified port. Sequence number should
+	// already be set by the user.
+	icmpv4.SetIdent(ident)
+	data = data[header.ICMPv4MinimumSize:] // What remains is the echo payload.
+
+	// Linux performs these basic checks.
+	if icmpv4.Type() != header.ICMPv4Echo || icmpv4.Code() != 0 {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	icmpv4.SetChecksum(0) // Zero the field before computing the checksum over header and payload.
+	icmpv4.SetChecksum(^header.Checksum(icmpv4, header.Checksum(data, 0)))
+
+	if ttl == 0 { // Zero means "unset": fall back to the route's default.
+		ttl = r.DefaultTTL()
+	}
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+		Header:          hdr,
+		Data:            data.ToVectorisedView(),
+		TransportHeader: buffer.View(icmpv4),
+		Owner:           owner,
+	})
+}
+
+func send6(r *stack.Route, ident uint16, data buffer.View, ttl uint8) *tcpip.Error {
+	if len(data) < header.ICMPv6EchoMinimumSize { // Payload must hold at least an echo header.
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	hdr := buffer.NewPrependable(header.ICMPv6MinimumSize + int(r.MaxHeaderLength()))
+
+	icmpv6 := header.ICMPv6(hdr.Prepend(header.ICMPv6MinimumSize))
+	copy(icmpv6, data) // Fills only the header bytes; icmpv6 is ICMPv6MinimumSize long.
+	// Set the ident. Sequence number is provided by the user.
+	icmpv6.SetIdent(ident)
+	data = data[header.ICMPv6MinimumSize:] // NOTE(review): relies on ICMPv6MinimumSize <= ICMPv6EchoMinimumSize.
+
+	if icmpv6.Type() != header.ICMPv6EchoRequest || icmpv6.Code() != 0 {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	dataVV := data.ToVectorisedView()
+	icmpv6.SetChecksum(header.ICMPv6Checksum(icmpv6, r.LocalAddress, r.RemoteAddress, dataVV))
+
+	if ttl == 0 { // Zero means "unset": fall back to the route's default.
+		ttl = r.DefaultTTL()
+	}
+	return r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: ttl, TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+		Header:          hdr,
+		Data:            dataVV,
+		TransportHeader: buffer.View(icmpv6),
+	})
+}
+
+// checkV4MappedLocked resolves addr to its canonical form and reports the
+// network protocol that applies to it.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	canonical, proto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, false /* v6only */)
+	if err == nil {
+		return canonical, proto, nil
+	}
+	return tcpip.FullAddress{}, 0, err
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect. It is not supported.
+func (*endpoint) Disconnect() *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Connect connects the endpoint to its peer. Specifying a NIC is optional.
+func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	nicID := addr.NIC
+	localPort := uint16(0) // Zero lets registerWithStack pick an ephemeral port.
+	switch e.state {
+	case stateInitial:
+	case stateBound, stateConnected:
+		localPort = e.ID.LocalPort // Keep the port already in use.
+		if e.BindNICID == 0 {
+			break
+		}
+
+		if nicID != 0 && nicID != e.BindNICID {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		nicID = e.BindNICID
+	default:
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
+	// Find a route to the desired destination.
+	r, err := e.stack.FindRoute(nicID, e.BindAddr, addr.Addr, netProto, false /* multicastLoop */)
+	if err != nil {
+		return err
+	}
+	defer r.Release() // A clone is stored in e.route below; this copy is released.
+
+	id := stack.TransportEndpointID{
+		LocalAddress:  r.LocalAddress,
+		LocalPort:     localPort,
+		RemoteAddress: r.RemoteAddress,
+	}
+
+	// Register for the single network protocol resolved above. NOTE(review):
+	// a previous comment here spoke of registering both protocols, but only
+	// netProto is placed in the list passed to registerWithStack.
+	netProtos := []tcpip.NetworkProtocolNumber{netProto}
+
+	id, err = e.registerWithStack(nicID, netProtos, id)
+	if err != nil {
+		return err
+	}
+
+	e.ID = id
+	e.route = r.Clone()
+	e.RegisterNICID = nicID
+
+	e.state = stateConnected
+
+	e.rcvMu.Lock()
+	e.rcvReady = true
+	e.rcvMu.Unlock()
+
+	return nil
+}
+
+// ConnectEndpoint is not supported; it returns ErrInvalidEndpointState.
+func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// Shutdown closes the read and/or write end of the endpoint connection
+// to its peer.
+func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+	e.shutdownFlags |= flags // Recorded even when ErrNotConnected is returned below.
+
+	if e.state != stateConnected {
+		return tcpip.ErrNotConnected
+	}
+
+	if flags&tcpip.ShutdownRead != 0 {
+		e.rcvMu.Lock()
+		wasClosed := e.rcvClosed
+		e.rcvClosed = true
+		e.rcvMu.Unlock()
+
+		if !wasClosed { // Wake readers only on the first transition to closed.
+			e.waiterQueue.Notify(waiter.EventIn)
+		}
+	}
+
+	return nil
+}
+
+// Listen is not supported by ICMP endpoints, it just fails.
+func (*endpoint) Listen(int) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Accept is not supported by ICMP endpoints, it just fails.
+func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+	return nil, nil, tcpip.ErrNotSupported
+}
+
+func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, *tcpip.Error) {
+	if id.LocalPort != 0 {
+		// The endpoint already has a local port, just attempt to
+		// register it.
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindToDevice */)
+		return id, err
+	}
+
+	// We need to find a port for the endpoint.
+	_, err := e.stack.PickEphemeralPort(func(p uint16) (bool, *tcpip.Error) {
+		id.LocalPort = p // Candidate port; kept in id if registration succeeds.
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, e.TransProto, id, e, ports.Flags{}, 0 /* bindToDevice */)
+		switch err {
+		case nil:
+			return true, nil
+		case tcpip.ErrPortInUse: // Not fatal: report "not usable" without an error.
+			return false, nil
+		default:
+			return false, err
+		}
+	})
+
+	return id, err // id carries the last port tried by the callback.
+}
+
+func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
+	// Don't allow binding once endpoint is not in the initial state
+	// anymore.
+	if e.state != stateInitial {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
+	// Only the network protocol resolved above is registered; netProtos is
+	// not expanded to cover both v4 and v6, even when binding to a wildcard
+	// (empty) address.
+	netProtos := []tcpip.NetworkProtocolNumber{netProto}
+
+	if len(addr.Addr) != 0 {
+		// A local address was specified, verify that it's valid.
+		if e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) == 0 {
+			return tcpip.ErrBadLocalAddress
+		}
+	}
+
+	id := stack.TransportEndpointID{
+		LocalPort:    addr.Port,
+		LocalAddress: addr.Addr,
+	}
+	id, err = e.registerWithStack(addr.NIC, netProtos, id)
+	if err != nil {
+		return err
+	}
+
+	e.ID = id
+	e.RegisterNICID = addr.NIC
+
+	// Mark endpoint as bound.
+	e.state = stateBound
+
+	e.rcvMu.Lock()
+	e.rcvReady = true
+	e.rcvMu.Unlock()
+
+	return nil
+}
+
+// Bind binds the endpoint to a specific local address and port, then records
+// the requested NIC and address in BindNICID/BindAddr. The NIC is optional.
+func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	err := e.bindLocked(addr)
+	if err != nil {
+		return err
+	}
+
+	e.BindNICID = addr.NIC
+	e.BindAddr = addr.Addr
+
+	return nil
+}
+
+// GetLocalAddress returns the registered NIC plus the local address and port held in e.ID.
+func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+
+	return tcpip.FullAddress{
+		NIC:  e.RegisterNICID,
+		Addr: e.ID.LocalAddress,
+		Port: e.ID.LocalPort,
+	}, nil
+}
+
+// GetRemoteAddress returns the peer address, or tcpip.ErrNotConnected if the endpoint is not connected.
+func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+
+	if e.state != stateConnected {
+		return tcpip.FullAddress{}, tcpip.ErrNotConnected
+	}
+
+	return tcpip.FullAddress{
+		NIC:  e.RegisterNICID,
+		Addr: e.ID.RemoteAddress,
+		Port: e.ID.RemotePort,
+	}, nil
+}
+
+// Readiness returns the subset of mask that is currently ready: EventOut is
+// always ready; EventIn is ready when packets are queued or reads are closed.
+func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// The endpoint is always writable.
+	result := waiter.EventOut & mask
+
+	// Only take rcvMu and inspect the receive queue if EventIn was requested.
+	if (mask & waiter.EventIn) != 0 {
+		e.rcvMu.Lock()
+		if !e.rcvList.Empty() || e.rcvClosed {
+			result |= waiter.EventIn
+		}
+		e.rcvMu.Unlock()
+	}
+
+	return result
+}
+
+// HandlePacket is called by the stack when new packets arrive to this transport
+// endpoint.
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
+	// Only accept echo replies; anything shorter or of another type is dropped.
+	switch e.NetProto {
+	case header.IPv4ProtocolNumber:
+		h := header.ICMPv4(pkt.TransportHeader)
+		if len(h) < header.ICMPv4MinimumSize || h.Type() != header.ICMPv4EchoReply {
+			e.stack.Stats().DroppedPackets.Increment()
+			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+			return
+		}
+	case header.IPv6ProtocolNumber:
+		h := header.ICMPv6(pkt.TransportHeader)
+		if len(h) < header.ICMPv6MinimumSize || h.Type() != header.ICMPv6EchoReply {
+			e.stack.Stats().DroppedPackets.Increment()
+			e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+			return
+		}
+	}
+
+	e.rcvMu.Lock()
+
+	// Drop the packet if the endpoint is not yet ready to receive or is closed.
+	if !e.rcvReady || e.rcvClosed {
+		e.rcvMu.Unlock()
+		e.stack.Stats().DroppedPackets.Increment()
+		e.stats.ReceiveErrors.ClosedReceiver.Increment()
+		return
+	}
+
+	if e.rcvBufSize >= e.rcvBufSizeMax {
+		e.rcvMu.Unlock()
+		e.stack.Stats().DroppedPackets.Increment()
+		e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
+		return
+	}
+
+	wasEmpty := e.rcvBufSize == 0
+
+	// Push new packet into receive list and increment the buffer size.
+	packet := &icmpPacket{
+		senderAddress: tcpip.FullAddress{
+			NIC:  r.NICID(),
+			Addr: id.RemoteAddress,
+		},
+	}
+
+	// ICMP socket's data includes ICMP header.
+	packet.data = pkt.TransportHeader.ToVectorisedView()
+	packet.data.Append(pkt.Data)
+
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += packet.data.Size()
+
+	packet.timestamp = e.stack.NowNanoseconds()
+
+	e.rcvMu.Unlock()
+	e.stats.PacketsReceived.Increment()
+	// Notify any waiters that there's data to be read now.
+	if wasEmpty {
+		e.waiterQueue.Notify(waiter.EventIn)
+	}
+}
+
+// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. It is a no-op.
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+}
+
+// State implements tcpip.Endpoint.State. The ICMP endpoint does not expose its
+// internal socket state, so this always returns 0.
+func (e *endpoint) State() uint32 {
+	return 0
+}
+
+// Info returns a pointer to a private snapshot of the endpoint's info.
+func (e *endpoint) Info() tcpip.EndpointInfo {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	// Copy by value so the caller never aliases the endpoint's own struct.
+	snapshot := e.TransportEndpointInfo
+	return &snapshot
+}
+
+// Stats returns a pointer to the endpoint's live stats (not a copy).
+func (e *endpoint) Stats() tcpip.EndpointStats {
+	return &e.stats
+}
+
+// Wait implements stack.TransportEndpoint.Wait. It is a no-op.
+func (*endpoint) Wait() {}
diff --git a/pkg/tcpip/transport/icmp/endpoint_state.go b/pkg/tcpip/transport/icmp/endpoint_state.go
new file mode 100644
index 000000000..9d263c0ec
--- /dev/null
+++ b/pkg/tcpip/transport/icmp/endpoint_state.go
@@ -0,0 +1,95 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package icmp
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// saveData saves the icmpPacket.data field by returning a clone of it.
+func (p *icmpPacket) saveData() buffer.VectorisedView {
+	// We cannot save p.data directly as p.data.views may alias to p.views,
+	// which is not allowed by state framework (in-struct pointer).
+	return p.data.Clone(nil)
+}
+
+// loadData loads the icmpPacket.data field by storing data as is, without cloning.
+func (p *icmpPacket) loadData(data buffer.VectorisedView) {
+	// NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization
+	// here because data.views is not guaranteed to be loaded by now. Plus,
+	// data.views will be allocated anyway so there really is little point
+	// of utilizing p.views for data.views.
+	p.data = data
+}
+
+// beforeSave is invoked by stateify. It returns with e.rcvMu still held.
+func (e *endpoint) beforeSave() {
+	// Stop incoming packets from being handled (and mutate endpoint state).
+	// The lock will be released after saveRcvBufSizeMax(), which would have
+	// saved e.rcvBufSizeMax and set it to 0 to continue blocking incoming
+	// packets.
+	e.rcvMu.Lock()
+}
+
+// saveRcvBufSizeMax is invoked by stateify. It returns the limit to save and unlocks e.rcvMu.
+func (e *endpoint) saveRcvBufSizeMax() int {
+	max := e.rcvBufSizeMax
+	// Make sure no new packets will be handled regardless of the lock.
+	e.rcvBufSizeMax = 0
+	// Release the lock acquired in beforeSave() so regular endpoint closing
+	// logic can proceed after save.
+	e.rcvMu.Unlock()
+	return max
+}
+
+// loadRcvBufSizeMax is invoked by stateify. It restores the saved receive buffer limit.
+func (e *endpoint) loadRcvBufSizeMax(max int) {
+	e.rcvBufSizeMax = max
+}
+
+// afterLoad is invoked by stateify. It registers e with stack.StackFromEnv as a restored endpoint.
+func (e *endpoint) afterLoad() {
+	stack.StackFromEnv.RegisterRestoredEndpoint(e)
+}
+
+// Resume implements tcpip.ResumableEndpoint.Resume. It re-registers a bound or connected endpoint with s and panics on failure.
+func (e *endpoint) Resume(s *stack.Stack) {
+	e.stack = s
+
+	if e.state != stateBound && e.state != stateConnected {
+		return
+	}
+
+	var err *tcpip.Error
+	if e.state == stateConnected {
+		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.BindAddr, e.ID.RemoteAddress, e.NetProto, false /* multicastLoop */)
+		if err != nil {
+			panic(err)
+		}
+
+		e.ID.LocalAddress = e.route.LocalAddress
+	} else if len(e.ID.LocalAddress) != 0 { // stateBound with an explicit local address
+		if e.stack.CheckLocalAddress(e.RegisterNICID, e.NetProto, e.ID.LocalAddress) == 0 {
+			panic(tcpip.ErrBadLocalAddress)
+		}
+	}
+
+	e.ID, err = e.registerWithStack(e.RegisterNICID, []tcpip.NetworkProtocolNumber{e.NetProto}, e.ID)
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/pkg/tcpip/transport/icmp/protocol.go b/pkg/tcpip/transport/icmp/protocol.go
new file mode 100644
index 000000000..74ef6541e
--- /dev/null
+++ b/pkg/tcpip/transport/icmp/protocol.go
@@ -0,0 +1,145 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package icmp contains the implementation of the ICMP and IPv6-ICMP transport
+// protocols for use in ping. To use it in the networking stack, this package
+// must be added to the project, and activated on the stack by passing
+// icmp.NewProtocol4() and/or icmp.NewProtocol6() as one of the transport
+// protocols when calling stack.New(). Then endpoints can be created by passing
+// icmp.ProtocolNumber4 or icmp.ProtocolNumber6 as the transport protocol number
+// when calling Stack.NewEndpoint().
+package icmp
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// ProtocolNumber4 is the ICMPv4 protocol number.
+	ProtocolNumber4 = header.ICMPv4ProtocolNumber
+
+	// ProtocolNumber6 is the ICMPv6 (IPv6-ICMP) protocol number.
+	ProtocolNumber6 = header.ICMPv6ProtocolNumber
+)
+
+// protocol implements stack.TransportProtocol for ICMPv4 or ICMPv6, as selected by number.
+type protocol struct {
+	number tcpip.TransportProtocolNumber
+}
+
+// Number returns the transport protocol number this protocol was created with.
+func (p *protocol) Number() tcpip.TransportProtocolNumber {
+	return p.number
+}
+
+func (p *protocol) netProto() tcpip.NetworkProtocolNumber {
+	switch p.number {
+	case ProtocolNumber4:
+		return header.IPv4ProtocolNumber
+	case ProtocolNumber6:
+		return header.IPv6ProtocolNumber
+	}
+	panic(fmt.Sprint("unknown protocol number: ", p.number)) // Not reached for protocols built by NewProtocol4/NewProtocol6.
+}
+
+// NewEndpoint implements stack.TransportProtocol.NewEndpoint. It returns
+// tcpip.ErrUnknownProtocol when netProto does not match this ICMP version.
+func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	if netProto != p.netProto() {
+		return nil, tcpip.ErrUnknownProtocol
+	}
+	return newEndpoint(stack, netProto, p.number, waiterQueue)
+}
+
+// NewRawEndpoint implements stack.TransportProtocol.NewRawEndpoint. It returns
+// tcpip.ErrUnknownProtocol when netProto does not match this ICMP version.
+func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	if netProto != p.netProto() {
+		return nil, tcpip.ErrUnknownProtocol
+	}
+	return raw.NewEndpoint(stack, netProto, p.number, waiterQueue)
+}
+
+// MinimumPacketSize returns the minimum valid ICMPv4 or ICMPv6 packet size; it panics for any other number.
+func (p *protocol) MinimumPacketSize() int {
+	switch p.number {
+	case ProtocolNumber4:
+		return header.ICMPv4MinimumSize
+	case ProtocolNumber6:
+		return header.ICMPv6MinimumSize
+	}
+	panic(fmt.Sprint("unknown protocol number: ", p.number))
+}
+
+// ParsePorts returns src 0, dst set to the ICMP ident read from v, and a nil err; v's length is not checked here.
+func (p *protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
+	switch p.number {
+	case ProtocolNumber4:
+		hdr := header.ICMPv4(v)
+		return 0, hdr.Ident(), nil
+	case ProtocolNumber6:
+		hdr := header.ICMPv6(v)
+		return 0, hdr.Ident(), nil
+	}
+	panic(fmt.Sprint("unknown protocol number: ", p.number))
+}
+
+// HandleUnknownDestinationPacket handles packets targeted at this protocol but
+// that don't match any existing endpoint. It does nothing and returns true.
+func (*protocol) HandleUnknownDestinationPacket(*stack.Route, stack.TransportEndpointID, *stack.PacketBuffer) bool {
+	return true
+}
+
+// SetOption implements stack.TransportProtocol.SetOption. No options are supported.
+func (*protocol) SetOption(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// Option implements stack.TransportProtocol.Option. No options are supported.
+func (*protocol) Option(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// Close implements stack.TransportProtocol.Close. It is a no-op.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait. It is a no-op.
+func (*protocol) Wait() {}
+
+// Parse implements stack.TransportProtocol.Parse. It currently always returns false.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	// TODO(gvisor.dev/issue/170): Implement parsing of ICMP.
+	//
+	// Right now, the Parse() method is tied to enabled protocols passed into
+	// stack.New. This works for UDP and TCP, but we handle ICMP traffic even
+	// when netstack users don't pass ICMP as a supported protocol.
+	return false
+}
+
+// NewProtocol4 returns a stack.TransportProtocol for ICMPv4 (ProtocolNumber4).
+func NewProtocol4() stack.TransportProtocol {
+	return &protocol{ProtocolNumber4}
+}
+
+// NewProtocol6 returns a stack.TransportProtocol for ICMPv6 (ProtocolNumber6).
+func NewProtocol6() stack.TransportProtocol {
+	return &protocol{ProtocolNumber6}
+}
diff --git a/pkg/tcpip/transport/packet/BUILD b/pkg/tcpip/transport/packet/BUILD
new file mode 100644
index 000000000..b989b1209
--- /dev/null
+++ b/pkg/tcpip/transport/packet/BUILD
@@ -0,0 +1,37 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "packet_list",
+    out = "packet_list.go",
+    package = "packet",
+    prefix = "packet",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*packet",
+        "Linker": "*packet",
+    },
+)
+
+go_library(
+    name = "packet",
+    srcs = [
+        "endpoint.go",
+        "endpoint_state.go",
+        "packet_list.go",
+    ],
+    imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/transport/packet/endpoint.go b/pkg/tcpip/transport/packet/endpoint.go
new file mode 100644
index 000000000..a8f8454dd
--- /dev/null
+++ b/pkg/tcpip/transport/packet/endpoint.go
@@ -0,0 +1,469 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package packet provides the implementation of packet sockets (see
+// packet(7)). Packet sockets allow applications to:
+//
+//   * manually write and inspect link, network, and transport headers
+//   * receive all traffic of a given network protocol, or all protocols
+//
+// Packet sockets are similar to raw sockets, but provide even more power to
+// users, letting them effectively talk directly to the network device.
+//
+// Packet sockets skip the input and output iptables chains.
+package packet
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// +stateify savable
+type packet struct {
+	packetEntry
+	// data holds the actual packet data, including any headers and
+	// payload.
+	data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
+	// timestampNS is the stack's NowNanoseconds() reading taken when the packet was queued.
+	timestampNS int64
+	// senderAddr holds the receiving NIC ID and the sender's link address (see HandlePacket).
+	senderAddr tcpip.FullAddress
+}
+
+// endpoint is the packet socket implementation of tcpip.Endpoint. It is legal
+// to have goroutines make concurrent calls into the endpoint.
+//
+// Lock order:
+//   endpoint.mu
+//     endpoint.rcvMu
+//
+// +stateify savable
+type endpoint struct {
+	stack.TransportEndpointInfo
+	// The following fields are initialized at creation time and are
+	// immutable afterwards (stack is assigned again by afterLoad on restore).
+	stack       *stack.Stack `state:"manual"`
+	netProto    tcpip.NetworkProtocolNumber
+	waiterQueue *waiter.Queue
+	cooked      bool
+
+	// The following fields are used to manage the receive queue and are
+	// protected by rcvMu.
+	rcvMu         sync.Mutex `state:"nosave"`
+	rcvList       packetList
+	rcvBufSizeMax int `state:".(int)"`
+	rcvBufSize    int
+	rcvClosed     bool
+
+	// The following fields are protected by mu.
+	mu            sync.RWMutex `state:"nosave"`
+	sndBufSize    int
+	sndBufSizeMax int
+	closed        bool
+	stats         tcpip.TransportEndpointStats `state:"nosave"`
+	bound         bool
+}
+
+// NewEndpoint returns a new packet endpoint, registered with s for netProto under NIC ID 0.
+func NewEndpoint(s *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	ep := &endpoint{
+		stack: s,
+		TransportEndpointInfo: stack.TransportEndpointInfo{
+			NetProto: netProto,
+		},
+		cooked:        cooked,
+		netProto:      netProto,
+		waiterQueue:   waiterQueue,
+		rcvBufSizeMax: 32 * 1024,
+		sndBufSize:    32 * 1024,
+	}
+
+	// Override with stack defaults when the stack provides these options.
+	var ss stack.SendBufferSizeOption
+	if err := s.Option(&ss); err == nil {
+		ep.sndBufSizeMax = ss.Default
+	}
+
+	var rs stack.ReceiveBufferSizeOption
+	if err := s.Option(&rs); err == nil {
+		ep.rcvBufSizeMax = rs.Default
+	}
+
+	if err := s.RegisterPacketEndpoint(0, netProto, ep); err != nil {
+		return nil, err
+	}
+	return ep, nil
+}
+
+// Abort implements stack.TransportEndpoint.Abort by closing the endpoint.
+func (ep *endpoint) Abort() {
+	ep.Close()
+}
+
+// Close implements tcpip.Endpoint.Close. Calling it on an already closed endpoint is a no-op.
+func (ep *endpoint) Close() {
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.closed {
+		return
+	}
+
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	ep.rcvMu.Lock()
+	defer ep.rcvMu.Unlock()
+
+	// Mark reads closed and discard every queued packet.
+	ep.rcvClosed = true
+	ep.rcvBufSize = 0
+	for !ep.rcvList.Empty() {
+		ep.rcvList.Remove(ep.rcvList.Front())
+	}
+
+	ep.closed = true
+	ep.bound = false
+	ep.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+}
+
+// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. It is a no-op.
+func (ep *endpoint) ModerateRecvBuf(copied int) {}
+
+// Read implements tcpip.Endpoint.Read. It dequeues one packet, filling *addr with its sender when addr is non-nil.
+func (ep *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	ep.rcvMu.Lock()
+
+	// If there's no data to read, return that read would block or that the
+	// endpoint is closed.
+	if ep.rcvList.Empty() {
+		err := tcpip.ErrWouldBlock
+		if ep.rcvClosed {
+			ep.stats.ReadErrors.ReadClosed.Increment()
+			err = tcpip.ErrClosedForReceive
+		}
+		ep.rcvMu.Unlock()
+		return buffer.View{}, tcpip.ControlMessages{}, err
+	}
+
+	packet := ep.rcvList.Front()
+	ep.rcvList.Remove(packet)
+	ep.rcvBufSize -= packet.data.Size()
+
+	ep.rcvMu.Unlock()
+
+	if addr != nil {
+		*addr = packet.senderAddr
+	}
+
+	return packet.data.ToView(), tcpip.ControlMessages{HasTimestamp: true, Timestamp: packet.timestampNS}, nil
+}
+
+func (ep *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	// TODO(b/129292371): Implement. Until then every write fails with ErrInvalidOptionValue.
+	return 0, nil, tcpip.ErrInvalidOptionValue
+}
+
+// Peek implements tcpip.Endpoint.Peek. It always reports 0 bytes and a nil error.
+func (ep *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+	return 0, tcpip.ControlMessages{}, nil
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect. Packet sockets cannot be
+// disconnected, and this function always returns tcpip.ErrNotSupported.
+func (*endpoint) Disconnect() *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Connect implements tcpip.Endpoint.Connect. Packet sockets cannot be
+// connected, and this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Shutdown implements tcpip.Endpoint.Shutdown. Packet sockets do not support
+// Shutdown, so this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Listen implements tcpip.Endpoint.Listen. Packet sockets do not support
+// Listen, so this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Listen(backlog int) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Accept implements tcpip.Endpoint.Accept. Packet sockets do not support
+// Accept, so this function always returns tcpip.ErrNotSupported.
+func (ep *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+	return nil, nil, tcpip.ErrNotSupported
+}
+
+// Bind implements tcpip.Endpoint.Bind. Only addr.NIC is used; binding a bound endpoint returns tcpip.ErrAlreadyBound.
+func (ep *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
+	// TODO(gvisor.dev/issue/173): Add Bind support.
+
+	// "By default, all packets of the specified protocol type are passed
+	// to a packet socket.  To get packets only from a specific interface
+	// use bind(2) specifying an address in a struct sockaddr_ll to bind
+	// the packet socket  to  an interface.  Fields used for binding are
+	// sll_family (should be AF_PACKET), sll_protocol, and sll_ifindex."
+	// - packet(7).
+
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	if ep.bound {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// Unregister endpoint with all the nics.
+	ep.stack.UnregisterPacketEndpoint(0, ep.netProto, ep)
+
+	// Register on the requested NIC only. NOTE(review): on failure the endpoint stays unregistered; confirm intended.
+	if err := ep.stack.RegisterPacketEndpoint(addr.NIC, ep.netProto, ep); err != nil {
+		return err
+	}
+
+	ep.bound = true
+
+	return nil
+}
+
+// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. It always returns tcpip.ErrNotSupported.
+func (ep *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{}, tcpip.ErrNotSupported
+}
+
+// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress. It always returns tcpip.ErrNotConnected.
+func (ep *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	// Even a connected socket doesn't return a remote address.
+	return tcpip.FullAddress{}, tcpip.ErrNotConnected
+}
+
+// Readiness implements tcpip.Endpoint.Readiness. EventIn is ready when packets are queued or reads are closed.
+func (ep *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// The endpoint is always writable.
+	result := waiter.EventOut & mask
+
+	// Only take rcvMu and inspect the receive queue if EventIn was requested.
+	if (mask & waiter.EventIn) != 0 {
+		ep.rcvMu.Lock()
+		if !ep.rcvList.Empty() || ep.rcvClosed {
+			result |= waiter.EventIn
+		}
+		ep.rcvMu.Unlock()
+	}
+
+	return result
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt. Packet sockets cannot be
+// used with SetSockOpt, and this function always returns
+// tcpip.ErrUnknownProtocolOption.
+func (ep *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. No boolean options are supported.
+func (ep *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. Buffer sizes are clamped to the stack's Min/Max before being stored.
+func (ep *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.SendBufferSizeOption:
+		// Make sure the send buffer size is within the min and max
+		// allowed.
+		var ss stack.SendBufferSizeOption
+		if err := ep.stack.Option(&ss); err != nil {
+			panic(fmt.Sprintf("s.Option(%#v) = %s", ss, err))
+		}
+		if v > ss.Max {
+			v = ss.Max
+		}
+		if v < ss.Min {
+			v = ss.Min
+		}
+		ep.mu.Lock()
+		ep.sndBufSizeMax = v
+		ep.mu.Unlock()
+		return nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		// Make sure the receive buffer size is within the min and max
+		// allowed.
+		var rs stack.ReceiveBufferSizeOption
+		if err := ep.stack.Option(&rs); err != nil {
+			panic(fmt.Sprintf("s.Option(%#v) = %s", rs, err))
+		}
+		if v > rs.Max {
+			v = rs.Max
+		}
+		if v < rs.Min {
+			v = rs.Min
+		}
+		ep.rcvMu.Lock()
+		ep.rcvBufSizeMax = v
+		ep.rcvMu.Unlock()
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt. It always returns tcpip.ErrNotSupported.
+func (ep *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. It always returns tcpip.ErrNotSupported.
+func (ep *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	return false, tcpip.ErrNotSupported
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. ReceiveQueueSizeOption reports the size of the packet at the head of the queue (0 if empty).
+func (ep *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+	switch opt {
+	case tcpip.ReceiveQueueSizeOption:
+		v := 0
+		ep.rcvMu.Lock()
+		if !ep.rcvList.Empty() {
+			p := ep.rcvList.Front()
+			v = p.data.Size()
+		}
+		ep.rcvMu.Unlock()
+		return v, nil
+
+	case tcpip.SendBufferSizeOption:
+		ep.mu.RLock()
+		v := ep.sndBufSizeMax
+		ep.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		ep.rcvMu.Lock()
+		v := ep.rcvBufSizeMax
+		ep.rcvMu.Unlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// HandlePacket implements stack.PacketEndpoint.HandlePacket.
+func (ep *endpoint) HandlePacket(nicID tcpip.NICID, localAddr tcpip.LinkAddress, netProto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+	ep.rcvMu.Lock()
+
+	// Drop the packet if the endpoint has been closed for receiving.
+	if ep.rcvClosed {
+		ep.rcvMu.Unlock()
+		ep.stack.Stats().DroppedPackets.Increment()
+		ep.stats.ReceiveErrors.ClosedReceiver.Increment()
+		return
+	}
+
+	if ep.rcvBufSize >= ep.rcvBufSizeMax {
+		ep.rcvMu.Unlock()
+		ep.stack.Stats().DroppedPackets.Increment()
+		ep.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
+		return
+	}
+
+	wasEmpty := ep.rcvBufSize == 0
+
+	// Push new packet into receive list and increment the buffer size.
+	var packet packet
+	// TODO(b/129292371): Return network protocol.
+	if len(pkt.LinkHeader) > 0 {
+		// Get info directly from the ethernet header.
+		hdr := header.Ethernet(pkt.LinkHeader)
+		packet.senderAddr = tcpip.FullAddress{
+			NIC:  nicID,
+			Addr: tcpip.Address(hdr.SourceAddress()),
+		}
+	} else {
+		// Guess the would-be ethernet header.
+		packet.senderAddr = tcpip.FullAddress{
+			NIC:  nicID,
+			Addr: tcpip.Address(localAddr),
+		}
+	}
+
+	if ep.cooked {
+		// Cooked packets can simply be queued.
+		packet.data = pkt.Data
+	} else {
+		// Raw packets need their ethernet headers prepended before
+		// queueing.
+		var linkHeader buffer.View
+		if len(pkt.LinkHeader) == 0 {
+			// We weren't provided with an actual ethernet header,
+			// so fake one.
+			ethFields := header.EthernetFields{
+				SrcAddr: tcpip.LinkAddress([]byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00}),
+				DstAddr: localAddr,
+				Type:    netProto,
+			}
+			fakeHeader := make(header.Ethernet, header.EthernetMinimumSize)
+			fakeHeader.Encode(&ethFields)
+			linkHeader = buffer.View(fakeHeader)
+		} else {
+			linkHeader = append(buffer.View(nil), pkt.LinkHeader...)
+		}
+		combinedVV := linkHeader.ToVectorisedView()
+		combinedVV.Append(pkt.Data)
+		packet.data = combinedVV
+	}
+	packet.timestampNS = ep.stack.NowNanoseconds()
+
+	ep.rcvList.PushBack(&packet)
+	ep.rcvBufSize += packet.data.Size()
+
+	ep.rcvMu.Unlock()
+	ep.stats.PacketsReceived.Increment()
+	// Notify waiters that there's data to be read.
+	if wasEmpty {
+		ep.waiterQueue.Notify(waiter.EventIn)
+	}
+}
+
+// State implements tcpip.Endpoint.State. Packet endpoints always report 0.
+func (ep *endpoint) State() uint32 {
+	return 0
+}
+
+// Info returns a pointer to a private snapshot of the endpoint's info.
+func (ep *endpoint) Info() tcpip.EndpointInfo {
+	ep.mu.RLock()
+	defer ep.mu.RUnlock()
+	// Copy by value so the caller never aliases the endpoint's own struct.
+	snapshot := ep.TransportEndpointInfo
+	return &snapshot
+}
+
+// Stats returns a pointer to the endpoint's live stats (not a copy).
+func (ep *endpoint) Stats() tcpip.EndpointStats {
+	return &ep.stats
+}
+
+func (ep *endpoint) SetOwner(owner tcpip.PacketOwner) {} // SetOwner is a no-op: the owner is ignored.
diff --git a/pkg/tcpip/transport/packet/endpoint_state.go b/pkg/tcpip/transport/packet/endpoint_state.go
new file mode 100644
index 000000000..9b88f17e4
--- /dev/null
+++ b/pkg/tcpip/transport/packet/endpoint_state.go
@@ -0,0 +1,72 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package packet
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// saveData saves the packet.data field by returning a clone of it.
+func (p *packet) saveData() buffer.VectorisedView {
+	// We cannot save p.data directly as p.data.views may alias to p.views, which
+	// the state framework forbids. NOTE(review): packet shows no views field; confirm.
+	return p.data.Clone(nil)
+}
+
+// loadData loads the packet.data field by storing data as is, without cloning.
+func (p *packet) loadData(data buffer.VectorisedView) {
+	// NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization
+	// here because data.views is not guaranteed to be loaded by now. Plus,
+	// data.views will be allocated anyway so there really is little point
+	// of utilizing p.views for data.views.
+	p.data = data
+}
+
+// beforeSave is invoked by stateify. It returns with ep.rcvMu still held.
+func (ep *endpoint) beforeSave() {
+	// Stop incoming packets from being handled (and mutate endpoint state).
+	// The lock will be released after saveRcvBufSizeMax(), which would have
+	// saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming
+	// packets.
+	ep.rcvMu.Lock()
+}
+
+// saveRcvBufSizeMax is invoked by stateify. It returns the limit to save and unlocks ep.rcvMu.
+func (ep *endpoint) saveRcvBufSizeMax() int {
+	max := ep.rcvBufSizeMax
+	// Make sure no new packets will be handled regardless of the lock.
+	ep.rcvBufSizeMax = 0
+	// Release the lock acquired in beforeSave() so regular endpoint closing
+	// logic can proceed after save.
+	ep.rcvMu.Unlock()
+	return max
+}
+
+// loadRcvBufSizeMax is invoked by stateify. It restores the saved receive buffer limit.
+func (ep *endpoint) loadRcvBufSizeMax(max int) {
+	ep.rcvBufSizeMax = max
+}
+
+// afterLoad is invoked by stateify. It re-attaches the stack and re-registers under NIC ID 0, panicking on failure.
+func (ep *endpoint) afterLoad() {
+	// StackFromEnv is a stack used specifically for save/restore.
+	ep.stack = stack.StackFromEnv
+
+	// TODO(gvisor.dev/issue/173): Once bind is supported, choose the right NIC.
+	if err := ep.stack.RegisterPacketEndpoint(0, ep.netProto, ep); err != nil {
+		panic(*err)
+	}
+}
diff --git a/pkg/tcpip/transport/raw/BUILD b/pkg/tcpip/transport/raw/BUILD
new file mode 100644
index 000000000..2eab09088
--- /dev/null
+++ b/pkg/tcpip/transport/raw/BUILD
@@ -0,0 +1,39 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(  # Generates raw_packet_list.go from the generic ilist template.
+    name = "raw_packet_list",
+    out = "raw_packet_list.go",
+    package = "raw",
+    prefix = "rawPacket",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*rawPacket",
+        "Linker": "*rawPacket",
+    },
+)
+
+go_library(  # The raw package; srcs include the generated raw_packet_list.go.
+    name = "raw",
+    srcs = [
+        "endpoint.go",
+        "endpoint_state.go",
+        "protocol.go",
+        "raw_packet_list.go",
+    ],
+    imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/packet",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/transport/raw/endpoint.go b/pkg/tcpip/transport/raw/endpoint.go
new file mode 100644
index 000000000..5b6e7d102
--- /dev/null
+++ b/pkg/tcpip/transport/raw/endpoint.go
@@ -0,0 +1,729 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package raw provides the implementation of raw sockets (see raw(7)). Raw
+// sockets allow applications to:
+//
+//   * manually write and inspect transport layer headers and payloads
+//   * receive all traffic of a given transport protocol (e.g. ICMP or UDP)
+//   * optionally write and inspect network layer headers of packets
+//
+// Raw sockets don't have any notion of ports, and incoming packets are
+// demultiplexed solely by protocol number. Thus, a raw UDP endpoint will
+// receive every UDP packet received by netstack. bind(2) and connect(2) can be
+// used to filter incoming packets by source and destination.
+package raw
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// +stateify savable
+type rawPacket struct {
+	rawPacketEntry
+	// data holds the packet data, including any headers and payload.
+	data buffer.VectorisedView `state:".(buffer.VectorisedView)"`
+	// timestampNS is the time at which the packet was received, taken from
+	// the stack's NowNanoseconds() in HandlePacket.
+	timestampNS int64
+	// senderAddr is the NIC and network address of the sender.
+	senderAddr tcpip.FullAddress
+}
+
+// endpoint is the raw socket implementation of tcpip.Endpoint. It is legal to
+// have goroutines make concurrent calls into the endpoint.
+//
+// Lock order:
+//   endpoint.mu
+//     endpoint.rcvMu
+//
+// +stateify savable
+type endpoint struct {
+	stack.TransportEndpointInfo
+	// The following fields are initialized at creation time. They are
+	// immutable, except that SetSockOptBool updates hdrIncluded under mu.
+	stack       *stack.Stack `state:"manual"`
+	waiterQueue *waiter.Queue
+	associated  bool
+	hdrIncluded bool
+
+	// The following fields are used to manage the receive queue and are
+	// protected by rcvMu.
+	rcvMu         sync.Mutex `state:"nosave"`
+	rcvList       rawPacketList
+	rcvBufSize    int
+	rcvBufSizeMax int `state:".(int)"`
+	rcvClosed     bool
+
+	// The following fields are protected by mu.
+	mu            sync.RWMutex `state:"nosave"`
+	sndBufSize    int
+	sndBufSizeMax int
+	closed        bool
+	connected     bool
+	bound         bool
+	// route is the route to a remote network endpoint. It is set via
+	// Connect(), and is valid only when connected is true.
+	route stack.Route                  `state:"manual"`
+	stats tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
+}
+
+// NewEndpoint returns an associated raw endpoint for the given protocols.
+func NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newEndpoint(stack, netProto, transProto, waiterQueue, true /* associated */)
+}
+
+// newEndpoint builds a raw endpoint, registering it with the stack only when associated.
+func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue, associated bool) (tcpip.Endpoint, *tcpip.Error) {
+	switch netProto {
+	case header.IPv4ProtocolNumber, header.IPv6ProtocolNumber:
+	default:
+		return nil, tcpip.ErrUnknownProtocol
+	}
+
+	ep := &endpoint{
+		stack: s,
+		TransportEndpointInfo: stack.TransportEndpointInfo{
+			NetProto:   netProto,
+			TransProto: transProto,
+		},
+		waiterQueue:   waiterQueue,
+		rcvBufSizeMax: 32 * 1024,
+		sndBufSizeMax: 32 * 1024,
+		associated:    associated,
+		hdrIncluded:   !associated,
+	}
+
+	// Prefer the stack-wide default buffer sizes when the stack has them.
+	var sndOpt stack.SendBufferSizeOption
+	if s.Option(&sndOpt) == nil {
+		ep.sndBufSizeMax = sndOpt.Default
+	}
+	var rcvOpt stack.ReceiveBufferSizeOption
+	if s.Option(&rcvOpt) == nil {
+		ep.rcvBufSizeMax = rcvOpt.Default
+	}
+
+	if associated {
+		if err := s.RegisterRawTransportEndpoint(ep.RegisterNICID, netProto, transProto, ep); err != nil {
+			return nil, err
+		}
+		return ep, nil
+	}
+
+	// Unassociated endpoints are write-only and users call Write() with IP
+	// headers included, so they get no receive buffer and are not registered.
+	ep.rcvBufSizeMax = 0
+	ep.waiterQueue = nil
+	return ep, nil
+}
+
+// Abort implements stack.TransportEndpoint.Abort. It just closes the endpoint.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
+// Close implements tcpip.Endpoint.Close. It is a no-op if unassociated.
+func (e *endpoint) Close() {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	if e.closed || !e.associated {
+		return
+	}
+
+	e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e)
+
+	e.rcvMu.Lock()
+	defer e.rcvMu.Unlock()
+
+	// Mark the receive side closed and drop any packets still queued.
+	e.rcvClosed = true
+	e.rcvBufSize = 0
+	for !e.rcvList.Empty() {
+		e.rcvList.Remove(e.rcvList.Front())
+	}
+
+	if e.connected {
+		e.route.Release()
+		e.connected = false
+	}
+
+	e.closed = true
+
+	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+}
+
+// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf. It is a no-op.
+func (e *endpoint) ModerateRecvBuf(copied int) {}
+
+// SetOwner records the owner that finishWrite attaches to packets it sends
+// with WritePacket.
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { e.owner = owner }
+
+// Read implements tcpip.Endpoint.Read. It dequeues the oldest queued packet.
+func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	e.rcvMu.Lock()
+	if e.rcvList.Empty() {
+		// Nothing is queued: report that the read would block, or that the
+		// endpoint is closed for receive.
+		closed := e.rcvClosed
+		if closed {
+			e.stats.ReadErrors.ReadClosed.Increment()
+		}
+		e.rcvMu.Unlock()
+		if closed {
+			return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
+		}
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
+	}
+
+	head := e.rcvList.Front()
+	e.rcvList.Remove(head)
+	e.rcvBufSize -= head.data.Size()
+	e.rcvMu.Unlock()
+
+	if addr != nil {
+		*addr = head.senderAddr
+	}
+	cm := tcpip.ControlMessages{HasTimestamp: true, Timestamp: head.timestampNS}
+	return head.data.ToView(), cm, nil
+}
+
+// Write implements tcpip.Endpoint.Write and updates e.stats with the outcome.
+func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	// We can create, but not write to, unassociated IPv6 endpoints.
+	if !e.associated && e.TransportEndpointInfo.NetProto == header.IPv6ProtocolNumber {
+		return 0, nil, tcpip.ErrInvalidOptionValue
+	}
+
+	n, ch, err := e.write(p, opts)
+	switch err {
+	case nil:
+		e.stats.PacketsSent.Increment()
+	case tcpip.ErrMessageTooLong, tcpip.ErrInvalidOptionValue:
+		e.stats.WriteErrors.InvalidArgs.Increment()
+	case tcpip.ErrClosedForSend:
+		e.stats.WriteErrors.WriteClosed.Increment()
+	case tcpip.ErrInvalidEndpointState:
+		e.stats.WriteErrors.InvalidEndpointState.Increment()
+	case tcpip.ErrNoLinkAddress:
+		e.stats.SendErrors.NoLinkAddr.Increment()
+	case tcpip.ErrNoRoute, tcpip.ErrBroadcastDisabled, tcpip.ErrNetworkUnreachable:
+		// Errors indicating any problem with IP routing of the packet.
+		e.stats.SendErrors.NoRoute.Increment()
+	default:
+		// For all other errors when writing to the network layer.
+		e.stats.SendErrors.SendToNetworkFailed.Increment()
+	}
+	return n, ch, err
+}
+
+func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	// MSG_MORE is unimplemented. This also means that MSG_EOR is a no-op.
+	if opts.More {
+		return 0, nil, tcpip.ErrInvalidOptionValue
+	}
+
+	e.mu.RLock()
+
+	if e.closed {
+		e.mu.RUnlock()
+		return 0, nil, tcpip.ErrInvalidEndpointState
+	}
+
+	payloadBytes, err := p.FullPayload()
+	if err != nil {
+		e.mu.RUnlock()
+		return 0, nil, err
+	}
+
+	// If the payload includes the IP header and that header carries a
+	// nonzero destination address, route using that address.
+	if e.hdrIncluded {
+		ip := header.IPv4(payloadBytes)
+		if !ip.IsValid(len(payloadBytes)) {
+			e.mu.RUnlock()
+			return 0, nil, tcpip.ErrInvalidOptionValue
+		}
+		dstAddr := ip.DestinationAddress()
+		// Update dstAddr with the address in the IP header, unless
+		// opts.To is set (e.g. if sendto specifies a specific
+		// address).
+		if dstAddr != tcpip.Address([]byte{0, 0, 0, 0}) && opts.To == nil {
+			opts.To = &tcpip.FullAddress{
+				NIC:  0,       // NIC is unset.
+				Addr: dstAddr, // The address from the payload.
+				Port: 0,       // There are no ports here.
+			}
+		}
+	}
+
+	// Did the caller provide a destination? If not, use the connected
+	// destination.
+	if opts.To == nil {
+		// If the user doesn't specify a destination, they should have
+		// connected to another address.
+		if !e.connected {
+			e.mu.RUnlock()
+			return 0, nil, tcpip.ErrDestinationRequired
+		}
+
+		if e.route.IsResolutionRequired() {
+			savedRoute := &e.route
+			// Promote lock to exclusive if using a shared route,
+			// given that it may need to change in finishWrite.
+			e.mu.RUnlock()
+			e.mu.Lock()
+
+			// Make sure that the route didn't change during the
+			// time we didn't hold the lock.
+			if !e.connected || savedRoute != &e.route {
+				e.mu.Unlock()
+				return 0, nil, tcpip.ErrInvalidEndpointState
+			}
+
+			n, ch, err := e.finishWrite(payloadBytes, savedRoute)
+			e.mu.Unlock()
+			return n, ch, err
+		}
+
+		n, ch, err := e.finishWrite(payloadBytes, &e.route)
+		e.mu.RUnlock()
+		return n, ch, err
+	}
+
+	// The caller provided a destination. Reject destination address if it
+	// goes through a different NIC than the endpoint was bound to.
+	nic := opts.To.NIC
+	if e.bound && nic != 0 && nic != e.BindNICID {
+		e.mu.RUnlock()
+		return 0, nil, tcpip.ErrNoRoute
+	}
+
+	// Find the route to the destination. If BindAddr is empty,
+	// FindRoute will choose an appropriate source address.
+	route, err := e.stack.FindRoute(nic, e.BindAddr, opts.To.Addr, e.NetProto, false)
+	if err != nil {
+		e.mu.RUnlock()
+		return 0, nil, err
+	}
+
+	n, ch, err := e.finishWrite(payloadBytes, &route)
+	route.Release()
+	e.mu.RUnlock()
+	return n, ch, err
+}
+
+// finishWrite writes the payload to a route. It resolves the route if
+// necessary. It's really just a helper to make defer unnecessary in write.
+func (e *endpoint) finishWrite(payloadBytes []byte, route *stack.Route) (int64, <-chan struct{}, *tcpip.Error) {
+	// We may need to resolve the route (match a link layer address to the
+	// network address). If that requires blocking (e.g. to use ARP),
+	// return a channel on which the caller can wait.
+	if route.IsResolutionRequired() {
+		if ch, err := route.Resolve(nil); err != nil {
+			if err == tcpip.ErrWouldBlock {
+				return 0, ch, tcpip.ErrNoLinkAddress
+			}
+			return 0, nil, err
+		}
+	}
+
+	if e.hdrIncluded {
+		if err := route.WriteHeaderIncludedPacket(&stack.PacketBuffer{
+			Data: buffer.View(payloadBytes).ToVectorisedView(),
+		}); err != nil {
+			return 0, nil, err
+		}
+	} else {
+		hdr := buffer.NewPrependable(len(payloadBytes) + int(route.MaxHeaderLength()))
+		if err := route.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: e.TransProto, TTL: route.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header: hdr,
+			Data:   buffer.View(payloadBytes).ToVectorisedView(),
+			Owner:  e.owner,
+		}); err != nil {
+			return 0, nil, err
+		}
+	}
+
+	return int64(len(payloadBytes)), nil, nil
+}
+
+// Peek implements tcpip.Endpoint.Peek. It always reports zero bytes.
+func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+	return 0, tcpip.ControlMessages{}, nil
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect. It is not supported.
+func (*endpoint) Disconnect() *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Connect implements tcpip.Endpoint.Connect. It saves a route to addr and,
+// for associated endpoints, re-registers the endpoint under the chosen NIC.
+func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	if e.closed {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	// When bound to a specific NIC, an unspecified addr.NIC falls back to the
+	// bound NIC and a specified one must match it. Otherwise use addr.NIC.
+	nic := addr.NIC
+	if e.bound && e.BindNICID != 0 {
+		if addr.NIC == 0 {
+			nic = e.BindNICID
+		} else if addr.NIC != e.BindNICID {
+			return tcpip.ErrInvalidEndpointState
+		}
+	}
+
+	// Find a route to the destination.
+	route, err := e.stack.FindRoute(nic, tcpip.Address(""), addr.Addr, e.NetProto, false)
+	if err != nil {
+		return err
+	}
+	defer route.Release()
+
+	if e.associated {
+		// Re-register under the NIC chosen above (not addr.NIC, which may
+		// be unset) so that it agrees with the recorded RegisterNICID.
+		if err := e.stack.RegisterRawTransportEndpoint(nic, e.NetProto, e.TransProto, e); err != nil {
+			return err
+		}
+		e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e)
+		e.RegisterNICID = nic
+	}
+
+	// Release the route of any previous connection before replacing it.
+	if e.connected {
+		e.route.Release()
+	}
+	e.route = route.Clone()
+	e.connected = true
+
+	return nil
+}
+
+// Shutdown implements tcpip.Endpoint.Shutdown. It's a noop for raw sockets,
+// apart from reporting ErrNotConnected when the endpoint is not connected.
+func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+	e.mu.Lock()
+	connected := e.connected
+	e.mu.Unlock()
+	if connected {
+		return nil
+	}
+	return tcpip.ErrNotConnected
+}
+
+// Listen implements tcpip.Endpoint.Listen. It is not supported.
+func (e *endpoint) Listen(backlog int) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Accept implements tcpip.Endpoint.Accept. It is not supported.
+func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+	return nil, nil, tcpip.ErrNotSupported
+}
+
+// Bind implements tcpip.Endpoint.Bind.
+func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// Reject addr.Addr unless CheckLocalAddress accepts it (even when empty).
+	if e.stack.CheckLocalAddress(addr.NIC, e.NetProto, addr.Addr) == 0 {
+		return tcpip.ErrBadLocalAddress
+	}
+
+	if e.associated {
+		// Re-register the endpoint with the appropriate NIC.
+		if err := e.stack.RegisterRawTransportEndpoint(addr.NIC, e.NetProto, e.TransProto, e); err != nil {
+			return err
+		}
+		e.stack.UnregisterRawTransportEndpoint(e.RegisterNICID, e.NetProto, e.TransProto, e)
+		e.RegisterNICID = addr.NIC
+		e.BindNICID = addr.NIC
+	}
+
+	e.BindAddr = addr.Addr
+	e.bound = true
+
+	return nil
+}
+
+// GetLocalAddress implements tcpip.Endpoint.GetLocalAddress. It is not supported.
+func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{}, tcpip.ErrNotSupported
+}
+
+// GetRemoteAddress implements tcpip.Endpoint.GetRemoteAddress.
+func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	// Even a connected socket reports ErrNotConnected instead of an address.
+	return tcpip.FullAddress{}, tcpip.ErrNotConnected
+}
+
+// Readiness implements tcpip.Endpoint.Readiness.
+func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// The endpoint is always writable, so EventOut is echoed when requested.
+	ready := mask & waiter.EventOut
+	if mask&waiter.EventIn == 0 {
+		return ready
+	}
+	// Readable when a packet is queued or the receive side is closed.
+	e.rcvMu.Lock()
+	readable := !e.rcvList.Empty() || e.rcvClosed
+	e.rcvMu.Unlock()
+	if readable {
+		ready |= waiter.EventIn
+	}
+	return ready
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt. No options are accepted.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	// IPHdrIncludedOption is the only boolean option that can be set here.
+	if opt != tcpip.IPHdrIncludedOption {
+		return tcpip.ErrUnknownProtocolOption
+	}
+	e.mu.Lock()
+	e.hdrIncluded = v
+	e.mu.Unlock()
+	return nil
+}
+
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.SendBufferSizeOption:
+		// Keep the send buffer size within the stack's allowed range.
+		var limits stack.SendBufferSizeOption
+		if err := e.stack.Option(&limits); err != nil {
+			panic(fmt.Sprintf("s.Option(%#v) = %s", limits, err))
+		}
+		size := clampBufSize(v, limits.Min, limits.Max)
+		e.mu.Lock()
+		e.sndBufSizeMax = size
+		e.mu.Unlock()
+		return nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		// Keep the receive buffer size within the stack's allowed range.
+		var limits stack.ReceiveBufferSizeOption
+		if err := e.stack.Option(&limits); err != nil {
+			panic(fmt.Sprintf("s.Option(%#v) = %s", limits, err))
+		}
+		size := clampBufSize(v, limits.Min, limits.Max)
+		e.rcvMu.Lock()
+		e.rcvBufSizeMax = size
+		e.rcvMu.Unlock()
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// clampBufSize caps v at hi and then raises it to at least lo, in that
+// order, so lo wins if the two bounds conflict.
+func clampBufSize(v, lo, hi int) int {
+	if v > hi {
+		v = hi
+	}
+	if v < lo {
+		v = lo
+	}
+	return v
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	// tcpip.ErrorOption is the only option recognized here, and it always
+	// succeeds.
+	if _, ok := opt.(tcpip.ErrorOption); ok {
+		return nil
+	}
+
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	if opt == tcpip.KeepaliveEnabledOption {
+		// Keepalive is always reported as disabled.
+		return false, nil
+	}
+	if opt != tcpip.IPHdrIncludedOption {
+		return false, tcpip.ErrUnknownProtocolOption
+	}
+
+	// hdrIncluded is guarded by mu.
+	e.mu.Lock()
+	hdrIncluded := e.hdrIncluded
+	e.mu.Unlock()
+	return hdrIncluded, nil
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+	switch opt {
+	case tcpip.ReceiveQueueSizeOption:
+		// Report the size of the packet at the head of the receive queue,
+		// or zero when nothing is queued.
+		e.rcvMu.Lock()
+		defer e.rcvMu.Unlock()
+		if e.rcvList.Empty() {
+			return 0, nil
+		}
+		return e.rcvList.Front().data.Size(), nil
+
+	case tcpip.SendBufferSizeOption:
+		// sndBufSizeMax is guarded by mu.
+		e.mu.Lock()
+		defer e.mu.Unlock()
+		return e.sndBufSizeMax, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		// rcvBufSizeMax is guarded by rcvMu.
+		e.rcvMu.Lock()
+		defer e.rcvMu.Unlock()
+		return e.rcvBufSizeMax, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// HandlePacket implements stack.RawTransportEndpoint.HandlePacket.
+func (e *endpoint) HandlePacket(route *stack.Route, pkt *stack.PacketBuffer) {
+	e.rcvMu.Lock()
+
+	// Drop the packet if the receive side is closed or if this is an
+	// unassociated endpoint (i.e. created w/ IPPROTO_RAW), which is send-only.
+	// See: https://man7.org/linux/man-pages/man7/raw.7.html
+	//
+	//    An IPPROTO_RAW socket is send only.  If you really want to receive
+	//    all IP packets, use a packet(7) socket with the ETH_P_IP protocol.
+	//    Note that packet sockets don't reassemble IP fragments, unlike raw
+	//    sockets.
+	if e.rcvClosed || !e.associated {
+		e.rcvMu.Unlock()
+		e.stack.Stats().DroppedPackets.Increment()
+		e.stats.ReceiveErrors.ClosedReceiver.Increment()
+		return
+	}
+
+	if e.rcvBufSize >= e.rcvBufSizeMax {
+		e.rcvMu.Unlock()
+		e.stack.Stats().DroppedPackets.Increment()
+		e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
+		return
+	}
+
+	if e.bound {
+		// If bound to a NIC, only accept data for that NIC.
+		if e.BindNICID != 0 && e.BindNICID != route.NICID() {
+			e.rcvMu.Unlock()
+			return
+		}
+		// If bound to an address, only accept packets whose route.RemoteAddress matches it.
+		if e.BindAddr != "" && e.BindAddr != route.RemoteAddress {
+			e.rcvMu.Unlock()
+			return
+		}
+	}
+
+	// If connected, only accept packets from the remote address we
+	// connected to.
+	if e.connected && e.route.RemoteAddress != route.RemoteAddress {
+		e.rcvMu.Unlock()
+		return
+	}
+
+	wasEmpty := e.rcvBufSize == 0
+
+	// Push new packet into receive list and increment the buffer size.
+	packet := &rawPacket{
+		senderAddr: tcpip.FullAddress{
+			NIC:  route.NICID(),
+			Addr: route.RemoteAddress,
+		},
+	}
+
+	// Raw IPv4 endpoints return the IP header, but IPv6 endpoints do not.
+	// We copy headers' underlying bytes because pkt.*Header may point to
+	// the middle of a slice, and another struct may point to the "outer"
+	// slice. Save/restore doesn't support overlapping slices and will fail.
+	var combinedVV buffer.VectorisedView
+	if e.TransportEndpointInfo.NetProto == header.IPv4ProtocolNumber {
+		headers := make(buffer.View, 0, len(pkt.NetworkHeader)+len(pkt.TransportHeader))
+		headers = append(headers, pkt.NetworkHeader...)
+		headers = append(headers, pkt.TransportHeader...)
+		combinedVV = headers.ToVectorisedView()
+	} else {
+		combinedVV = append(buffer.View(nil), pkt.TransportHeader...).ToVectorisedView()
+	}
+	combinedVV.Append(pkt.Data)
+	packet.data = combinedVV
+	packet.timestampNS = e.stack.NowNanoseconds()
+
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += packet.data.Size()
+	e.rcvMu.Unlock()
+	e.stats.PacketsReceived.Increment()
+	// Notify waiters only on the transition from an empty receive buffer.
+	if wasEmpty {
+		e.waiterQueue.Notify(waiter.EventIn)
+	}
+}
+
+// State implements socket.Socket.State. It always returns 0.
+func (e *endpoint) State() uint32 {
+	return 0
+}
+
+// Info returns a copy of the endpoint info.
+func (e *endpoint) Info() tcpip.EndpointInfo {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	// Copy while holding the read lock; the caller gets its own struct.
+	info := e.TransportEndpointInfo
+	return &info
+}
+
+// Stats returns a pointer to the endpoint's own stats, not a copy.
+func (e *endpoint) Stats() tcpip.EndpointStats {
+	return &e.stats
+}
+
+// Wait implements stack.TransportEndpoint.Wait. It is a no-op.
+func (*endpoint) Wait() {}
diff --git a/pkg/tcpip/transport/raw/endpoint_state.go b/pkg/tcpip/transport/raw/endpoint_state.go
new file mode 100644
index 000000000..33bfb56cd
--- /dev/null
+++ b/pkg/tcpip/transport/raw/endpoint_state.go
@@ -0,0 +1,94 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package raw
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// saveData returns the value that is saved for the rawPacket.data field.
+func (p *rawPacket) saveData() buffer.VectorisedView {
+	// We cannot save p.data directly as p.data.views may alias to p.views,
+	// which is not allowed by the state framework (in-struct pointer).
+	return p.data.Clone(nil)
+}
+
+// loadData restores the rawPacket.data field from its saved value.
+func (p *rawPacket) loadData(data buffer.VectorisedView) {
+	// NOTE: We cannot do the p.data = data.Clone(p.views[:]) optimization
+	// here because data.views is not guaranteed to be loaded by now. Plus,
+	// data.views will be allocated anyway so there really is little point
+	// in utilizing p.views for data.views.
+	p.data = data
+}
+
+// beforeSave is invoked by stateify.
+func (ep *endpoint) beforeSave() {
+	// Stop incoming packets from being handled (and from mutating endpoint
+	// state). The lock is released in saveRcvBufSizeMax(), after it has
+	// saved ep.rcvBufSizeMax and set it to 0 to continue blocking incoming
+	// packets.
+	ep.rcvMu.Lock()
+}
+
+// saveRcvBufSizeMax is invoked by stateify.
+func (ep *endpoint) saveRcvBufSizeMax() int {
+	// Release the lock acquired in beforeSave() on the way out so regular
+	// endpoint closing logic can proceed after save.
+	defer ep.rcvMu.Unlock()
+	// Zero the limit so no new packets are handled regardless of the lock.
+	saved := ep.rcvBufSizeMax
+	ep.rcvBufSizeMax = 0
+	return saved
+}
+
+// loadRcvBufSizeMax is invoked by stateify to restore the saved limit.
+func (ep *endpoint) loadRcvBufSizeMax(max int) {
+	ep.rcvBufSizeMax = max
+}
+
+// afterLoad is invoked by stateify. It hands the endpoint to StackFromEnv.
+func (ep *endpoint) afterLoad() {
+	stack.StackFromEnv.RegisterRestoredEndpoint(ep)
+}
+
+// Resume implements tcpip.ResumableEndpoint.Resume. It panics on any failure.
+func (ep *endpoint) Resume(s *stack.Stack) {
+	ep.stack = s
+
+	// If the endpoint is connected, find its route again on the new stack.
+	if ep.connected {
+		var err *tcpip.Error
+		ep.route, err = ep.stack.FindRoute(ep.RegisterNICID, ep.BindAddr, ep.route.RemoteAddress, ep.NetProto, false)
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	// If the endpoint is bound, re-check the bound address with the stack.
+	if ep.bound {
+		if ep.stack.CheckLocalAddress(ep.RegisterNICID, ep.NetProto, ep.BindAddr) == 0 {
+			panic(tcpip.ErrBadLocalAddress)
+		}
+	}
+
+	if ep.associated {
+		if err := ep.stack.RegisterRawTransportEndpoint(ep.RegisterNICID, ep.NetProto, ep.TransProto, ep); err != nil {
+			panic(err)
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/raw/protocol.go b/pkg/tcpip/transport/raw/protocol.go
new file mode 100644
index 000000000..f30aa2a4a
--- /dev/null
+++ b/pkg/tcpip/transport/raw/protocol.go
@@ -0,0 +1,35 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package raw
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/packet"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// EndpointFactory implements stack.RawFactory. It carries no state.
+type EndpointFactory struct{}
+
+// NewUnassociatedEndpoint implements stack.RawFactory.NewUnassociatedEndpoint via newEndpoint.
+func (EndpointFactory) NewUnassociatedEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newEndpoint(stack, netProto, transProto, waiterQueue, false /* associated */)
+}
+
+// NewPacketEndpoint implements stack.RawFactory.NewPacketEndpoint via packet.NewEndpoint.
+func (EndpointFactory) NewPacketEndpoint(stack *stack.Stack, cooked bool, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return packet.NewEndpoint(stack, cooked, netProto, waiterQueue)
+}
diff --git a/pkg/tcpip/transport/tcp/BUILD b/pkg/tcpip/transport/tcp/BUILD
new file mode 100644
index 000000000..18ff89ffc
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/BUILD
@@ -0,0 +1,126 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(  # Generates tcp_segment_list.go from the generic ilist template.
+    name = "tcp_segment_list",
+    out = "tcp_segment_list.go",
+    package = "tcp",
+    prefix = "segment",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*segment",
+        "Linker": "*segment",
+    },
+)
+
+go_template_instance(  # Generates tcp_endpoint_list.go from the generic ilist template.
+    name = "tcp_endpoint_list",
+    out = "tcp_endpoint_list.go",
+    package = "tcp",
+    prefix = "endpoint",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*endpoint",
+        "Linker": "*endpoint",
+    },
+)
+
+go_library(  # The tcp package; srcs include both generated list files.
+    name = "tcp",
+    srcs = [
+        "accept.go",
+        "connect.go",
+        "connect_unsafe.go",
+        "cubic.go",
+        "cubic_state.go",
+        "dispatcher.go",
+        "endpoint.go",
+        "endpoint_state.go",
+        "forwarder.go",
+        "protocol.go",
+        "rcv.go",
+        "rcv_state.go",
+        "reno.go",
+        "sack.go",
+        "sack_scoreboard.go",
+        "segment.go",
+        "segment_heap.go",
+        "segment_queue.go",
+        "segment_state.go",
+        "snd.go",
+        "snd_state.go",
+        "tcp_endpoint_list.go",
+        "tcp_segment_list.go",
+        "timer.go",
+    ],
+    imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/log",
+        "//pkg/rand",
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/hash/jenkins",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/ports",
+        "//pkg/tcpip/seqnum",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/raw",
+        "//pkg/waiter",
+        "@com_github_google_btree//:go_default_library",
+    ],
+)
+
+go_test(  # Tests that depend on :tcp as a dep rather than as their library.
+    name = "tcp_x_test",
+    size = "medium",
+    srcs = [
+        "dual_stack_test.go",
+        "sack_scoreboard_test.go",
+        "tcp_noracedetector_test.go",
+        "tcp_sack_test.go",
+        "tcp_test.go",
+        "tcp_timestamp_test.go",
+    ],
+    shard_count = 10,
+    deps = [
+        ":tcp",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/ports",
+        "//pkg/tcpip/seqnum",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp/testing/context",
+        "//pkg/test/testutil",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(  # rcv_test.go only; it does not list :tcp in deps or library.
+    name = "rcv_test",
+    size = "small",
+    srcs = ["rcv_test.go"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+    ],
+)
+
+go_test(  # timer_test.go, built with :tcp as its library.
+    name = "tcp_test",
+    size = "small",
+    srcs = ["timer_test.go"],
+    library = ":tcp",
+    deps = ["//pkg/sleep"],
+)
diff --git a/pkg/tcpip/transport/tcp/accept.go b/pkg/tcpip/transport/tcp/accept.go
new file mode 100644
index 000000000..6e00e5526
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/accept.go
@@ -0,0 +1,752 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"crypto/sha1"
+	"encoding/binary"
+	"fmt"
+	"hash"
+	"io"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// tsLen is the length, in bits, of the timestamp in the SYN cookie.
+	tsLen = 8
+
+	// tsMask is a mask selecting the low tsLen bits of a timestamp value.
+	tsMask = (1 << tsLen) - 1
+
+	// tsOffset is the offset, in bits, of the timestamp in the SYN cookie.
+	tsOffset = 24
+
+	// hashMask is a mask selecting the low tsOffset bits of a hash value.
+	hashMask = (1 << tsOffset) - 1
+
+	// maxTSDiff is the maximum allowed difference, in timeStamp units of
+	// 64 seconds, between a received cookie timestamp and the current
+	// timestamp. Cookies with a greater difference are treated as expired.
+	maxTSDiff = 2
+
+	// SynRcvdCountThreshold is the default global maximum number of
+	// connections that are allowed to be in SYN-RCVD state before TCP
+	// starts using SYN cookies to accept connections.
+	SynRcvdCountThreshold uint64 = 1000
+)
+
+// mssTable lists the MSS values that can be encoded in a SYN cookie; the
+// cookie carries a two-bit index into this slice.
+var mssTable = []uint16{536, 1300, 1440, 1460}
+
+// encodeMSS returns the index of the largest mssTable entry that does not
+// exceed mss. Any mss smaller than mssTable[1] maps to index 0, including
+// values below mssTable[0].
+func encodeMSS(mss uint16) uint32 {
+	idx := len(mssTable) - 1
+	for idx > 0 && mss < mssTable[idx] {
+		idx--
+	}
+	return uint32(idx)
+}
+
+// listenContext is used by a listening endpoint to store state used while
+// listening for connections. This struct is allocated by the listen goroutine
+// and must not be accessed or have its methods called concurrently as they
+// may mutate the stored objects.
+type listenContext struct {
+	stack *stack.Stack
+
+	// synRcvdCount is a reference to the stack level synRcvdCount.
+	synRcvdCount *synRcvdCounter
+
+	// rcvWnd is the receive window that is sent by this listening context
+	// in the initial SYN-ACK.
+	rcvWnd seqnum.Size
+
+	// nonce holds two blocks of random bytes, filled once when the
+	// context is created. cookieHash mixes one of them into the hash
+	// that SYN cookies are built from.
+	nonce [2][sha1.BlockSize]byte
+
+	// listenEP is a reference to the listening endpoint associated with
+	// this context. Can be nil if the context is created by the forwarder.
+	listenEP *endpoint
+
+	// hasherMu protects hasher.
+	hasherMu sync.Mutex
+	// hasher is the hash function used to generate a SYN cookie.
+	hasher hash.Hash
+
+	// v6Only is true if listenEP is a dual stack socket and has the
+	// IPV6_V6ONLY option set.
+	v6Only bool
+
+	// netProto is the listening endpoint's network protocol (IPv4/IPv6);
+	// when zero, new endpoints use the incoming segment's route protocol.
+	netProto tcpip.NetworkProtocolNumber
+
+	// pendingMu protects pendingEndpoints. This should only be accessed
+	// by the listening endpoint's worker goroutine.
+	//
+	// Lock Ordering: listenEP.workerMu -> pendingMu
+	pendingMu sync.Mutex
+	// pending is used to wait for all pendingEndpoints to finish when
+	// a socket is closed.
+	pending sync.WaitGroup
+	// pendingEndpoints is a map of all endpoints for which a handshake is
+	// in progress, keyed by endpoint ID.
+	pendingEndpoints map[stack.TransportEndpointID]*endpoint
+}
+
+// timeStamp returns the low tsLen bits of the Unix time in 64-second units.
+func timeStamp() uint32 {
+	return tsMask & uint32(time.Now().Unix()>>6)
+}
+
+// newListenContext returns a listenContext for listenEP with freshly
+// generated cookie nonces. It panics if stk has no TCP protocol instance.
+func newListenContext(stk *stack.Stack, listenEP *endpoint, rcvWnd seqnum.Size, v6Only bool, netProto tcpip.NetworkProtocolNumber) *listenContext {
+	proto, ok := stk.TransportProtocolInstance(ProtocolNumber).(*protocol)
+	if !ok {
+		panic(fmt.Sprintf("unable to get TCP protocol instance from stack: %+v", stk))
+	}
+	ctx := &listenContext{
+		stack:            stk,
+		synRcvdCount:     proto.SynRcvdCounter(),
+		rcvWnd:           rcvWnd,
+		listenEP:         listenEP,
+		hasher:           sha1.New(),
+		v6Only:           v6Only,
+		netProto:         netProto,
+		pendingEndpoints: make(map[stack.TransportEndpointID]*endpoint),
+	}
+
+	rand.Read(ctx.nonce[0][:]) // NOTE(review): the error returned by rand.Read is ignored.
+	rand.Read(ctx.nonce[1][:])
+	return ctx
+}
+
+// cookieHash computes a 32-bit hash over the connection id, the timestamp ts
+// and the nonce selected by nonceIndex. Cookies are created from, and
+// validated against, this value.
+func (l *listenContext) cookieHash(id stack.TransportEndpointID, ts uint32, nonceIndex int) uint32 {
+	// Pack the fixed-size inputs: both ports followed by the timestamp.
+	var fixed [8]byte
+	binary.BigEndian.PutUint16(fixed[0:2], id.LocalPort)
+	binary.BigEndian.PutUint16(fixed[2:4], id.RemotePort)
+	binary.BigEndian.PutUint32(fixed[4:8], ts)
+
+	// The hasher is shared, so hold hasherMu from Reset through Sum.
+	l.hasherMu.Lock()
+	l.hasher.Reset()
+	l.hasher.Write(fixed[:])
+	l.hasher.Write(l.nonce[nonceIndex][:])
+	io.WriteString(l.hasher, string(id.LocalAddress))
+	io.WriteString(l.hasher, string(id.RemoteAddress))
+
+	// Finish the digest; only its first 4 bytes are returned.
+	var digest [sha1.Size]byte
+	sum := l.hasher.Sum(digest[:0])
+	l.hasherMu.Unlock()
+
+	return binary.BigEndian.Uint32(sum)
+}
+
+// createCookie derives the SYN cookie for id from the incoming sequence number
+// seq, the current timeStamp and data, so that isCookieValid can recover data.
+func (l *listenContext) createCookie(id stack.TransportEndpointID, seq seqnum.Value, data uint32) seqnum.Value {
+	ts := timeStamp()
+	base := l.cookieHash(id, 0, 0) + uint32(seq) + (ts << tsOffset)
+	masked := (l.cookieHash(id, ts, 1) + data) & hashMask
+	return seqnum.Value(base + masked)
+}
+
+// isCookieValid reports whether cookie is a valid, unexpired SYN cookie for
+// id and seq. On success it also returns the data that was passed to
+// createCookie when the cookie was generated.
+func (l *listenContext) isCookieValid(id stack.TransportEndpointID, cookie seqnum.Value, seq seqnum.Value) (uint32, bool) {
+	now := timeStamp()
+	rest := uint32(cookie) - l.cookieHash(id, 0, 0) - uint32(seq)
+	issued := rest >> tsOffset
+	if age := (now - issued) & tsMask; age > maxTSDiff {
+		return 0, false
+	}
+
+	return (rest - l.cookieHash(id, issued, 1)) & hashMask, true
+}
+
+// createConnectingEndpoint creates a new endpoint in StateConnecting for the
+// connection described by s. NOTE(review): iss and irs are unused in the body.
+func (l *listenContext) createConnectingEndpoint(s *segment, iss seqnum.Value, irs seqnum.Value, rcvdSynOpts *header.TCPSynOptions, queue *waiter.Queue) *endpoint {
+	// Use the context's protocol, or the segment's route protocol if unset.
+	netProto := l.netProto
+	if netProto == 0 {
+		netProto = s.route.NetProto
+	}
+	n := newEndpoint(l.stack, netProto, queue)
+	n.v6only = l.v6Only
+	n.ID = s.id
+	n.boundNICID = s.route.NICID()
+	n.route = s.route.Clone()
+	n.effectiveNetProtos = []tcpip.NetworkProtocolNumber{s.route.NetProto}
+	n.rcvBufSize = int(l.rcvWnd)
+	n.amss = mssForRoute(&n.route) // Must follow the n.route assignment above.
+	n.setEndpointState(StateConnecting)
+
+	n.maybeEnableTimestamp(rcvdSynOpts)
+	n.maybeEnableSACKPermitted(rcvdSynOpts)
+
+	n.initGSO()
+
+	// Bootstrap the auto tuning algorithm. Starting at zero will result in
+	// a large step function on the first window adjustment causing the
+	// window to grow to a really large value.
+	n.rcvAutoParams.prevCopied = n.initialReceiveWindow()
+
+	return n
+}
+
+// createEndpointAndPerformHandshake creates a new endpoint for the SYN in s,
+// registers it with the stack and runs the passive TCP 3-way handshake on it.
+//
+// On success the new endpoint is returned with its mu held.
+func (l *listenContext) createEndpointAndPerformHandshake(s *segment, opts *header.TCPSynOptions, queue *waiter.Queue, owner tcpip.PacketOwner) (*endpoint, *tcpip.Error) {
+	// Create new endpoint.
+	irs := s.sequenceNumber
+	isn := generateSecureISN(s.id, l.stack.Seed())
+	ep := l.createConnectingEndpoint(s, isn, irs, opts, queue)
+
+	// Lock the endpoint before registering to ensure that no out of
+	// band changes are possible due to incoming packets etc till
+	// the endpoint is done initializing.
+	ep.mu.Lock()
+	ep.owner = owner
+
+	// listenEP is nil when listenContext is used by tcp.Forwarder.
+	deferAccept := time.Duration(0)
+	if l.listenEP != nil {
+		l.listenEP.mu.Lock()
+		if l.listenEP.EndpointState() != StateListen {
+			// The listener is no longer listening; give up.
+			l.listenEP.mu.Unlock()
+			// Ensure we release any registrations done by the newly
+			// created endpoint.
+			ep.mu.Unlock()
+			ep.Close()
+
+			return nil, tcpip.ErrConnectionAborted
+		}
+		l.addPendingEndpoint(ep)
+
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		l.listenEP.propagateInheritableOptionsLocked(ep)
+
+		if !ep.reserveTupleLocked() {
+			ep.mu.Unlock()
+			ep.Close()
+
+			// listenEP is known to be non-nil in this branch, so
+			// no further nil check is needed before cleaning up.
+			l.removePendingEndpoint(ep)
+			l.listenEP.mu.Unlock()
+
+			return nil, tcpip.ErrConnectionAborted
+		}
+
+		deferAccept = l.listenEP.deferAccept
+		l.listenEP.mu.Unlock()
+	}
+
+	// Register new endpoint so that packets are routed to it.
+	if err := ep.stack.RegisterTransportEndpoint(ep.boundNICID, ep.effectiveNetProtos, ProtocolNumber, ep.ID, ep, ep.boundPortFlags, ep.boundBindToDevice); err != nil {
+		ep.mu.Unlock()
+		ep.Close()
+
+		if l.listenEP != nil {
+			l.removePendingEndpoint(ep)
+		}
+
+		ep.drainClosingSegmentQueue()
+
+		return nil, err
+	}
+
+	ep.isRegistered = true
+
+	// Perform the 3-way handshake.
+	h := newPassiveHandshake(ep, seqnum.Size(ep.initialReceiveWindow()), isn, irs, opts, deferAccept)
+	if err := h.execute(); err != nil {
+		ep.mu.Unlock()
+		ep.Close()
+		ep.notifyAborted()
+
+		if l.listenEP != nil {
+			l.removePendingEndpoint(ep)
+		}
+
+		ep.drainClosingSegmentQueue()
+
+		return nil, err
+	}
+	ep.isConnectNotified = true
+
+	// Update the receive window scaling. We can't do it before the
+	// handshake because it's possible that the peer doesn't support window
+	// scaling.
+	ep.rcv.rcvWndScale = h.effectiveRcvWndScale()
+
+	return ep, nil
+}
+
+func (l *listenContext) addPendingEndpoint(n *endpoint) {
+	l.pendingMu.Lock()
+	defer l.pendingMu.Unlock()
+	l.pending.Add(1) // Balanced by the Done call in removePendingEndpoint.
+	l.pendingEndpoints[n.ID] = n
+}
+
+func (l *listenContext) removePendingEndpoint(n *endpoint) {
+	l.pendingMu.Lock()
+	defer l.pendingMu.Unlock()
+	delete(l.pendingEndpoints, n.ID)
+	l.pending.Done() // Releases the count taken in addPendingEndpoint.
+}
+
+func (l *listenContext) closeAllPendingEndpoints() {
+	l.pendingMu.Lock()
+	for _, n := range l.pendingEndpoints {
+		n.notifyProtocolGoroutine(notifyClose) // Ask each in-progress handshake endpoint to close.
+	}
+	l.pendingMu.Unlock()
+	l.pending.Wait() // Wait without pendingMu: removePendingEndpoint takes it before calling Done.
+}
+
+// deliverAccepted delivers the newly-accepted endpoint to the listener. If the
+// endpoint has transitioned out of the listen state (acceptedChan is nil),
+// the new endpoint is closed instead.
+func (e *endpoint) deliverAccepted(n *endpoint) {
+	e.mu.Lock()
+	e.pendingAccepted.Add(1)
+	e.mu.Unlock()
+	defer e.pendingAccepted.Done()
+
+	e.acceptMu.Lock()
+	for {
+		if e.acceptedChan == nil {
+			e.acceptMu.Unlock()
+			n.notifyProtocolGoroutine(notifyReset) // Not listening any more: n is sent notifyReset instead of being queued.
+			return
+		}
+		select {
+		case e.acceptedChan <- n:
+			e.acceptMu.Unlock()
+			e.waiterQueue.Notify(waiter.EventIn)
+			return
+		default:
+			e.acceptCond.Wait() // Queue is full; retry after a signal. NOTE(review): presumably acceptCond is bound to acceptMu — confirm.
+		}
+	}
+}
+
+// propagateInheritableOptionsLocked copies the options that accepted
+// endpoints inherit from the listening endpoint e onto the new endpoint n.
+//
+// Precondition: e.mu and n.mu must be held.
+func (e *endpoint) propagateInheritableOptionsLocked(n *endpoint) {
+	n.boundPortFlags = e.boundPortFlags
+	n.boundBindToDevice = e.boundBindToDevice
+	n.portFlags = e.portFlags
+	n.userTimeout = e.userTimeout
+}
+
+// reserveTupleLocked reserves the connection tuple of an accepted endpoint
+// with the stack and records the reservation on e. It reports whether the
+// reservation succeeded.
+//
+// Preconditions: propagateInheritableOptionsLocked was called; e.mu is held.
+func (e *endpoint) reserveTupleLocked() bool {
+	dest := tcpip.FullAddress{Addr: e.ID.RemoteAddress, Port: e.ID.RemotePort}
+	reserved := e.stack.ReserveTuple(
+		e.effectiveNetProtos,
+		ProtocolNumber,
+		e.ID.LocalAddress,
+		e.ID.LocalPort,
+		e.boundPortFlags,
+		e.boundBindToDevice,
+		dest,
+	)
+	if reserved {
+		// Remember the reservation on the endpoint.
+		e.isPortReserved = true
+		e.boundDest = dest
+	}
+	return reserved
+}
+
+// notifyAborted wakes every waiter on an endpoint that was registered but
+// never accepted.
+//
+// Such a socket normally has no registered waiters, but stack.Wait() waits
+// for all registered endpoints to stop and expects an EventHUp.
+func (e *endpoint) notifyAborted() {
+	mask := waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut
+	e.waiterQueue.Notify(mask)
+}
+
+// handleSynSegment is called in its own goroutine once the listening endpoint
+// receives a SYN segment. It is responsible for completing the handshake and
+// queueing the new endpoint for acceptance.
+//
+// A limited number of these goroutines are allowed before TCP starts using SYN
+// cookies to accept connections.
+func (e *endpoint) handleSynSegment(ctx *listenContext, s *segment, opts *header.TCPSynOptions) {
+	defer ctx.synRcvdCount.dec() // Deferred first, so it runs last.
+	defer func() {
+		e.mu.Lock()
+		e.decSynRcvdCount()
+		e.mu.Unlock()
+	}()
+	defer s.decRef() // Balances the incRef taken in handleListenSegment before this goroutine is started.
+
+	n, err := ctx.createEndpointAndPerformHandshake(s, opts, &waiter.Queue{}, e.owner)
+	if err != nil {
+		e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
+		e.stats.FailedConnectionAttempts.Increment()
+		return
+	}
+	ctx.removePendingEndpoint(n) // Handshake finished; n is no longer pending.
+	n.startAcceptedLoop()
+	e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
+
+	e.deliverAccepted(n)
+}
+
+func (e *endpoint) incSynRcvdCount() bool {
+	e.acceptMu.Lock()
+	canInc := e.synRcvdCount < cap(e.acceptedChan) // True while synRcvdCount is below the accept queue capacity.
+	e.acceptMu.Unlock()
+	if canInc {
+		e.synRcvdCount++ // NOTE(review): updated after acceptMu is released; presumably the caller holds e.mu — confirm.
+	}
+	return canInc
+}
+
+func (e *endpoint) decSynRcvdCount() {
+	e.synRcvdCount-- // handleSynSegment calls this with e.mu held.
+}
+
+func (e *endpoint) acceptQueueIsFull() bool {
+	e.acceptMu.Lock()
+	defer e.acceptMu.Unlock()
+	// Queued endpoints plus handshakes in SYN-RCVD count against capacity.
+	return len(e.acceptedChan)+e.synRcvdCount >= cap(e.acceptedChan)
+}
+
+// handleListenSegment handles a segment received in the listen state: a SYN
+// starts a handshake or is answered with a SYN cookie; an ACK may complete one.
+func (e *endpoint) handleListenSegment(ctx *listenContext, s *segment) {
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
+	if rcvClosed || s.flagsAreSet(header.TCPFlagSyn|header.TCPFlagAck) {
+		// If the endpoint is shutdown, reply with reset.
+		//
+		// RFC 793 section 3.4 page 35 (figure 12) outlines that a RST
+		// must be sent in response to a SYN-ACK while in the listen
+		// state to prevent completing a handshake from an old SYN.
+		replyWithReset(s, e.sendTOS, e.ttl)
+		return
+	}
+
+	// TODO(b/143300739): Use the userMSS of the listening socket
+	// for accepted sockets.
+
+	switch {
+	case s.flags == header.TCPFlagSyn:
+		opts := parseSynSegmentOptions(s)
+		if ctx.synRcvdCount.inc() {
+			// Only handle the syn if the following conditions hold
+			//   - accept queue is not full.
+			//   - number of connections in synRcvd state is less than the
+			//     backlog.
+			if !e.acceptQueueIsFull() && e.incSynRcvdCount() {
+				s.incRef()
+				go e.handleSynSegment(ctx, s, &opts) // S/R-SAFE: synRcvdCount is the barrier.
+				return
+			}
+			ctx.synRcvdCount.dec()
+			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
+			e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
+			e.stack.Stats().DroppedPackets.Increment()
+			return
+		}
+
+		// Cookies are in use: if the endpoint accept queue is full
+		// then drop the syn.
+		if e.acceptQueueIsFull() {
+			e.stack.Stats().TCP.ListenOverflowSynDrop.Increment()
+			e.stats.ReceiveErrors.ListenOverflowSynDrop.Increment()
+			e.stack.Stats().DroppedPackets.Increment()
+			return
+		}
+		cookie := ctx.createCookie(s.id, s.sequenceNumber, encodeMSS(opts.MSS))
+
+		// Send the SYN-ACK without window scaling because we
+		// currently don't encode this information in the cookie.
+		//
+		// Enable Timestamp option if the original syn did have
+		// the timestamp option specified.
+		synOpts := header.TCPSynOptions{
+			WS:    -1,
+			TS:    opts.TS,
+			TSVal: tcpTimeStamp(timeStampOffset()),
+			TSEcr: opts.TSVal,
+			MSS:   mssForRoute(&s.route),
+		}
+		e.sendSynTCP(&s.route, tcpFields{
+			id:     s.id,
+			ttl:    e.ttl,
+			tos:    e.sendTOS,
+			flags:  header.TCPFlagSyn | header.TCPFlagAck,
+			seq:    cookie,
+			ack:    s.sequenceNumber + 1,
+			rcvWnd: ctx.rcvWnd,
+		}, synOpts)
+		e.stack.Stats().TCP.ListenOverflowSynCookieSent.Increment()
+
+	case (s.flags & header.TCPFlagAck) != 0:
+		if e.acceptQueueIsFull() {
+			// Silently drop the ack as the application can't accept
+			// the connection at this point. The ack will be
+			// retransmitted by the sender anyway and we can
+			// complete the connection at the time of retransmit if
+			// the backlog has space.
+			e.stack.Stats().TCP.ListenOverflowAckDrop.Increment()
+			e.stats.ReceiveErrors.ListenOverflowAckDrop.Increment()
+			e.stack.Stats().DroppedPackets.Increment()
+			return
+		}
+
+		if !ctx.synRcvdCount.synCookiesInUse() {
+			// When not using SYN cookies, as per RFC 793, section 3.9, page 64:
+			// Any acknowledgment is bad if it arrives on a connection still in
+			// the LISTEN state.  An acceptable reset segment should be formed
+			// for any arriving ACK-bearing segment.  The RST should be
+			// formatted as follows:
+			//
+			//  <SEQ=SEG.ACK><CTL=RST>
+			//
+			// Send a reset as this is an ACK for which there is no
+			// half open connections and we are not using cookies
+			// yet.
+			//
+			// The only time we should reach here when a connection
+			// was opened and closed really quickly and a delayed
+			// ACK was received from the sender.
+			replyWithReset(s, e.sendTOS, e.ttl)
+			return
+		}
+
+		iss := s.ackNumber - 1
+		irs := s.sequenceNumber - 1
+
+		// Since SYN cookies are in use this is potentially an ACK to a
+		// SYN-ACK we sent but don't have a half open connection state
+		// as cookies are being used to protect against a potential SYN
+		// flood. In such cases validate the cookie and if valid create
+		// a fully connected endpoint and deliver to the accept queue.
+		//
+		// If not, silently drop the ACK to avoid leaking information
+		// when under a potential syn flood attack.
+		//
+		// Validate the cookie.
+		data, ok := ctx.isCookieValid(s.id, iss, irs)
+		if !ok || int(data) >= len(mssTable) {
+			e.stack.Stats().TCP.ListenOverflowInvalidSynCookieRcvd.Increment()
+			e.stack.Stats().DroppedPackets.Increment()
+			return
+		}
+		e.stack.Stats().TCP.ListenOverflowSynCookieRcvd.Increment()
+		// Create newly accepted endpoint and deliver it.
+		rcvdSynOptions := &header.TCPSynOptions{
+			MSS: mssTable[data],
+			// Disable Window scaling as original SYN is
+			// lost.
+			WS: -1,
+		}
+
+		// When syn cookies are in use we enable timestamp only
+		// if the ack specifies the timestamp option assuming
+		// that the other end did in fact negotiate the
+		// timestamp option in the original SYN.
+		if s.parsedOptions.TS {
+			rcvdSynOptions.TS = true
+			rcvdSynOptions.TSVal = s.parsedOptions.TSVal
+			rcvdSynOptions.TSEcr = s.parsedOptions.TSEcr
+		}
+
+		n := ctx.createConnectingEndpoint(s, iss, irs, rcvdSynOptions, &waiter.Queue{})
+
+		n.mu.Lock()
+
+		// Propagate any inheritable options from the listening endpoint
+		// to the newly created endpoint.
+		e.propagateInheritableOptionsLocked(n)
+
+		if !n.reserveTupleLocked() {
+			n.mu.Unlock()
+			n.Close()
+
+			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
+			e.stats.FailedConnectionAttempts.Increment()
+			return
+		}
+
+		// Register new endpoint so that packets are routed to it.
+		if err := n.stack.RegisterTransportEndpoint(n.boundNICID, n.effectiveNetProtos, ProtocolNumber, n.ID, n, n.boundPortFlags, n.boundBindToDevice); err != nil {
+			n.mu.Unlock()
+			n.Close()
+
+			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
+			e.stats.FailedConnectionAttempts.Increment()
+			return
+		}
+
+		n.isRegistered = true
+
+		// clear the tsOffset for the newly created
+		// endpoint as the Timestamp was already
+		// randomly offset when the original SYN-ACK was
+		// sent above.
+		n.tsOffset = 0
+
+		// Switch state to connected.
+		n.isConnectNotified = true
+		n.transitionToStateEstablishedLocked(&handshake{
+			ep:          n,
+			iss:         iss,
+			ackNum:      irs + 1,
+			rcvWnd:      seqnum.Size(n.initialReceiveWindow()),
+			sndWnd:      s.window,
+			rcvWndScale: e.rcvWndScaleForHandshake(),
+			sndWndScale: rcvdSynOptions.WS,
+			mss:         rcvdSynOptions.MSS,
+		})
+
+		// Start the protocol goroutine.
+		n.startAcceptedLoop()
+		e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
+
+		// Do the delivery in a separate goroutine so
+		// that we don't block the listen loop in case
+		// the application is slow to accept or stops
+		// accepting.
+		//
+		// NOTE: This won't result in an unbounded
+		// number of goroutines as we do check before
+		// entering here that there was at least some
+		// space available in the backlog.
+		go e.deliverAccepted(n)
+	}
+}
+
+// protocolListenLoop is the main loop of a listening TCP endpoint. It runs in
+// its own goroutine and is responsible for handling connection requests.
+func (e *endpoint) protocolListenLoop(rcvWnd seqnum.Size) *tcpip.Error {
+	e.mu.Lock()
+	v6Only := e.v6only
+	ctx := newListenContext(e.stack, e, rcvWnd, v6Only, e.NetProto)
+
+	defer func() {
+		// Mark endpoint as closed. This will prevent goroutines running
+		// handleSynSegment() from attempting to queue new connections
+		// to the endpoint.
+		e.setEndpointState(StateClose)
+
+		// close any endpoints in SYN-RCVD state.
+		ctx.closeAllPendingEndpoints()
+
+		// Do cleanup if needed.
+		e.completeWorkerLocked()
+
+		if e.drainDone != nil {
+			close(e.drainDone)
+		}
+		e.mu.Unlock()
+
+		e.drainClosingSegmentQueue()
+
+		// Notify waiters that the endpoint is shutdown.
+		e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
+	}()
+
+	s := sleep.Sleeper{}
+	s.AddWaker(&e.notificationWaker, wakerForNotification)
+	s.AddWaker(&e.newSegmentWaker, wakerForNewSegment)
+	for {
+		e.mu.Unlock() // e.mu is not held while waiting in Fetch.
+		index, _ := s.Fetch(true)
+		e.mu.Lock()
+		switch index {
+		case wakerForNotification:
+			n := e.fetchNotifications()
+			if n&notifyClose != 0 {
+				return nil
+			}
+			if n&notifyDrain != 0 {
+				for !e.segmentQueue.empty() {
+					s := e.segmentQueue.dequeue() // Shadows the Sleeper s inside this loop body.
+					e.handleListenSegment(ctx, s)
+					s.decRef()
+				}
+				close(e.drainDone)
+				e.mu.Unlock()
+				<-e.undrain // Wait, with e.mu released, for a receive on undrain.
+				e.mu.Lock()
+			}
+
+		case wakerForNewSegment:
+			// Process at most maxSegmentsPerWake segments.
+			mayRequeue := true
+			for i := 0; i < maxSegmentsPerWake; i++ {
+				s := e.segmentQueue.dequeue() // Shadows the Sleeper s inside this loop body.
+				if s == nil {
+					mayRequeue = false
+					break
+				}
+
+				e.handleListenSegment(ctx, s)
+				s.decRef()
+			}
+
+			// If the queue is not empty, make sure we'll wake up
+			// in the next iteration.
+			if mayRequeue && !e.segmentQueue.empty() {
+				e.newSegmentWaker.Assert()
+			}
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/connect.go b/pkg/tcpip/transport/tcp/connect.go
new file mode 100644
index 000000000..81b740115
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/connect.go
@@ -0,0 +1,1713 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"encoding/binary"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// maxSegmentsPerWake is the maximum number of segments to process in the main
+// protocol goroutine per wake-up. Yielding [after this number of segments are
+// processed] allows other events to be processed as well (e.g., timeouts,
+// resets, etc.).
+const maxSegmentsPerWake = 100
+
+type handshakeState int
+
+// The following are the possible states of the TCP connection during a 3-way
+// handshake. A depiction of the states and transitions can be found in RFC 793,
+// page 23.
+const (
+	handshakeSynSent handshakeState = iota
+	handshakeSynRcvd
+	handshakeCompleted
+)
+
+// The following are the IDs under which wakers are added to a sleep.Sleeper.
+const (
+	wakerForNotification = iota
+	wakerForNewSegment
+	wakerForResend
+	wakerForResolution
+)
+
+const (
+	// Maximum space available for options.
+	maxOptionSize = 40
+)
+
+// handshake holds the state used during a TCP 3-way handshake.
+//
+// NOTE: handshake.ep.mu is held during handshake processing. It is released if
+// we are going to block and reacquired when we start processing an event.
+type handshake struct {
+	ep     *endpoint
+	state  handshakeState
+	active bool
+	flags  uint8
+	ackNum seqnum.Value
+
+	// iss is the initial send sequence number, as defined in RFC 793.
+	iss seqnum.Value
+
+	// rcvWnd is the receive window, as defined in RFC 793.
+	rcvWnd seqnum.Size
+
+	// sndWnd is the send window, as defined in RFC 793.
+	sndWnd seqnum.Size
+
+	// mss is the maximum segment size received from the peer.
+	mss uint16
+
+	// sndWndScale is the send window scale, as defined in RFC 1323. A
+	// negative value means no scaling is supported by the peer.
+	sndWndScale int
+
+	// rcvWndScale is the receive window scale, as defined in RFC 1323.
+	rcvWndScale int
+
+	// startTime is the time at which the first SYN/SYN-ACK was sent.
+	startTime time.Time
+
+	// deferAccept if non-zero will drop the final ACK for a passive
+	// handshake till an ACK segment with data is received or the timeout is
+	// hit.
+	deferAccept time.Duration
+
+	// acked is true if the final ACK for a 3-way handshake has
+	// been received. This is required to stop retransmitting the
+	// original SYN-ACK when deferAccept is enabled.
+	acked bool
+}
+
+// newHandshake returns an active handshake for ep that stores rcvWnd as its
+// receive window and starts out in the SYN-SENT state.
+func newHandshake(ep *endpoint, rcvWnd seqnum.Size) handshake {
+	hs := handshake{ep: ep, active: true, rcvWnd: rcvWnd}
+	hs.rcvWndScale = ep.rcvWndScaleForHandshake()
+	// resetState picks the initial send sequence number and sets the
+	// SYN flag along with the SYN-SENT state.
+	hs.resetState()
+	return hs
+}
+
+func newPassiveHandshake(ep *endpoint, rcvWnd seqnum.Size, isn, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) handshake {
+	h := newHandshake(ep, rcvWnd) // Built as an active handshake first...
+	h.resetToSynRcvd(isn, irs, opts, deferAccept) // ...then switched to the passive SYN-RCVD state with the given isn and irs.
+	return h
+}
+
+// FindWndScale returns the smallest window scale shift, capped at
+// header.MaxWndScale, with which a 16-bit window field can advertise wnd.
+func FindWndScale(wnd seqnum.Size) int {
+	// Windows that fit in the 16-bit field need no scaling.
+	if wnd <= 0xffff {
+		return 0
+	}
+
+	// Double the representable limit until it covers wnd or the shift
+	// reaches the maximum allowed value.
+	scale := 0
+	for limit := seqnum.Size(0xffff); wnd > limit && scale < header.MaxWndScale; limit <<= 1 {
+		scale++
+	}
+	return scale
+}
+
+// resetState resets the state of the handshake object such that it becomes
+// ready for a new 3-way handshake.
+func (h *handshake) resetState() {
+	b := make([]byte, 4) // NOTE(review): b is filled below but never read in this function.
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+
+	h.state = handshakeSynSent
+	h.flags = header.TCPFlagSyn
+	h.ackNum = 0
+	h.mss = 0
+	h.iss = generateSecureISN(h.ep.ID, h.ep.stack.Seed()) // The ISS comes from the tuple hash and clock, not from b.
+}
+
+// generateSecureISN generates a secure Initial Sequence number based on the
+// recommendation here https://tools.ietf.org/html/rfc6528#page-3.
+func generateSecureISN(id stack.TransportEndpointID, seed uint32) seqnum.Value {
+	hasher := jenkins.Sum32(seed)
+	hasher.Write([]byte(id.LocalAddress))
+	hasher.Write([]byte(id.RemoteAddress))
+	var port [2]byte
+	for _, p := range [...]uint16{id.LocalPort, id.RemotePort} {
+		binary.LittleEndian.PutUint16(port[:], p)
+		hasher.Write(port[:])
+	}
+	// The clock below advances once every 64ns (UnixNano >> 6). This is
+	// similar to what linux uses to generate a sequence number that
+	// overlaps less than one time per MSL (2 minutes).
+	//
+	// A 64ns clock ticks 10^9/64 = 15625000 times in a second.
+	// To wrap the whole 32 bit space would require
+	// 2^32/15625000 ~ 274 seconds.
+	//
+	// Which sort of guarantees that we won't reuse the ISN for a new
+	// connection for the same tuple for at least 274s.
+	clock := uint32(time.Now().UnixNano() >> 6)
+	return seqnum.Value(hasher.Sum32() + clock)
+}
+
+// effectiveRcvWndScale returns the receive window scale that is actually in
+// effect: rcvWndScale when the peer supports window scaling, and zero when
+// it does not (sndWndScale is negative).
+func (h *handshake) effectiveRcvWndScale() uint8 {
+	if h.sndWndScale >= 0 {
+		return uint8(h.rcvWndScale)
+	}
+	return 0
+}
+
+// resetToSynRcvd turns h into a passive handshake in the SYN-RCVD state, using
+// iss and irs as the sequence numbers and opts as the peer's SYN options.
+func (h *handshake) resetToSynRcvd(iss seqnum.Value, irs seqnum.Value, opts *header.TCPSynOptions, deferAccept time.Duration) {
+	// Passive side: the peer's SYN is answered with a SYN-ACK.
+	h.active = false
+	h.flags = header.TCPFlagSyn | header.TCPFlagAck
+	h.state = handshakeSynRcvd
+	h.iss, h.ackNum = iss, irs+1
+	// Parameters taken from the peer's SYN options.
+	h.mss, h.sndWndScale = opts.MSS, opts.WS
+	h.deferAccept = deferAccept
+	h.ep.setEndpointState(StateSynRecv)
+}
+
+// checkAck reports whether the ACK number of a segment received during the
+// 3-way handshake is acceptable; segments without ACK always are. For an
+// unacceptable ACK a RST segment is sent back and false is returned.
+func (h *handshake) checkAck(s *segment) bool {
+	if !s.flagIsSet(header.TCPFlagAck) || s.ackNumber == h.iss+1 {
+		return true
+	}
+
+	// RFC 793, page 36, states that a reset must be generated when
+	// the connection is in any non-synchronized state and an
+	// incoming segment acknowledges something not yet sent. The
+	// connection remains in the same state.
+	ack := s.sequenceNumber.Add(s.logicalLen())
+	h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, s.ackNumber, ack, 0)
+	return false
+}
+
+// synSentState handles a segment received when the TCP 3-way handshake is in
+// the SYN-SENT state.
+func (h *handshake) synSentState(s *segment) *tcpip.Error {
+	// RFC 793, page 37, states that in the SYN-SENT state, a reset is
+	// acceptable if the ack field acknowledges the SYN.
+	if s.flagIsSet(header.TCPFlagRst) {
+		if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == h.iss+1 {
+			// RFC 793, page 67, states that "If the RST bit is set [and] If the ACK
+			// was acceptable then signal the user "error: connection reset", drop
+			// the segment, enter CLOSED state, delete TCB, and return."
+			h.ep.workerCleanup = true
+			// Although the RFC above calls out ECONNRESET, Linux actually returns
+			// ECONNREFUSED here so we do as well.
+			return tcpip.ErrConnectionRefused
+		}
+		return nil
+	}
+
+	if !h.checkAck(s) {
+		return nil
+	}
+
+	// We are in the SYN-SENT state. We only care about segments that have
+	// the SYN flag.
+	if !s.flagIsSet(header.TCPFlagSyn) {
+		return nil
+	}
+
+	// Parse the SYN options.
+	rcvSynOpts := parseSynSegmentOptions(s)
+
+	// Remember if the Timestamp option was negotiated.
+	h.ep.maybeEnableTimestamp(&rcvSynOpts)
+
+	// Remember if the SACKPermitted option was negotiated.
+	h.ep.maybeEnableSACKPermitted(&rcvSynOpts)
+
+	// Remember the sequence we'll ack from now on.
+	h.ackNum = s.sequenceNumber + 1
+	h.flags |= header.TCPFlagAck
+	h.mss = rcvSynOpts.MSS
+	h.sndWndScale = rcvSynOpts.WS
+
+	// If this is a SYN ACK response, we only need to acknowledge the SYN
+	// and the handshake is completed.
+	if s.flagIsSet(header.TCPFlagAck) {
+		h.state = handshakeCompleted
+
+		h.ep.transitionToStateEstablishedLocked(h)
+
+		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd>>h.effectiveRcvWndScale())
+		return nil
+	}
+
+	// A SYN segment was received, but no ACK in it (a simultaneous open).
+	// We acknowledge the SYN but resend our own SYN and wait for it to be
+	// acknowledged in the SYN-RCVD state.
+	h.state = handshakeSynRcvd
+	ttl := h.ep.ttl
+	amss := h.ep.amss
+	h.ep.setEndpointState(StateSynRecv)
+	synOpts := header.TCPSynOptions{
+		WS:    int(h.effectiveRcvWndScale()),
+		TS:    rcvSynOpts.TS,
+		TSVal: h.ep.timestamp(),
+		TSEcr: h.ep.recentTimestamp(),
+
+		// We only send SACKPermitted if the other side indicated it
+		// permits SACK. This is not explicitly defined in the RFC but
+		// this is the behaviour implemented by Linux.
+		SACKPermitted: rcvSynOpts.SACKPermitted,
+		MSS:           amss,
+	}
+	if ttl == 0 { // An ep.ttl of 0 falls back to the route's default TTL.
+		ttl = s.route.DefaultTTL()
+	}
+	h.ep.sendSynTCP(&s.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
+	return nil
+}
+
+// synRcvdState handles a segment received when the TCP 3-way handshake is in
+// the SYN-RCVD state.
+//
+// An in-window RST fails the handshake with ErrConnectionRefused. A SYN whose
+// sequence number differs from the one previously received is answered with
+// a RST; an active handshake then starts over, while a passive one fails
+// with ErrInvalidEndpointState. An acceptable ACK completes the handshake,
+// unless it is dropped because of deferAccept or a missing timestamp option.
+// In all remaining cases nil is returned and the handshake stays in SYN-RCVD.
+func (h *handshake) synRcvdState(s *segment) *tcpip.Error {
+	if s.flagIsSet(header.TCPFlagRst) {
+		// RFC 793, page 37, states that in the SYN-RCVD state, a reset
+		// is acceptable if the sequence number is in the window.
+		if s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
+			return tcpip.ErrConnectionRefused
+		}
+		return nil
+	}
+
+	// checkAck has already replied with a RST if the ACK was unacceptable.
+	if !h.checkAck(s) {
+		return nil
+	}
+
+	// RFC 793, Section 3.9, page 69, states that in the SYN-RCVD state, a
+	// sequence number outside of the window causes an ACK with the proper seq
+	// number and "After sending the acknowledgment, drop the unacceptable
+	// segment and return."
+	if !s.sequenceNumber.InWindow(h.ackNum, h.rcvWnd) {
+		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, h.iss+1, h.ackNum, h.rcvWnd)
+		return nil
+	}
+
+	if s.flagIsSet(header.TCPFlagSyn) && s.sequenceNumber != h.ackNum-1 {
+		// We received two SYN segments with different sequence
+		// numbers, so we reset this and restart the whole
+		// process, except that we don't reset the timer.
+		ack := s.sequenceNumber.Add(s.logicalLen())
+		seq := seqnum.Value(0)
+		if s.flagIsSet(header.TCPFlagAck) {
+			seq = s.ackNumber
+		}
+		h.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagRst|header.TCPFlagAck, seq, ack, 0)
+
+		// Only an active handshake is restarted; a passive one fails.
+		if !h.active {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		h.resetState()
+		synOpts := header.TCPSynOptions{
+			WS:            h.rcvWndScale,
+			TS:            h.ep.sendTSOk,
+			TSVal:         h.ep.timestamp(),
+			TSEcr:         h.ep.recentTimestamp(),
+			SACKPermitted: h.ep.sackPermitted,
+			MSS:           h.ep.amss,
+		}
+		h.ep.sendSynTCP(&s.route, tcpFields{
+			id:     h.ep.ID,
+			ttl:    h.ep.ttl,
+			tos:    h.ep.sendTOS,
+			flags:  h.flags,
+			seq:    h.iss,
+			ack:    h.ackNum,
+			rcvWnd: h.rcvWnd,
+		}, synOpts)
+		return nil
+	}
+
+	// We have previously received (and acknowledged) the peer's SYN. If the
+	// peer acknowledges our SYN, the handshake is completed.
+	if s.flagIsSet(header.TCPFlagAck) {
+		// If deferAccept is not zero and this is a bare ACK and the
+		// timeout is not hit then drop the ACK. The ACK is still
+		// remembered in h.acked, which execute consults when deciding
+		// whether to resend the SYN-ACK.
+		if h.deferAccept != 0 && s.data.Size() == 0 && time.Since(h.startTime) < h.deferAccept {
+			h.acked = true
+			h.ep.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
+		// If the timestamp option is negotiated and the segment does
+		// not carry a timestamp option then the segment must be dropped
+		// as per https://tools.ietf.org/html/rfc7323#section-3.2.
+		if h.ep.sendTSOk && !s.parsedOptions.TS {
+			h.ep.stack.Stats().DroppedPackets.Increment()
+			return nil
+		}
+
+		// Update timestamp if required. See RFC7323, section-4.3.
+		if h.ep.sendTSOk && s.parsedOptions.TS {
+			h.ep.updateRecentTimestamp(s.parsedOptions.TSVal, h.ackNum, s.sequenceNumber)
+		}
+		h.state = handshakeCompleted
+
+		h.ep.transitionToStateEstablishedLocked(h)
+
+		// If the segment has data then requeue it for the receiver
+		// to process it again once main loop is started. The queue gets
+		// its own reference since the caller releases the one it holds.
+		if s.data.Size() > 0 {
+			s.incRef()
+			h.ep.enqueueSegment(s)
+		}
+		return nil
+	}
+
+	return nil
+}
+
+// handleSegment records the send window advertised in s and then hands s to
+// the handler that matches the current handshake state. Segments arriving in
+// any other state are ignored.
+func (h *handshake) handleSegment(s *segment) *tcpip.Error {
+	h.sndWnd = s.window
+	// The advertised window is only scaled for non-SYN segments, and only
+	// when a positive send window scale is in effect.
+	if h.sndWndScale > 0 && !s.flagIsSet(header.TCPFlagSyn) {
+		h.sndWnd <<= uint8(h.sndWndScale)
+	}
+
+	if h.state == handshakeSynSent {
+		return h.synSentState(s)
+	}
+	if h.state == handshakeSynRcvd {
+		return h.synRcvdState(s)
+	}
+	return nil
+}
+
+// processSegments feeds queued segments to the handshake state machine,
+// handling at most maxSegmentsPerWake of them per call. It returns the first
+// error reported by handleSegment, if any.
+func (h *handshake) processSegments() *tcpip.Error {
+	for handled := 0; handled < maxSegmentsPerWake; handled++ {
+		seg := h.ep.segmentQueue.dequeue()
+		if seg == nil {
+			// The queue is drained; there is nothing to reschedule.
+			return nil
+		}
+
+		err := h.handleSegment(seg)
+		seg.decRef()
+		if err != nil {
+			return err
+		}
+
+		// Once the handshake is completed the remaining segments are
+		// meant for the main protocol goroutine, so stop consuming
+		// them here.
+		if h.state == handshakeCompleted {
+			break
+		}
+	}
+
+	// Segments are still pending: assert the waker so that we are woken
+	// up again in the next iteration.
+	if !h.ep.segmentQueue.empty() {
+		h.ep.newSegmentWaker.Assert()
+	}
+
+	return nil
+}
+
+// resolveRoute blocks until resolution of the endpoint's route finishes. It
+// returns the result of route.Resolve (nil on success) once that is anything
+// other than ErrWouldBlock, or ErrAborted if a close notification arrives
+// while waiting. Drain notifications are honoured while waiting.
+func (h *handshake) resolveRoute() *tcpip.Error {
+	// Set up the wakers: one for resolution progress, one for endpoint
+	// notifications.
+	s := sleep.Sleeper{}
+	resolutionWaker := &sleep.Waker{}
+	s.AddWaker(resolutionWaker, wakerForResolution)
+	s.AddWaker(&h.ep.notificationWaker, wakerForNotification)
+	defer s.Done()
+
+	// Initial action is to resolve route.
+	index := wakerForResolution
+	for {
+		switch index {
+		case wakerForResolution:
+			if _, err := h.ep.route.Resolve(resolutionWaker); err != tcpip.ErrWouldBlock {
+				if err == tcpip.ErrNoLinkAddress {
+					h.ep.stats.SendErrors.NoLinkAddr.Increment()
+				} else if err != nil {
+					h.ep.stats.SendErrors.NoRoute.Increment()
+				}
+				// Either success (err == nil) or failure.
+				return err
+			}
+			// Resolution not completed. Keep trying...
+
+		case wakerForNotification:
+			n := h.ep.fetchNotifications()
+			if n&notifyClose != 0 {
+				h.ep.route.RemoveWaker(resolutionWaker)
+				return tcpip.ErrAborted
+			}
+			if n&notifyDrain != 0 {
+				// Signal that draining is done, then wait for the
+				// undrain signal with the endpoint lock released.
+				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
+				<-h.ep.undrain
+				h.ep.mu.Lock()
+			}
+		}
+
+		// Wait for notification.
+		index, _ = s.Fetch(true)
+	}
+}
+
+// execute executes the TCP 3-way handshake.
+//
+// It resolves the route if required, sends the initial SYN (or SYN-ACK when
+// the handshake starts in the SYN-RCVD state) and then waits for segments,
+// notifications and resend timeouts until the handshake is completed.
+//
+// It returns nil once the handshake is completed, ErrTimeout when the
+// doubling resend timeout grows beyond MaxRTO, ErrAborted when a close or
+// abort notification is received, or the error reported by resolveRoute or
+// by the handling of a segment.
+//
+// h.ep.mu is unlocked while waiting and locked again afterwards, so it
+// appears that callers are expected to hold it on entry.
+func (h *handshake) execute() *tcpip.Error {
+	if h.ep.route.IsResolutionRequired() {
+		if err := h.resolveRoute(); err != nil {
+			return err
+		}
+	}
+
+	h.startTime = time.Now()
+	// Initialize the resend timer.
+	resendWaker := sleep.Waker{}
+	timeOut := time.Duration(time.Second)
+	rt := time.AfterFunc(timeOut, resendWaker.Assert)
+	defer rt.Stop()
+
+	// Set up the wakers.
+	s := sleep.Sleeper{}
+	s.AddWaker(&resendWaker, wakerForResend)
+	s.AddWaker(&h.ep.notificationWaker, wakerForNotification)
+	s.AddWaker(&h.ep.newSegmentWaker, wakerForNewSegment)
+	defer s.Done()
+
+	var sackEnabled SACKEnabled
+	if err := h.ep.stack.TransportProtocolOption(ProtocolNumber, &sackEnabled); err != nil {
+		// If stack returned an error when checking for SACKEnabled
+		// status then just default to switching off SACK negotiation.
+		sackEnabled = false
+	}
+
+	// Send the initial SYN segment and loop until the handshake is
+	// completed.
+	h.ep.amss = calculateAdvertisedMSS(h.ep.userMSS, h.ep.route)
+
+	synOpts := header.TCPSynOptions{
+		WS:            h.rcvWndScale,
+		TS:            true,
+		TSVal:         h.ep.timestamp(),
+		TSEcr:         h.ep.recentTimestamp(),
+		SACKPermitted: bool(sackEnabled),
+		MSS:           h.ep.amss,
+	}
+
+	// Execute is also called in a listen context so we want to make sure we
+	// only send the TS/SACK option when we received the TS/SACK in the
+	// initial SYN.
+	if h.state == handshakeSynRcvd {
+		synOpts.TS = h.ep.sendTSOk
+		synOpts.SACKPermitted = h.ep.sackPermitted && bool(sackEnabled)
+		if h.sndWndScale < 0 {
+			// Disable window scaling if the peer did not send us
+			// the window scaling option.
+			synOpts.WS = -1
+		}
+	}
+
+	h.ep.sendSynTCP(&h.ep.route, tcpFields{
+		id:     h.ep.ID,
+		ttl:    h.ep.ttl,
+		tos:    h.ep.sendTOS,
+		flags:  h.flags,
+		seq:    h.iss,
+		ack:    h.ackNum,
+		rcvWnd: h.rcvWnd,
+	}, synOpts)
+
+	for h.state != handshakeCompleted {
+		// Release the endpoint lock while blocked waiting for an event.
+		h.ep.mu.Unlock()
+		index, _ := s.Fetch(true)
+		h.ep.mu.Lock()
+		switch index {
+
+		case wakerForResend:
+			// Back off exponentially and give up once the timeout
+			// exceeds MaxRTO.
+			timeOut *= 2
+			if timeOut > MaxRTO {
+				return tcpip.ErrTimeout
+			}
+			rt.Reset(timeOut)
+			// Resend the SYN/SYN-ACK only if the following conditions hold.
+			//  - It's an active handshake (deferAccept does not apply)
+			//  - It's a passive handshake and we have not yet got the final-ACK.
+			//  - It's a passive handshake and we got an ACK but deferAccept is
+			//    enabled and we are now past the deferAccept duration.
+			// The last is required to provide a way for the peer to complete
+			// the connection with another ACK or data (as ACKs are never
+			// retransmitted on their own).
+			if h.active || !h.acked || h.deferAccept != 0 && time.Since(h.startTime) > h.deferAccept {
+				h.ep.sendSynTCP(&h.ep.route, tcpFields{
+					id:     h.ep.ID,
+					ttl:    h.ep.ttl,
+					tos:    h.ep.sendTOS,
+					flags:  h.flags,
+					seq:    h.iss,
+					ack:    h.ackNum,
+					rcvWnd: h.rcvWnd,
+				}, synOpts)
+			}
+
+		case wakerForNotification:
+			n := h.ep.fetchNotifications()
+			if (n&notifyClose)|(n&notifyAbort) != 0 {
+				return tcpip.ErrAborted
+			}
+			if n&notifyDrain != 0 {
+				// Handle every queued segment before signalling that
+				// draining is done. Note that s shadows the sleeper
+				// inside this loop.
+				for !h.ep.segmentQueue.empty() {
+					s := h.ep.segmentQueue.dequeue()
+					err := h.handleSegment(s)
+					s.decRef()
+					if err != nil {
+						return err
+					}
+					if h.state == handshakeCompleted {
+						return nil
+					}
+				}
+				close(h.ep.drainDone)
+				h.ep.mu.Unlock()
+				<-h.ep.undrain
+				h.ep.mu.Lock()
+			}
+
+		case wakerForNewSegment:
+			if err := h.processSegments(); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// parseSynSegmentOptions parses the SYN options carried by s. When a
+// timestamp option is present, its values are also stored in the parsed
+// options of s.
+func parseSynSegmentOptions(s *segment) header.TCPSynOptions {
+	opts := header.ParseSynOptions(s.options, s.flagIsSet(header.TCPFlagAck))
+	if !opts.TS {
+		return opts
+	}
+	s.parsedOptions.TSVal, s.parsedOptions.TSEcr = opts.TSVal, opts.TSEcr
+	return opts
+}
+
+// optionPool recycles the maxOptionSize-byte arrays that back the slices
+// handed out by getOptions and taken back by putOptions.
+var optionPool = sync.Pool{
+	New: func() interface{} { return new([maxOptionSize]byte) },
+}
+
+// getOptions takes an array from optionPool and returns it as a slice that
+// spans the whole array. Callers give it back with putOptions.
+func getOptions() []byte {
+	arr := optionPool.Get().(*[maxOptionSize]byte)
+	return arr[:]
+}
+
+// putOptions gives a slice obtained from getOptions back to optionPool. The
+// slice is converted by optionsToArray first, so that the pool keeps holding
+// arrays even when the caller resliced options.
+func putOptions(options []byte) {
+	arr := optionsToArray(options)
+	optionPool.Put(arr)
+}
+
+// makeSynOptions encodes opts into a buffer taken from the option pool and
+// returns the encoded bytes. The result is released with putOptions.
+func makeSynOptions(opts header.TCPSynOptions) []byte {
+	// The option order emulates Linux, which lays options out as follows:
+	//
+	// if md5: NOP NOP MD5SIG 18 md5sig(16)
+	// if mss: MSS 4 mss(2)
+	// if ts and sack_advertise:
+	//	SACK 2 TIMESTAMP 2 timestamp(8)
+	// elif ts: NOP NOP TIMESTAMP 10 timestamp(8)
+	// elif sack: NOP NOP SACK 2
+	// if wscale: NOP WINDOW 3 ws(1)
+	// if sack_blocks: NOP NOP SACK ((2 + (#blocks * 8))
+	//	[for each block] start_seq(4) end_seq(4)
+	// if fastopen_cookie:
+	//	if exp: EXP (4 + len(cookie)) FASTOPEN_MAGIC(2)
+	//	else: FASTOPEN (2 + len(cookie))
+	//	cookie(variable) [padding to four bytes]
+	buf := getOptions()
+
+	// The MSS option is always encoded, and always first.
+	n := header.EncodeMSSOption(uint32(opts.MSS), buf)
+
+	// TS and SACK need special ordering: when both are enabled SACK comes
+	// right before TS without padding, whereas either of them on its own
+	// is preceded by two NOPs.
+	switch {
+	case opts.TS && opts.SACKPermitted:
+		n += header.EncodeSACKPermittedOption(buf[n:])
+		n += header.EncodeTSOption(opts.TSVal, opts.TSEcr, buf[n:])
+	case opts.TS:
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeTSOption(opts.TSVal, opts.TSEcr, buf[n:])
+	case opts.SACKPermitted:
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeSACKPermittedOption(buf[n:])
+	}
+
+	// The window scale option is left out when WS is negative.
+	if opts.WS >= 0 {
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeWSOption(opts.WS, buf[n:])
+	}
+
+	// Pad to the end. No padding is ever expected unless a fastopen
+	// option gets added, so the offset must remain the same.
+	if header.AddTCPOptionPadding(buf, n) != 0 {
+		panic("unexpected option encoding")
+	}
+	return buf[:n]
+}
+
+// tcpFields is a struct to carry different parameters required by the
+// send*TCP variant functions below.
+type tcpFields struct {
+	// id supplies the local and remote ports written to the TCP header.
+	id stack.TransportEndpointID
+	// ttl is the TTL of the segment; zero selects the route's default TTL.
+	ttl uint8
+	// tos is passed to the network layer as the TOS of the segment.
+	tos uint8
+	// flags holds the TCP flags of the segment.
+	flags byte
+	// seq and ack are the sequence and acknowledgment numbers.
+	seq seqnum.Value
+	ack seqnum.Value
+	// rcvWnd is the advertised window; senders clamp it to 0xffff.
+	rcvWnd seqnum.Size
+	// opts holds the encoded options that follow the fixed header.
+	opts []byte
+	// txHash is copied to the Hash field of the outgoing packet buffers.
+	txHash uint32
+}
+
+// sendSynTCP sends a data-less segment described by tf that carries the SYN
+// options opts. A failed send is only counted in the endpoint's stats: nil is
+// always returned and callers are left to re-attempt the send.
+func (e *endpoint) sendSynTCP(r *stack.Route, tf tcpFields, opts header.TCPSynOptions) *tcpip.Error {
+	encoded := makeSynOptions(opts)
+	tf.opts = encoded
+	if e.sendTCP(r, tf, buffer.VectorisedView{}, nil) != nil {
+		// SYN send errors are deliberately not propagated.
+		e.stats.SendErrors.SynSendToNetworkFailed.Increment()
+	}
+	putOptions(encoded)
+	return nil
+}
+
+// sendTCP sends a segment on behalf of e, using e's transmit hash and owner,
+// and records the outcome in e's send statistics. It returns the error of the
+// underlying send, if any.
+func (e *endpoint) sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO) *tcpip.Error {
+	tf.txHash = e.txHash
+	err := sendTCP(r, tf, data, gso, e.owner)
+	if err == nil {
+		e.stats.SegmentsSent.Increment()
+		return nil
+	}
+	e.stats.SendErrors.SegmentSendToNetworkFailed.Increment()
+	return err
+}
+
+// buildTCPHdr prepends a TCP header built from tf, options included, to pkt
+// and sets its checksum as required by gso and the capabilities of r.
+func buildTCPHdr(r *stack.Route, tf tcpFields, pkt *stack.PacketBuffer, gso *stack.GSO) {
+	hdrLen := header.TCPMinimumSize + len(tf.opts)
+	payloadLen := pkt.Data.Size()
+
+	// Reserve room for the header and fill it in.
+	tcp := header.TCP(pkt.Header.Prepend(hdrLen))
+	pkt.TransportHeader = buffer.View(tcp)
+	tcp.Encode(&header.TCPFields{
+		SrcPort:    tf.id.LocalPort,
+		DstPort:    tf.id.RemotePort,
+		SeqNum:     uint32(tf.seq),
+		AckNum:     uint32(tf.ack),
+		DataOffset: uint8(hdrLen),
+		Flags:      tf.flags,
+		WindowSize: uint16(tf.rcvWnd),
+	})
+	copy(tcp[header.TCPMinimumSize:], tf.opts)
+
+	total := uint16(pkt.Header.UsedLength() + payloadLen)
+	xsum := r.PseudoHeaderChecksum(ProtocolNumber, total)
+	switch {
+	case gso != nil && gso.NeedsCsum:
+		// This is called CHECKSUM_PARTIAL in the Linux kernel: only the
+		// pseudo-header checksum is stored in the TCP header, and the
+		// kernel then adds the checksum of the header and data to get
+		// the right sum of the TCP packet.
+		tcp.SetChecksum(xsum)
+	case r.Capabilities()&stack.CapabilityTXChecksumOffload == 0:
+		// No checksum offload: compute the full checksum here.
+		xsum = header.ChecksumVV(pkt.Data, xsum)
+		tcp.SetChecksum(^tcp.CalculateChecksum(xsum))
+	}
+}
+
+// sendTCPBatch splits data into segments of at most gso.MSS bytes, builds a
+// TCP header for each of them and writes them to r as one batch. The route's
+// TCP send counters are updated with the outcome.
+func sendTCPBatch(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
+	// Work on a shallow clone: ReadToVV splits the VectorisedView and
+	// trims the underlying views as it goes, which would otherwise alter
+	// the underlying views of the caller's data.
+	data = data.Clone(nil)
+
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
+	}
+
+	mss := int(gso.MSS)
+	remaining := data.Size()
+	count := (remaining + mss - 1) / mss
+	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + len(tf.opts)
+
+	var pkts stack.PacketBufferList
+	for i := 0; i < count; i++ {
+		// Every segment carries a full MSS, except possibly the last.
+		segSize := mss
+		if remaining < segSize {
+			segSize = remaining
+		}
+		remaining -= segSize
+
+		pkt := &stack.PacketBuffer{
+			Header:                buffer.NewPrependable(hdrSize),
+			Hash:                  tf.txHash,
+			Owner:                 owner,
+			EgressRoute:           r,
+			GSOOptions:            gso,
+			NetworkProtocolNumber: r.NetworkProtocolNumber(),
+		}
+		data.ReadToVV(&pkt.Data, segSize)
+		buildTCPHdr(r, tf, pkt, gso)
+		tf.seq = tf.seq.Add(seqnum.Size(segSize))
+		pkts.PushBack(pkt)
+	}
+
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
+	}
+	params := stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}
+	sent, err := r.WritePackets(gso, pkts, params)
+	if err != nil {
+		r.Stats().TCP.SegmentSendErrors.IncrementBy(uint64(count - sent))
+	}
+	r.Stats().TCP.SegmentsSent.IncrementBy(uint64(sent))
+	return err
+}
+
+// sendTCP sends a TCP segment described by tf, with data as its payload, via
+// the route r. When r does not loop packets back, software GSO is in use and
+// data is larger than the GSO MSS, the work is handed to sendTCPBatch. The
+// route's TCP send counters are updated with the outcome.
+func sendTCP(r *stack.Route, tf tcpFields, data buffer.VectorisedView, gso *stack.GSO, owner tcpip.PacketOwner) *tcpip.Error {
+	if tf.rcvWnd > 0xffff {
+		tf.rcvWnd = 0xffff
+	}
+
+	useBatch := r.Loop&stack.PacketLoop == 0 && gso != nil && gso.Type == stack.GSOSW && int(gso.MSS) < data.Size()
+	if useBatch {
+		return sendTCPBatch(r, tf, data, gso, owner)
+	}
+
+	hdrSize := header.TCPMinimumSize + int(r.MaxHeaderLength()) + len(tf.opts)
+	pkt := &stack.PacketBuffer{
+		Header: buffer.NewPrependable(hdrSize),
+		Data:   data,
+		Hash:   tf.txHash,
+		Owner:  owner,
+	}
+	buildTCPHdr(r, tf, pkt, gso)
+
+	// A zero TTL selects the route's default TTL.
+	if tf.ttl == 0 {
+		tf.ttl = r.DefaultTTL()
+	}
+	params := stack.NetworkHeaderParams{Protocol: ProtocolNumber, TTL: tf.ttl, TOS: tf.tos}
+	if err := r.WritePacket(gso, params, pkt); err != nil {
+		r.Stats().TCP.SegmentSendErrors.Increment()
+		return err
+	}
+
+	r.Stats().TCP.SegmentsSent.Increment()
+	if tf.flags&header.TCPFlagRst != 0 {
+		r.Stats().TCP.ResetsSent.Increment()
+	}
+	return nil
+}
+
+// makeOptions encodes the options of a post-handshake segment into a buffer
+// taken from the option pool: the timestamp option when timestamps are
+// enabled, followed by sackBlocks when SACK is permitted and blocks are given.
+func (e *endpoint) makeOptions(sackBlocks []header.SACKBlock) []byte {
+	buf := getOptions()
+	var n int
+
+	// N.B. the ordering here matches the ordering used by Linux internally
+	// and described in the raw makeOptions function. Cases that cannot
+	// occur once the connection is set up are not included.
+	if e.sendTSOk {
+		// Embed the timestamp if timestamp has been enabled.
+		//
+		// Only the lower 32 bits of the unix time in milliseconds are
+		// used. This is similar to what Linux does where it uses the
+		// lower 32 bits of the jiffies value in the tsVal field of the
+		// timestamp option.
+		//
+		// Further, RFC7323 section-5.4 recommends millisecond
+		// resolution as the lowest recommended resolution for the
+		// timestamp clock.
+		//
+		// Ref: https://tools.ietf.org/html/rfc7323#section-5.4.
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeTSOption(e.timestamp(), e.recentTimestamp(), buf[n:])
+	}
+	if e.sackPermitted && len(sackBlocks) > 0 {
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeNOP(buf[n:])
+		n += header.EncodeSACKBlocks(sackBlocks, buf[n:])
+	}
+
+	// The options encoded above are expected to end on an aligned offset,
+	// so no padding should ever be needed.
+	if header.AddTCPOptionPadding(buf, n) != 0 {
+		panic("unexpected option encoding")
+	}
+	return buf[:n]
+}
+
+// sendRaw sends a TCP segment with the given payload, flags, sequence
+// numbers and window to the endpoint's peer. SACK blocks are attached to ACKs
+// sent in the established state while the receiver has pending data.
+func (e *endpoint) sendRaw(data buffer.VectorisedView, flags byte, seq, ack seqnum.Value, rcvWnd seqnum.Size) *tcpip.Error {
+	var blocks []header.SACKBlock
+	if e.EndpointState() == StateEstablished && e.rcv.pendingBufSize > 0 && flags&header.TCPFlagAck != 0 {
+		blocks = e.sack.Blocks[:e.sack.NumBlocks]
+	}
+
+	opts := e.makeOptions(blocks)
+	fields := tcpFields{
+		id:     e.ID,
+		ttl:    e.ttl,
+		tos:    e.sendTOS,
+		flags:  flags,
+		seq:    seq,
+		ack:    ack,
+		rcvWnd: rcvWnd,
+		opts:   opts,
+	}
+	err := e.sendTCP(&e.route, fields, data, e.gso)
+	// The options buffer goes back to the pool whether or not the send
+	// succeeded.
+	putOptions(opts)
+	return err
+}
+
+// handleWrite moves everything in the send queue to the sender's write list
+// and asks the sender to push out data. It always returns nil.
+func (e *endpoint) handleWrite() *tcpip.Error {
+	// The send queue is shared with other goroutines and is protected by
+	// the send mutex. The write list is only accessible from the handler
+	// goroutine, so it needs no mutex.
+	e.sndBufMu.Lock()
+	head := e.sndQueue.Front()
+	if head != nil {
+		e.snd.writeList.PushBackList(&e.sndQueue)
+		e.sndBufInQueue = 0
+	}
+	e.sndBufMu.Unlock()
+
+	// If there is no next segment to write yet, start with the first one
+	// that was just moved over.
+	if e.snd.writeNext == nil {
+		e.snd.writeNext = head
+	}
+
+	// Push out any new packets.
+	e.snd.sendData()
+	return nil
+}
+
+// handleClose drains the send queue and marks the send side as closed,
+// provided the endpoint is in a connected state. It always returns nil.
+func (e *endpoint) handleClose() *tcpip.Error {
+	if e.EndpointState().connected() {
+		// Drain the send queue before closing the send side.
+		e.handleWrite()
+		e.snd.closed = true
+	}
+	return nil
+}
+
+// resetConnectionLocked puts the endpoint in an error state with the given
+// error code and sends a RST unless the error is ErrConnectionReset, which
+// indicates that the connection is being reset due to receiving a RST, or
+// ErrTimeout. This method must only be called from the protocol goroutine.
+func (e *endpoint) resetConnectionLocked(err *tcpip.Error) {
+	// The error state is recorded first. A reset is then only sent if the
+	// connection is being aborted for a reason other than receiving a
+	// reset or timing out.
+	e.setEndpointState(StateError)
+	e.HardError = err
+	if err != tcpip.ErrConnectionReset && err != tcpip.ErrTimeout {
+		// The exact sequence number to be used for the RST is the same as the
+		// one used by Linux. We need to handle the case of window being shrunk
+		// which can cause sndNxt to be outside the acceptable window on the
+		// receiver.
+		//
+		// See: https://www.snellman.net/blog/archive/2016-02-01-tcp-rst/ for more
+		// information.
+		sndWndEnd := e.snd.sndUna.Add(e.snd.sndWnd)
+		resetSeqNum := sndWndEnd
+		if !sndWndEnd.LessThan(e.snd.sndNxt) || e.snd.sndNxt.Size(sndWndEnd) < (1<<e.snd.sndWndScale) {
+			resetSeqNum = e.snd.sndNxt
+		}
+		e.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck|header.TCPFlagRst, resetSeqNum, e.rcv.rcvNxt, 0)
+	}
+}
+
+// completeWorkerLocked is called by the worker goroutine right before it
+// exits. It marks the worker as stopped and runs the endpoint cleanup when
+// one has been requested through workerCleanup.
+func (e *endpoint) completeWorkerLocked() {
+	e.workerRunning = false
+	if !e.workerCleanup {
+		return
+	}
+	// The worker is terminating (due to moving to either the CLOSED or the
+	// ERROR state), so release all registrations and port reservations
+	// even if the socket itself is not yet closed by the application.
+	e.cleanupLocked()
+}
+
+// transitionToStateEstablishedLocked moves the endpoint to the established
+// state, creating its sender and receiver from the parameters negotiated by
+// the handshake h.
+func (e *endpoint) transitionToStateEstablishedLocked(h *handshake) {
+	// Both the sender and the receiver are seeded with the sequence
+	// number that immediately precedes the one the handshake acks.
+	irs := h.ackNum - 1
+
+	// Transfer handshake state to TCP connection. Receive window scaling
+	// is disabled if the peer doesn't support it (indicated by a negative
+	// send window scale).
+	e.snd = newSender(e, h.iss, irs, h.sndWnd, h.mss, h.sndWndScale)
+
+	rcvBufSize := seqnum.Size(e.receiveBufferSize())
+	e.rcvListMu.Lock()
+	e.rcv = newReceiver(e, irs, h.rcvWnd, h.effectiveRcvWndScale(), rcvBufSize)
+	// Bootstrap the auto tuning algorithm: starting at zero would result
+	// in a really large receive window after the first auto tuning
+	// adjustment.
+	e.rcvAutoParams.prevCopied = int(h.rcvWnd)
+	e.rcvListMu.Unlock()
+
+	e.setEndpointState(StateEstablished)
+}
+
+// transitionToStateCloseLocked cleans the endpoint up, which removes it from
+// the transport demuxer, "before" moving it to StateClose. This ensures that
+// the demuxer delivers no packet to the endpoint once it is in StateClose.
+// It does nothing if the endpoint is already in StateClose.
+func (e *endpoint) transitionToStateCloseLocked() {
+	if e.EndpointState() != StateClose {
+		// Mark the endpoint as fully closed for reads/writes.
+		e.cleanupLocked()
+		e.setEndpointState(StateClose)
+		e.stack.Stats().TCP.CurrentConnected.Decrement()
+		e.stack.Stats().TCP.EstablishedClosed.Increment()
+	}
+}
+
+// tryDeliverSegmentFromClosedEndpoint hands the parsed segment s over to an
+// endpoint other than e that matches e's ID. It is only called when e is in
+// StateClose, so that s can be delivered to any other listening endpoint. If
+// no such endpoint is found, a RST is sent in reply and s is released.
+func (e *endpoint) tryDeliverSegmentFromClosedEndpoint(s *segment) {
+	target := e.stack.FindTransportEndpoint(e.NetProto, e.TransProto, e.ID, &s.route)
+	if target == nil {
+		dualStack := e.NetProto == header.IPv6ProtocolNumber && e.EndpointInfo.TransportEndpointInfo.ID.LocalAddress.To4() != ""
+		if dualStack {
+			// Dual-stack socket, try IPv4.
+			target = e.stack.FindTransportEndpoint(header.IPv4ProtocolNumber, e.TransProto, e.ID, &s.route)
+		}
+	}
+	if target == nil {
+		replyWithReset(s, stack.DefaultTOS, s.route.DefaultTTL())
+		s.decRef()
+		return
+	}
+
+	if e == target {
+		panic("current endpoint not removed from demuxer, enqueing segments to itself")
+	}
+
+	other := target.(*endpoint)
+	if other.enqueueSegment(s) {
+		other.newSegmentWaker.Assert()
+	}
+}
+
+// drainClosingSegmentQueue empties the endpoint's segment queue and tries to
+// re-match every segment to a different endpoint. It is used once the current
+// endpoint has transitioned to StateClose and has been unregistered from the
+// transport demuxer.
+func (e *endpoint) drainClosingSegmentQueue() {
+	for s := e.segmentQueue.dequeue(); s != nil; s = e.segmentQueue.dequeue() {
+		e.tryDeliverSegmentFromClosedEndpoint(s)
+	}
+}
+
+// handleReset handles a segment that has the RST flag set. ok is true when
+// the sequence number of s is not acceptable to the receiver, in which case
+// nothing is done. Otherwise ok is false: in CLOSE-WAIT the endpoint is
+// closed with ErrAborted as its hard error and err is nil, and in any other
+// state err is ErrConnectionReset.
+func (e *endpoint) handleReset(s *segment) (ok bool, err *tcpip.Error) {
+	// RFC 793, page 37 states that "in all states except SYN-SENT, all
+	// reset (RST) segments are validated by checking their SEQ-fields." So
+	// the reset is only processed if it's acceptable.
+	if !e.rcv.acceptable(s.sequenceNumber, 0) {
+		return true, nil
+	}
+
+	if e.EndpointState() == StateCloseWait {
+		// In case of a RST in CLOSE-WAIT linux moves the socket to
+		// closed state with an error set to indicate EPIPE.
+		//
+		// Technically this seems to be at odds w/ RFC. As per
+		// https://tools.ietf.org/html/rfc793#section-2.7 page 69 the
+		// behavior for a segment arriving w/ RST bit set in CLOSE-WAIT
+		// is inlined below.
+		//
+		//  ESTABLISHED
+		//  FIN-WAIT-1
+		//  FIN-WAIT-2
+		//  CLOSE-WAIT
+		//
+		//  If the RST bit is set then, any outstanding RECEIVEs and
+		//  SEND should receive "reset" responses. All segment queues
+		//  should be flushed.  Users should also receive an unsolicited
+		//  general "connection reset" signal. Enter the CLOSED state,
+		//  delete the TCB, and return.
+		e.transitionToStateCloseLocked()
+		e.HardError = tcpip.ErrAborted
+		e.notifyProtocolGoroutine(notifyTickleWorker)
+		return false, nil
+	}
+
+	// Notify protocol goroutine. This is required when handleSegment is
+	// invoked from the processor goroutine rather than the worker
+	// goroutine.
+	e.notifyProtocolGoroutine(notifyResetByPeer)
+	return false, tcpip.ErrConnectionReset
+}
+
+// handleSegments processes all inbound segments.
+//
+// At most maxSegmentsPerWake segments are dequeued and handled per call.
+// Processing stops early, without sending an ACK or resetting the keepalive
+// timer, when the endpoint is closed, when handleSegment returns an error
+// (which is returned to the caller) or when handleSegment asks not to
+// continue. When fastPath is false and segments are left in the queue, the
+// new-segment waker is asserted so that they are handled later.
+func (e *endpoint) handleSegments(fastPath bool) *tcpip.Error {
+	checkRequeue := true
+	for i := 0; i < maxSegmentsPerWake; i++ {
+		if e.EndpointState().closed() {
+			return nil
+		}
+		s := e.segmentQueue.dequeue()
+		if s == nil {
+			checkRequeue = false
+			break
+		}
+
+		// The reference obtained by dequeuing s is released once s has
+		// been handled, whatever the outcome, as the handshake code
+		// does in processSegments. Releasing it on the early-return
+		// paths only would leak a reference for every segment that is
+		// handled successfully. Code that keeps s around is expected to
+		// take its own reference, as synRcvdState does before requeuing.
+		cont, err := e.handleSegment(s)
+		s.decRef()
+		if err != nil {
+			return err
+		}
+		if !cont {
+			return nil
+		}
+	}
+
+	// When fastPath is true we don't want to wake up the worker
+	// goroutine. If the endpoint has more segments to process the
+	// dispatcher will call handleSegments again anyway.
+	if !fastPath && checkRequeue && !e.segmentQueue.empty() {
+		e.newSegmentWaker.Assert()
+	}
+
+	// Send an ACK for all processed packets if needed.
+	if e.rcv.rcvNxt != e.snd.maxSentAck {
+		e.snd.sendAck()
+	}
+
+	e.resetKeepaliveTimer(true /* receivedData */)
+
+	return nil
+}
+
+// handleSegment handles a given segment and notifies the worker goroutine
+// if the connection should be terminated.
+//
+// cont reports whether the caller may go on processing further segments. It
+// is false, with a nil err, when a reset was handled in CLOSE-WAIT or when
+// the endpoint reached StateClose while the segment was being processed. A
+// non-nil err is always accompanied by a false cont.
+//
+// handleSegment never releases s: the reference is owned by the caller.
+func (e *endpoint) handleSegment(s *segment) (cont bool, err *tcpip.Error) {
+	// Invoke the tcp probe if installed.
+	if e.probe != nil {
+		e.probe(e.completeState())
+	}
+
+	if s.flagIsSet(header.TCPFlagRst) {
+		if ok, err := e.handleReset(s); !ok {
+			return false, err
+		}
+	} else if s.flagIsSet(header.TCPFlagSyn) {
+		// See: https://tools.ietf.org/html/rfc5961#section-4.1
+		//   1) If the SYN bit is set, irrespective of the sequence number, TCP
+		//    MUST send an ACK (also referred to as challenge ACK) to the remote
+		//    peer:
+		//
+		//    <SEQ=SND.NXT><ACK=RCV.NXT><CTL=ACK>
+		//
+		//    After sending the acknowledgment, TCP MUST drop the unacceptable
+		//    segment and stop processing further.
+		//
+		// By sending an ACK, the remote peer is challenged to confirm the loss
+		// of the previous connection and the request to start a new connection.
+		// A legitimate peer, after restart, would not have a TCB in the
+		// synchronized state.  Thus, when the ACK arrives, the peer should send
+		// a RST segment back with the sequence number derived from the ACK
+		// field that caused the RST.
+
+		// This RST will confirm that the remote peer has indeed closed the
+		// previous connection.  Upon receipt of a valid RST, the local TCP
+		// endpoint MUST terminate its connection.  The local TCP endpoint
+		// should then rely on SYN retransmission from the remote end to
+		// re-establish the connection.
+
+		e.snd.sendAck()
+	} else if s.flagIsSet(header.TCPFlagAck) {
+		// Patch the window size in the segment according to the
+		// send window scale.
+		s.window <<= e.snd.sndWndScale
+
+		// RFC 793, page 41 states that "once in the ESTABLISHED
+		// state all segments must carry current acknowledgment
+		// information."
+		drop, err := e.rcv.handleRcvdSegment(s)
+		if err != nil {
+			return false, err
+		}
+		if drop {
+			return true, nil
+		}
+
+		// Now check if the received segment has caused us to transition
+		// to a CLOSED state, if yes then terminate processing and do
+		// not invoke the sender.
+		state := e.state
+		if state == StateClose {
+			// When we get into StateClose while processing from the queue,
+			// return immediately and let the protocolMainloop handle it.
+			//
+			// We can reach StateClose only while processing a previous segment
+			// or a notification from the protocolMainLoop (caller goroutine).
+			// This means that with this return, the segment dequeue below can
+			// never occur on a closed endpoint.
+			//
+			// s must not be released here: handleSegments releases it when
+			// cont is false, so releasing it here as well would drop the
+			// reference twice.
+			return false, nil
+		}
+
+		e.snd.handleRcvdSegment(s)
+	}
+
+	return true, nil
+}
+
+// keepaliveTimerExpired is called when the keepaliveTimer fires. Keepalive
+// packets are sent periodically while the connection is idle; ErrTimeout is
+// returned to terminate the connection when the other side has not been
+// heard from after a number of tries, or when the user timeout is exceeded.
+func (e *endpoint) keepaliveTimerExpired() *tcpip.Error {
+	userTimeout := e.userTimeout
+
+	e.keepalive.Lock()
+	if !e.keepalive.enabled || !e.keepalive.timer.checkExpiration() {
+		e.keepalive.Unlock()
+		return nil
+	}
+
+	// The connection is aborted when a userTimeout is set and exceeded
+	// while keepalives are unacknowledged, or when the number of
+	// unacknowledged keepalives has reached the configured count.
+	userTimedOut := userTimeout != 0 && time.Since(e.rcv.lastRcvdAckTime) >= userTimeout && e.keepalive.unacked > 0
+	if userTimedOut || e.keepalive.unacked >= e.keepalive.count {
+		e.keepalive.Unlock()
+		e.stack.Stats().TCP.EstablishedTimedout.Increment()
+		return tcpip.ErrTimeout
+	}
+
+	e.keepalive.unacked++
+	e.keepalive.Unlock()
+
+	// RFC1122 4.2.3.6: TCP keepalive is a dataless ACK with
+	// seg.seq = snd.nxt-1.
+	e.snd.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, e.snd.sndNxt-1)
+	e.resetKeepaliveTimer(false)
+	return nil
+}
+
+// resetKeepaliveTimer re-arms the keepalive timer when keepalive is enabled
+// and no sent data is awaiting acknowledgement; otherwise it stops the timer.
+func (e *endpoint) resetKeepaliveTimer(receivedData bool) {
+	e.keepalive.Lock()
+	defer e.keepalive.Unlock()
+
+	if receivedData {
+		e.keepalive.unacked = 0
+	}
+
+	switch {
+	case !e.keepalive.enabled || e.snd == nil || e.snd.sndUna != e.snd.sndNxt:
+		// Keepalive is off, there is no sender yet, or data is still
+		// outstanding: the timer must not run.
+		e.keepalive.timer.disable()
+	case e.keepalive.unacked > 0:
+		e.keepalive.timer.enable(e.keepalive.interval)
+	default:
+		e.keepalive.timer.enable(e.keepalive.idle)
+	}
+}
+
+// disableKeepaliveTimer stops the keepalive timer under the keepalive lock.
+func (e *endpoint) disableKeepaliveTimer() {
+	e.keepalive.Lock()
+	defer e.keepalive.Unlock()
+	e.keepalive.timer.disable()
+}
+
+// protocolMainLoop is the main loop of the TCP protocol, run in its own
+// goroutine. If handshake is true it first performs the active 3-way handshake;
+// wakerInitDone, if non-nil, is closed once all wakers have been registered.
+func (e *endpoint) protocolMainLoop(handshake bool, wakerInitDone chan<- struct{}) *tcpip.Error {
+	e.mu.Lock()
+	var closeTimer *time.Timer
+	var closeWaker sleep.Waker
+
+	epilogue := func() {
+		// e.mu is expected to be held upon entering this section.
+
+		if e.snd != nil {
+			e.snd.resendTimer.cleanup()
+		}
+
+		if closeTimer != nil {
+			closeTimer.Stop()
+		}
+
+		e.completeWorkerLocked()
+
+		if e.drainDone != nil {
+			close(e.drainDone)
+		}
+
+		e.mu.Unlock()
+
+		e.drainClosingSegmentQueue()
+
+		// When the protocol loop exits we should wake up our waiters.
+		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+	}
+
+	if handshake {
+		// This is an active connection, so we must initiate the 3-way
+		// handshake, and then inform potential waiters about its
+		// completion.
+		initialRcvWnd := e.initialReceiveWindow()
+		h := newHandshake(e, seqnum.Size(initialRcvWnd))
+		h.ep.setEndpointState(StateSynSent)
+
+		if err := h.execute(); err != nil {
+			e.lastErrorMu.Lock()
+			e.lastError = err
+			e.lastErrorMu.Unlock()
+
+			e.setEndpointState(StateError)
+			e.HardError = err
+
+			e.workerCleanup = true
+			// Lock released below.
+			epilogue()
+			return err
+		}
+	}
+
+	e.keepalive.timer.init(&e.keepalive.waker)
+	defer e.keepalive.timer.cleanup()
+
+	drained := e.drainDone != nil
+	if drained {
+		close(e.drainDone)
+		<-e.undrain
+	}
+
+	// Set up the functions that will be called when the main protocol loop
+	// wakes up.
+	funcs := []struct {
+		w *sleep.Waker
+		f func() *tcpip.Error
+	}{
+		{
+			w: &e.sndWaker,
+			f: e.handleWrite,
+		},
+		{
+			w: &e.sndCloseWaker,
+			f: e.handleClose,
+		},
+		{
+			w: &closeWaker,
+			f: func() *tcpip.Error {
+				// This means the socket is being closed because
+				// the TCP FIN-WAIT2 timeout was hit. Just mark
+				// the socket as closed.
+				e.transitionToStateCloseLocked()
+				e.workerCleanup = true
+				return nil
+			},
+		},
+		{
+			w: &e.snd.resendWaker,
+			f: func() *tcpip.Error {
+				if !e.snd.retransmitTimerExpired() {
+					e.stack.Stats().TCP.EstablishedTimedout.Increment()
+					return tcpip.ErrTimeout
+				}
+				return nil
+			},
+		},
+		{
+			w: &e.newSegmentWaker,
+			f: func() *tcpip.Error {
+				return e.handleSegments(false /* fastPath */)
+			},
+		},
+		{
+			w: &e.keepalive.waker,
+			f: e.keepaliveTimerExpired,
+		},
+		{
+			w: &e.notificationWaker,
+			f: func() *tcpip.Error {
+				n := e.fetchNotifications()
+				if n&notifyNonZeroReceiveWindow != 0 {
+					e.rcv.nonZeroWindow()
+				}
+
+				if n&notifyReceiveWindowChanged != 0 {
+					e.rcv.pendingBufSize = seqnum.Size(e.receiveBufferSize())
+				}
+
+				if n&notifyMTUChanged != 0 {
+					e.sndBufMu.Lock()
+					count := e.packetTooBigCount
+					e.packetTooBigCount = 0
+					mtu := e.sndMTU
+					e.sndBufMu.Unlock()
+
+					e.snd.updateMaxPayloadSize(mtu, count)
+				}
+
+				if n&notifyReset != 0 || n&notifyAbort != 0 {
+					return tcpip.ErrConnectionAborted
+				}
+
+				if n&notifyResetByPeer != 0 {
+					return tcpip.ErrConnectionReset
+				}
+
+				if n&notifyClose != 0 && closeTimer == nil {
+					if e.EndpointState() == StateFinWait2 && e.closed {
+						// The socket has been closed and we are in FIN_WAIT2
+						// so start the FIN_WAIT2 timer.
+						closeTimer = time.AfterFunc(e.tcpLingerTimeout, closeWaker.Assert)
+						e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+					}
+				}
+
+				if n&notifyKeepaliveChanged != 0 {
+					// The timer could fire in background
+					// when the endpoint is drained. That's
+					// OK. See above.
+					e.resetKeepaliveTimer(true)
+				}
+
+				if n&notifyDrain != 0 {
+					for !e.segmentQueue.empty() {
+						if err := e.handleSegments(false /* fastPath */); err != nil {
+							return err
+						}
+					}
+					if !e.EndpointState().closed() {
+						// Only block the worker if the endpoint
+						// is not in closed state or error state.
+						close(e.drainDone)
+						e.mu.Unlock()
+						<-e.undrain
+						e.mu.Lock()
+					}
+				}
+
+				if n&notifyTickleWorker != 0 {
+					// Just a tickle notification. No need to do
+					// anything.
+					return nil
+				}
+
+				return nil
+			},
+		},
+	}
+
+	// Initialize the sleeper based on the wakers in funcs.
+	s := sleep.Sleeper{}
+	for i := range funcs {
+		s.AddWaker(funcs[i].w, i)
+	}
+
+	// Notify the caller that the waker initialization is complete and the
+	// endpoint is ready.
+	if wakerInitDone != nil {
+		close(wakerInitDone)
+	}
+
+	// Tell waiters that the endpoint is connected and writable.
+	e.waiterQueue.Notify(waiter.EventOut)
+
+	// The following assertions and notifications are needed for restored
+	// endpoints. Fresh newly created endpoints have empty states and should
+	// not invoke any.
+	if !e.segmentQueue.empty() {
+		e.newSegmentWaker.Assert()
+	}
+
+	e.rcvListMu.Lock()
+	if !e.rcvList.Empty() {
+		e.waiterQueue.Notify(waiter.EventIn)
+	}
+	e.rcvListMu.Unlock()
+
+	if e.workerCleanup {
+		e.notifyProtocolGoroutine(notifyClose)
+	}
+
+	// Main loop. Handle segments until both send and receive ends of the
+	// connection have completed.
+	cleanupOnError := func(err *tcpip.Error) {
+		e.stack.Stats().TCP.CurrentConnected.Decrement()
+		e.workerCleanup = true
+		if err != nil {
+			e.resetConnectionLocked(err)
+		}
+		// Lock released below.
+		epilogue()
+	}
+
+loop:
+	for {
+		switch e.EndpointState() {
+		case StateTimeWait, StateClose, StateError:
+			break loop
+		}
+
+		e.mu.Unlock()
+		v, _ := s.Fetch(true)
+		e.mu.Lock()
+
+		// We need to double check here because the notification may be
+		// stale by the time we got around to processing it.
+		switch e.EndpointState() {
+		case StateError:
+			// If the endpoint has already transitioned to an ERROR
+			// state just pass nil here as any reset that may need
+			// to be sent etc should already have been done and we
+			// just want to terminate the loop and cleanup the
+			// endpoint.
+			cleanupOnError(nil)
+			return nil
+		case StateTimeWait:
+			fallthrough
+		case StateClose:
+			break loop
+		default:
+			if err := funcs[v].f(); err != nil {
+				cleanupOnError(err)
+				return nil
+			}
+		}
+	}
+
+	var reuseTW func()
+	if e.EndpointState() == StateTimeWait {
+		// Disable close timer as we are now entering real TIME_WAIT.
+		if closeTimer != nil {
+			closeTimer.Stop()
+		}
+		// Mark the current sleeper done so as to free all associated
+		// wakers.
+		s.Done()
+		// Wake up any waiters before we enter TIME_WAIT.
+		e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+		e.workerCleanup = true
+		reuseTW = e.doTimeWait()
+	}
+
+	// Handle any StateError transition from StateTimeWait.
+	if e.EndpointState() == StateError {
+		cleanupOnError(nil)
+		return nil
+	}
+
+	e.transitionToStateCloseLocked()
+
+	// Lock released below.
+	epilogue()
+
+	// A new SYN was received during TIME_WAIT and we need to abort
+	// the timewait and redirect the segment to the listener queue.
+	if reuseTW != nil {
+		reuseTW()
+	}
+
+	return nil
+}
+
+// handleTimeWaitSegments processes up to maxSegmentsPerWake queued segments in
+// TIME_WAIT; reuseTW is non-nil when a new SYN must be handed to a listener.
+func (e *endpoint) handleTimeWaitSegments() (extendTimeWait bool, reuseTW func()) {
+	checkRequeue := true
+	for i := 0; i < maxSegmentsPerWake; i++ {
+		s := e.segmentQueue.dequeue()
+		if s == nil {
+			checkRequeue = false
+			break
+		}
+		extTW, newSyn := e.rcv.handleTimeWaitSegment(s)
+		if newSyn {
+			info := e.EndpointInfo.TransportEndpointInfo
+			newID := info.ID
+			newID.RemoteAddress = ""
+			newID.RemotePort = 0
+			netProtos := []tcpip.NetworkProtocolNumber{info.NetProto}
+			// If the local address is an IPv4 address then also
+			// look for IPv6 dual stack endpoints that might be
+			// listening on the local address.
+			if newID.LocalAddress.To4() != "" {
+				netProtos = []tcpip.NetworkProtocolNumber{header.IPv4ProtocolNumber, header.IPv6ProtocolNumber}
+			}
+			for _, netProto := range netProtos {
+				if listenEP := e.stack.FindTransportEndpoint(netProto, info.TransProto, newID, &s.route); listenEP != nil {
+					tcpEP := listenEP.(*endpoint)
+					if EndpointState(tcpEP.State()) == StateListen {
+						reuseTW = func() {
+							if !tcpEP.enqueueSegment(s) {
+								s.decRef()
+								return
+							}
+							tcpEP.newSegmentWaker.Assert()
+						}
+						// We explicitly do not decRef
+						// the segment as it's still
+						// valid and being reflected to
+						// a listening endpoint.
+						return false, reuseTW
+					}
+				}
+			}
+		}
+		if extTW {
+			extendTimeWait = true
+		}
+		s.decRef()
+	}
+	if checkRequeue && !e.segmentQueue.empty() {
+		e.newSegmentWaker.Assert()
+	}
+	return extendTimeWait, nil
+}
+
+// doTimeWait is responsible for handling the TCP behaviour once a socket
+// enters the TIME_WAIT state. Optionally it can return a closure that
+// should be executed after releasing the endpoint registrations. This is
+// done in cases where a new SYN is received during TIME_WAIT that carries
+// a sequence number larger than one seen on the connection.
+func (e *endpoint) doTimeWait() (twReuse func()) {
+	// Trigger a 2 * MSL time wait state. During this period
+	// we will drop all incoming segments.
+	// NOTE: On Linux this is not configurable and is fixed at 60 seconds.
+	timeWaitDuration := DefaultTCPTimeWaitTimeout
+
+	// Get the stack wide configuration.
+	var tcpTW tcpip.TCPTimeWaitTimeoutOption
+	if err := e.stack.TransportProtocolOption(ProtocolNumber, &tcpTW); err == nil {
+		timeWaitDuration = time.Duration(tcpTW)
+	}
+
+	const newSegment = 1
+	const notification = 2
+	const timeWaitDone = 3
+
+	s := sleep.Sleeper{}
+	defer s.Done()
+	s.AddWaker(&e.newSegmentWaker, newSegment)
+	s.AddWaker(&e.notificationWaker, notification)
+
+	var timeWaitWaker sleep.Waker
+	s.AddWaker(&timeWaitWaker, timeWaitDone)
+	timeWaitTimer := time.AfterFunc(timeWaitDuration, timeWaitWaker.Assert)
+	defer timeWaitTimer.Stop()
+
+	for {
+		e.mu.Unlock()
+		v, _ := s.Fetch(true)
+		e.mu.Lock()
+		switch v {
+		case newSegment:
+			extendTimeWait, reuseTW := e.handleTimeWaitSegments()
+			if reuseTW != nil {
+				return reuseTW
+			}
+			if extendTimeWait {
+				timeWaitTimer.Reset(timeWaitDuration)
+			}
+		case notification:
+			n := e.fetchNotifications()
+			if n&notifyClose != 0 || n&notifyAbort != 0 {
+				return nil
+			}
+			if n&notifyDrain != 0 {
+				for !e.segmentQueue.empty() {
+					// Ignore extending TIME_WAIT during a
+					// save. For sockets in TIME_WAIT we just
+					// terminate the TIME_WAIT early.
+					e.handleTimeWaitSegments()
+				}
+				close(e.drainDone)
+				e.mu.Unlock()
+				<-e.undrain
+				e.mu.Lock()
+				return nil
+			}
+		case timeWaitDone:
+			return nil
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/connect_unsafe.go b/pkg/tcpip/transport/tcp/connect_unsafe.go
new file mode 100644
index 000000000..cfc304616
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/connect_unsafe.go
@@ -0,0 +1,30 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"reflect"
+	"unsafe"
+)
+
+// optionsToArray returns options' backing storage viewed as a pointer to a
+// [maxOptionSize]byte array, without copying.
+//
+// It panics if the capacity of options is smaller than maxOptionSize.
+func optionsToArray(options []byte) *[maxOptionSize]byte {
+	// Reslice to maxOptionSize; this is what panics on a too-small capacity.
+	options = options[0:maxOptionSize]
+	return (*[maxOptionSize]byte)(unsafe.Pointer((*reflect.SliceHeader)(unsafe.Pointer(&options)).Data))
+}
diff --git a/pkg/tcpip/transport/tcp/cubic.go b/pkg/tcpip/transport/tcp/cubic.go
new file mode 100644
index 000000000..7b1f5e763
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/cubic.go
@@ -0,0 +1,234 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"math"
+	"time"
+)
+
+// cubicState stores the variables related to TCP CUBIC congestion
+// control algorithm state.
+//
+// See: https://tools.ietf.org/html/rfc8312.
+// +stateify savable
+type cubicState struct {
+	// wLastMax is the previous wMax value.
+	wLastMax float64
+
+	// wMax is the value of the congestion window at the
+	// time of last congestion event.
+	wMax float64
+
+	// t denotes the time when the current congestion avoidance
+	// was entered.
+	t time.Time `state:".(unixTime)"`
+
+	// numCongestionEvents tracks the number of congestion events since last
+	// RTO.
+	numCongestionEvents int
+
+	// c is the cubic constant as specified in RFC8312. It's fixed at 0.4 as
+	// per RFC.
+	c float64
+
+	// k is the time period that the cubic function takes to increase the
+	// current window size to W_max if there are no further congestion
+	// events and is calculated using the following equation:
+	//
+	// K = cubic_root(W_max*(1-beta_cubic)/C) (Eq. 2)
+	k float64
+
+	// beta is the CUBIC multiplicative decrease factor, that is, when a
+	// congestion event is detected, CUBIC reduces its cwnd to
+	// W_cubic(0)=W_max*beta_cubic.
+	beta float64
+
+	// wC is the window computed by CUBIC at time t. It's calculated using
+	// the formula:
+	//
+	//  W_cubic(t) = C*(t-K)^3 + W_max (Eq. 1)
+	wC float64
+
+	// wEst is the TCP-friendly window estimate W_est(t) that getCwnd
+	// computes from wMax, beta and elapsed/srtt (RFC 8312 section 4.2).
+	wEst float64
+
+	s *sender // s is the sender whose sndCwnd and sndSsthresh are updated.
+}
+
+// newCubicCC returns a partially initialized cubic state for sender s: only the
+// constants beta and c, the sender itself and t (the current time) are set.
+func newCubicCC(s *sender) *cubicState {
+	return &cubicState{
+		t:    time.Now(),
+		beta: 0.7,
+		c:    0.4,
+		s:    s,
+	}
+}
+
+// enterCongestionAvoidance primes the cubic state when slow start ends without
+// an actual congestion event, e.g. when a connection that fell back to slow
+// start after a retransmit grows past the previously lowered ssThresh without
+// losing any packets.
+//
+// Refer: https://tools.ietf.org/html/rfc8312#section-4.8
+func (c *cubicState) enterCongestionAvoidance() {
+	// Only act when no congestion event has been counted yet.
+	// See https://tools.ietf.org/html/rfc8312#section-4.7 and section-4.8.
+	if c.numCongestionEvents != 0 {
+		return
+	}
+	c.k = 0
+	c.t = time.Now()
+	c.wLastMax, c.wMax = c.wMax, float64(c.s.sndCwnd)
+}
+
+// updateSlowStart grows the congestion window using the slow-start rule that
+// NewReno also uses. The window is never allowed to exceed ssThresh; when the
+// threshold is reached the state switches to congestion avoidance and the
+// acked packets that were not consumed by slow start are returned.
+func (c *cubicState) updateSlowStart(packetsAcked int) int {
+	target := c.s.sndCwnd + packetsAcked
+	if target < c.s.sndSsthresh {
+		// Still in slow start: every acked packet widens the window
+		// and nothing is left over for congestion avoidance.
+		c.s.sndCwnd = target
+		return 0
+	}
+
+	// Clamp the window at ssThresh so that it does not cross into the
+	// congestion avoidance range, and report the surplus to the caller.
+	surplus := target - c.s.sndSsthresh
+	c.s.sndCAAckCount = 0
+	c.s.sndCwnd = c.s.sndSsthresh
+
+	c.enterCongestionAvoidance()
+	return surplus
+}
+
+// Update updates cubic's internal state variables. It must be called on every
+// ACK received. NOTE(review): packets left over after slow start are not fed
+// to getCwnd here — confirm. Refer: https://tools.ietf.org/html/rfc8312#section-4
+func (c *cubicState) Update(packetsAcked int) {
+	if c.s.sndCwnd < c.s.sndSsthresh {
+		packetsAcked = c.updateSlowStart(packetsAcked)
+		if packetsAcked == 0 {
+			return
+		}
+	} else {
+		c.s.rtt.Lock()
+		srtt := c.s.rtt.srtt
+		c.s.rtt.Unlock()
+		c.s.sndCwnd = c.getCwnd(packetsAcked, c.s.sndCwnd, srtt)
+	}
+}
+
+// cubicCwnd evaluates the CUBIC window function c*t^3 + wMax. Callers pass t
+// already offset by k, i.e. (seconds elapsed since c.t) - k.
+func (c *cubicState) cubicCwnd(t float64) float64 {
+	return c.c*math.Pow(t, 3.0) + c.wMax
+}
+
+// getCwnd returns the congestion window computed by CUBIC and updates wC and
+// wEst as a side effect. Refer: https://tools.ietf.org/html/rfc8312#section-4
+func (c *cubicState) getCwnd(packetsAcked, sndCwnd int, srtt time.Duration) int {
+	elapsed := time.Since(c.t).Seconds()
+
+	// Compute the window as per Cubic after 'elapsed' time
+	// since last congestion event.
+	c.wC = c.cubicCwnd(elapsed - c.k)
+
+	// Compute the TCP friendly estimate W_est. NOTE(review): assumes srtt > 0 — confirm.
+	c.wEst = c.wMax*c.beta + (3.0*((1.0-c.beta)/(1.0+c.beta)))*(elapsed/srtt.Seconds())
+
+	// Make sure in the TCP friendly region CUBIC performs at least
+	// as well as Reno.
+	if c.wC < c.wEst && float64(sndCwnd) < c.wEst {
+		// TCP Friendly region of cubic.
+		return int(c.wEst)
+	}
+
+	// In Concave/Convex region of CUBIC, calculate what CUBIC window
+	// will be after 1 RTT and use that to grow congestion window
+	// for every ack.
+	tEst := (time.Since(c.t) + srtt).Seconds()
+	wtRtt := c.cubicCwnd(tEst - c.k)
+	// As per 4.3 for each received ACK cwnd must be incremented
+	// by (w_cubic(t+RTT) - cwnd)/cwnd.
+	cwnd := float64(sndCwnd)
+	for i := 0; i < packetsAcked; i++ {
+		// Concave/Convex regions of cubic have the same formulas.
+		// See: https://tools.ietf.org/html/rfc8312#section-4.3
+		cwnd += (wtRtt - cwnd) / cwnd
+	}
+	return int(cwnd)
+}
+
+// HandleNDupAcks implements congestionControl.HandleNDupAcks.
+func (c *cubicState) HandleNDupAcks() {
+	// See: https://tools.ietf.org/html/rfc8312#section-4.5
+	c.numCongestionEvents++
+	c.t = time.Now()
+	c.wLastMax = c.wMax // keep the previous wMax; fastConvergence compares against it
+	c.wMax = float64(c.s.sndCwnd)
+
+	c.fastConvergence()
+	c.reduceSlowStartThreshold()
+}
+
+// HandleRTOExpired implements congestionControl.HandleRTOExpired.
+func (c *cubicState) HandleRTOExpired() {
+	// See: https://tools.ietf.org/html/rfc8312#section-4.7
+	c.t = time.Now()
+	c.numCongestionEvents = 0
+	c.wLastMax = c.wMax
+	c.wMax = float64(c.s.sndCwnd)
+
+	c.fastConvergence()
+
+	// We lost a packet, so reduce ssthresh.
+	c.reduceSlowStartThreshold()
+
+	// Reduce the congestion window to 1, i.e., enter slow-start. Per
+	// RFC 5681, page 7, we must use 1 regardless of the value of the
+	// initial congestion window.
+	c.s.sndCwnd = 1
+}
+
+// fastConvergence applies the Fast Convergence heuristic described in
+// https://tools.ietf.org/html/rfc8312#section-4.6 and then refreshes k.
+func (c *cubicState) fastConvergence() {
+	// Compare against the previous peak before wLastMax is overwritten.
+	shrink := c.wMax < c.wLastMax
+	c.wLastMax = c.wMax
+	if shrink {
+		c.wMax = c.wMax * (1.0 + c.beta) / 2.0
+	}
+	// Recompute k as wMax may have changed.
+	c.k = math.Cbrt(c.wMax * (1 - c.beta) / c.c)
+}
+
+// PostRecovery implements congestionControl.PostRecovery; it resets t to now.
+func (c *cubicState) PostRecovery() {
+	c.t = time.Now()
+}
+
+// reduceSlowStartThreshold sets sndSsthresh to max(sndCwnd*beta, 2) as described
+// in https://tools.ietf.org/html/rfc8312#section-4.5.
+func (c *cubicState) reduceSlowStartThreshold() {
+	c.s.sndSsthresh = int(math.Max(float64(c.s.sndCwnd)*c.beta, 2.0))
+}
diff --git a/pkg/tcpip/transport/tcp/cubic_state.go b/pkg/tcpip/transport/tcp/cubic_state.go
new file mode 100644
index 000000000..d0f58cfaf
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/cubic_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+)
+
+// saveT is invoked by stateify. nano holds only the sub-second part of t so that time.Unix(second, nano) in loadT restores t.
+func (c *cubicState) saveT() unixTime {
+	return unixTime{c.t.Unix(), int64(c.t.Nanosecond())} // UnixNano() here would be added on top of second by time.Unix.
+}
+
+// loadT is invoked by stateify. time.Unix adds nano to second, so the saved nano must be the sub-second part only.
+func (c *cubicState) loadT(unix unixTime) {
+	c.t = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/dispatcher.go b/pkg/tcpip/transport/tcp/dispatcher.go
new file mode 100644
index 000000000..98aecab9e
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/dispatcher.go
@@ -0,0 +1,234 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"encoding/binary"
+
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// epQueue is a mutex-protected FIFO queue of endpoints.
+type epQueue struct {
+	mu   sync.Mutex   // held while list and the queued endpoints' pendingProcessing flag are changed
+	list endpointList // endpoints are pushed at the back and removed from the front
+}
+
+// enqueue appends e to the queue unless it is already pending processing.
+func (q *epQueue) enqueue(e *endpoint) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// pendingProcessing marks membership, so each endpoint is queued once.
+	if !e.pendingProcessing {
+		q.list.PushBack(e)
+		e.pendingProcessing = true
+	}
+}
+
+// dequeue pops the endpoint at the front of the queue and clears its pending
+// flag. It returns nil when the queue is empty.
+func (q *epQueue) dequeue() *endpoint {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	head := q.list.Front()
+	if head == nil {
+		return nil
+	}
+	q.list.Remove(head)
+	head.pendingProcessing = false
+	return head
+}
+
+// empty reports whether no endpoints are currently queued.
+func (q *epQueue) empty() bool {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	return q.list.Empty()
+}
+
+// processor is responsible for processing packets queued to a tcp endpoint.
+type processor struct {
+	epQ              epQueue       // endpoints queued by queueEndpoint and drained by start
+	sleeper          sleep.Sleeper // start blocks on this until one of the wakers below is asserted
+	newEndpointWaker sleep.Waker   // asserted by queueEndpoint
+	closeWaker       sleep.Waker   // asserted by close; makes start return
+}
+
+func (p *processor) close() {
+	p.closeWaker.Assert() // start leaves its loop when it fetches closeWaker
+}
+
+func (p *processor) queueEndpoint(ep *endpoint) {
+	// Queue an endpoint for processing by the processor goroutine.
+	p.epQ.enqueue(ep)
+	p.newEndpointWaker.Assert() // wake start so that it drains epQ
+}
+
+const (
+	newEndpointWaker = 1 // sleeper ID registered for processor.newEndpointWaker in dispatcher.init
+	closeWaker       = 2 // sleeper ID registered for processor.closeWaker in dispatcher.init
+)
+
+func (p *processor) start(wg *sync.WaitGroup) {
+	defer wg.Done() // lets dispatcher.wait observe that this processor has exited
+	defer p.sleeper.Done()
+
+	for {
+		if id, _ := p.sleeper.Fetch(true); id == closeWaker {
+			break
+		}
+		for {
+			ep := p.epQ.dequeue()
+			if ep == nil {
+				break
+			}
+			if ep.segmentQueue.empty() {
+				continue
+			}
+
+			// If socket has transitioned out of connected state then just let the
+			// worker handle the packet.
+			//
+			// NOTE: We read this outside of e.mu lock which means that by the time
+			// we get to handleSegments the endpoint may not be in ESTABLISHED. But
+			// this should be fine as all normal shutdown states are handled by
+			// handleSegments and if the endpoint moves to a CLOSED/ERROR state
+			// then handleSegments is a noop.
+			if ep.EndpointState() == StateEstablished && ep.mu.TryLock() {
+				// If the endpoint is in a connected state then we do direct delivery
+				// to ensure low latency and avoid scheduler interactions.
+				switch err := ep.handleSegments(true /* fastPath */); {
+				case err != nil:
+					// Send any active resets if required.
+					ep.resetConnectionLocked(err)
+					fallthrough
+				case ep.EndpointState() == StateClose:
+					ep.notifyProtocolGoroutine(notifyTickleWorker)
+				case !ep.segmentQueue.empty():
+					p.epQ.enqueue(ep)
+				}
+				ep.mu.Unlock()
+			} else {
+				ep.newSegmentWaker.Assert()
+			}
+		}
+	}
+}
+
+// dispatcher manages a pool of TCP endpoint processors which are responsible
+// for the processing of inbound segments. This fixed pool of processor
+// goroutines do full tcp processing. The processor is selected based on the
+// hash of the endpoint id to ensure that delivery for the same endpoint happens
+// in-order.
+type dispatcher struct {
+	processors []processor    // pool allocated by init
+	seed       uint32         // random seed for the endpoint-ID hash in selectProcessor
+	wg         sync.WaitGroup // counts the processor goroutines started by init
+}
+
+func (d *dispatcher) init(nProcessors int) {
+	d.close() // stop any processors left over from a previous init
+	d.wait()
+	d.processors = make([]processor, nProcessors)
+	d.seed = generateRandUint32()
+	for i := range d.processors {
+		p := &d.processors[i]
+		p.sleeper.AddWaker(&p.newEndpointWaker, newEndpointWaker)
+		p.sleeper.AddWaker(&p.closeWaker, closeWaker)
+		d.wg.Add(1)
+		// NB: sleeper-waker registration must happen synchronously to avoid races
+		// with `close`.  It's possible to pull all this logic into `start`, but
+		// that results in a heap-allocated function literal.
+		go p.start(&d.wg)
+	}
+}
+
+func (d *dispatcher) close() {
+	for i := range d.processors {
+		d.processors[i].close() // only signals the processor; use wait to block until it exits
+	}
+}
+
+func (d *dispatcher) wait() {
+	d.wg.Wait() // returns once every processor goroutine started by init has called wg.Done
+}
+
+func (d *dispatcher) queuePacket(r *stack.Route, stackEP stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
+	ep := stackEP.(*endpoint)
+	s := newSegment(r, id, pkt)
+	if !s.parse() {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
+		ep.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+		s.decRef()
+		return
+	}
+
+	if !s.csumValid {
+		ep.stack.Stats().MalformedRcvdPackets.Increment()
+		ep.stack.Stats().TCP.ChecksumErrors.Increment()
+		ep.stats.ReceiveErrors.ChecksumErrors.Increment()
+		s.decRef()
+		return
+	}
+
+	ep.stack.Stats().TCP.ValidSegmentsReceived.Increment()
+	ep.stats.SegmentsReceived.Increment()
+	if (s.flags & header.TCPFlagRst) != 0 {
+		ep.stack.Stats().TCP.ResetsReceived.Increment()
+	}
+
+	if !ep.enqueueSegment(s) {
+		s.decRef() // the segment was not queued, so drop our reference to it
+		return
+	}
+
+	// For sockets not in established state, wake the endpoint's worker
+	// goroutine and let it handle the queued segment.
+	if ep.EndpointState() != StateEstablished {
+		ep.newSegmentWaker.Assert()
+		return
+	}
+
+	d.selectProcessor(id).queueEndpoint(ep) // the processor is chosen by hashing id with d.seed
+}
+
+func generateRandUint32() uint32 {
+	b := make([]byte, 4)
+	if _, err := rand.Read(b); err != nil {
+		panic(err)
+	}
+	return binary.LittleEndian.Uint32(b)
+}
+
+func (d *dispatcher) selectProcessor(id stack.TransportEndpointID) *processor {
+	var ports [4]byte
+	binary.LittleEndian.PutUint16(ports[:2], id.LocalPort)
+	binary.LittleEndian.PutUint16(ports[2:], id.RemotePort)
+	// Hash ports and addresses, seeded with d.seed, to pick a processor.
+	hasher := jenkins.Sum32(d.seed)
+	hasher.Write(ports[:])
+	hasher.Write([]byte(id.LocalAddress))
+	hasher.Write([]byte(id.RemoteAddress))
+	idx := hasher.Sum32() % uint32(len(d.processors))
+	return &d.processors[idx]
+}
diff --git a/pkg/tcpip/transport/tcp/dual_stack_test.go b/pkg/tcpip/transport/tcp/dual_stack_test.go
new file mode 100644
index 000000000..804e95aea
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/dual_stack_test.go
@@ -0,0 +1,651 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_test
+
+import (
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TestV4MappedConnectOnV6Only verifies that a v6-only endpoint cannot connect
+// to a v4-mapped address: Connect must return ErrNoRoute.
+func TestV4MappedConnectOnV6Only(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(true)
+
+	dst := tcpip.FullAddress{Addr: context.TestV4MappedAddr, Port: context.TestPort}
+	if err := tc.EP.Connect(dst); err != tcpip.ErrNoRoute {
+		t.Fatalf("Unexpected return value from Connect: %v", err)
+	}
+}
+
+// testV4Connect connects c.EP to the v4-mapped test address and plays the peer:
+// it checks the SYN, replies with a SYN-ACK, checks the ACK and awaits EventOut.
+func testV4Connect(t *testing.T, c *context.Context, checkers ...checker.NetworkChecker) {
+	// Register for EventOut before starting the connection attempt.
+	entry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&entry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&entry)
+
+	if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestV4MappedAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+		t.Fatalf("Unexpected return value from Connect: %v", err)
+	}
+
+	// The first packet out must be an IPv4 SYN to the test port.
+	synPkt := c.GetPacket()
+	synChecks := append(checkers, checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagSyn),
+	))
+	checker.IPv4(t, synPkt, synChecks...)
+
+	synHdr := header.TCP(header.IPv4(synPkt).Payload())
+	c.IRS = seqnum.Value(synHdr.SequenceNumber())
+
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: synHdr.DestinationPort(),
+		DstPort: synHdr.SourcePort(),
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// The endpoint must acknowledge our SYN-ACK.
+	ackChecks := append(checkers, checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(c.IRS)+1),
+		checker.AckNum(uint32(iss)+1),
+	))
+	checker.IPv4(t, c.GetPacket(), ackChecks...)
+
+	// Wait for the notification, then make sure no error was recorded.
+	select {
+	case <-notifyCh:
+		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+			t.Fatalf("Unexpected error when connecting: %v", err)
+		}
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for connection")
+	}
+}
+
+// TestV4MappedConnect connects a dual-stack v6 endpoint to a v4-mapped address.
+func TestV4MappedConnect(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// The endpoint is left unbound; go straight to the v4 handshake.
+	testV4Connect(t, tc)
+}
+
+// TestV4ConnectWhenBoundToWildcard checks that a dual-stack endpoint bound to
+// the wildcard address can connect to a v4-mapped address.
+func TestV4ConnectWhenBoundToWildcard(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// A zero-valued FullAddress is the wildcard bind.
+	if err := tc.EP.Bind(tcpip.FullAddress{}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV4Connect(t, tc)
+}
+
+// TestV4ConnectWhenBoundToV4MappedWildcard checks that a dual-stack endpoint
+// bound to the v4-mapped wildcard address can connect to a v4-mapped address.
+func TestV4ConnectWhenBoundToV4MappedWildcard(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// Bind to the v4-mapped form of the wildcard address.
+	if err := tc.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV4Connect(t, tc)
+}
+
+// TestV4ConnectWhenBoundToV4Mapped checks that a dual-stack endpoint bound to
+// the stack's own v4-mapped address can connect to a v4-mapped address.
+func TestV4ConnectWhenBoundToV4Mapped(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// Bind to the v4-mapped form of the stack's address.
+	if err := tc.EP.Bind(tcpip.FullAddress{Addr: context.StackV4MappedAddr}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV4Connect(t, tc)
+}
+
+// testV6Connect connects c.EP to the IPv6 test address and plays the peer: it
+// checks the SYN, replies with a SYN-ACK, checks the ACK and awaits EventOut.
+func testV6Connect(t *testing.T, c *context.Context, checkers ...checker.NetworkChecker) {
+	// Register for EventOut before starting the connection attempt.
+	entry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&entry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&entry)
+
+	if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestV6Addr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+		t.Fatalf("Unexpected return value from Connect: %v", err)
+	}
+
+	// The first packet out must be an IPv6 SYN to the test port.
+	synPkt := c.GetV6Packet()
+	synChecks := append(checkers, checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagSyn),
+	))
+	checker.IPv6(t, synPkt, synChecks...)
+
+	synHdr := header.TCP(header.IPv6(synPkt).Payload())
+	c.IRS = seqnum.Value(synHdr.SequenceNumber())
+
+	iss := seqnum.Value(789)
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: synHdr.DestinationPort(),
+		DstPort: synHdr.SourcePort(),
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// The endpoint must acknowledge our SYN-ACK.
+	ackChecks := append(checkers, checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(c.IRS)+1),
+		checker.AckNum(uint32(iss)+1),
+	))
+	checker.IPv6(t, c.GetV6Packet(), ackChecks...)
+
+	// Wait for the notification, then make sure no error was recorded.
+	select {
+	case <-notifyCh:
+		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+			t.Fatalf("Unexpected error when connecting: %v", err)
+		}
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for connection")
+	}
+}
+
+// TestV6Connect connects an unbound dual-stack v6 endpoint to an IPv6 address.
+func TestV6Connect(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// The endpoint is left unbound; go straight to the v6 handshake.
+	testV6Connect(t, tc)
+}
+
+// TestV6ConnectV6Only connects an unbound v6-only endpoint to an IPv6 address.
+func TestV6ConnectV6Only(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(true)
+
+	// V6-only must not get in the way of a native IPv6 handshake.
+	testV6Connect(t, tc)
+}
+
+// TestV6ConnectWhenBoundToWildcard checks that a dual-stack endpoint bound to
+// the wildcard address can connect to an IPv6 address.
+func TestV6ConnectWhenBoundToWildcard(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// A zero-valued FullAddress is the wildcard bind.
+	if err := tc.EP.Bind(tcpip.FullAddress{}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV6Connect(t, tc)
+}
+
+// TestV6ConnectWhenBoundToLocalAddress checks that a dual-stack endpoint bound
+// to the stack's IPv6 address can connect to an IPv6 address.
+func TestV6ConnectWhenBoundToLocalAddress(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// Bind to the stack's own IPv6 address.
+	if err := tc.EP.Bind(tcpip.FullAddress{Addr: context.StackV6Addr}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV6Connect(t, tc)
+}
+
+// TestV4RefuseOnV6Only verifies that a v6-only listener does not accept IPv4
+// connections: an IPv4 SYN sent to its port must be answered with RST|ACK.
+func TestV4RefuseOnV6Only(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+
+	tc.CreateV6Endpoint(true)
+
+	// Bind to the wildcard address on the stack port.
+	if err := tc.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	// Put the endpoint into the listening state.
+	if err := tc.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %v", err)
+	}
+
+	// Send an IPv4 SYN to the listener's port.
+	peerISN := seqnum.Value(789)
+	tc.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  peerISN,
+		RcvWnd:  30000,
+	})
+
+	// The reply must be a reset that acknowledges the SYN.
+	checker.IPv4(t, tc.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.AckNum(uint32(peerISN)+1),
+	))
+}
+
+// TestV6RefuseOnBoundToV4Mapped verifies that a listener bound to the v4-mapped
+// wildcard address refuses IPv6 connections: an IPv6 SYN must get RST|ACK.
+func TestV6RefuseOnBoundToV4Mapped(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+
+	tc.CreateV6Endpoint(false)
+
+	// Bind to the v4-mapped wildcard address, then listen.
+	if err := tc.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr, Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	if err := tc.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %v", err)
+	}
+
+	// Send an IPv6 SYN to the listener's port.
+	peerISN := seqnum.Value(789)
+	tc.SendV6Packet(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  peerISN,
+		RcvWnd:  30000,
+	})
+
+	// The reply must be a reset that acknowledges the SYN.
+	checker.IPv6(t, tc.GetV6Packet(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.AckNum(uint32(peerISN)+1),
+	))
+}
+
+// testV4Accept makes the already-bound c.EP listen, completes an IPv4 handshake
+// against it, accepts the connection and checks V6Only, peer address and send.
+func testV4Accept(t *testing.T, c *context.Context) {
+	c.SetGSOEnabled(true)
+	defer c.SetGSOEnabled(false)
+
+	// Start listening.
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %v", err)
+	}
+
+	// Send a SYN request.
+	irs := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcp := header.TCP(header.IPv4(b).Payload())
+	iss := seqnum.Value(tcp.SequenceNumber())
+	checker.IPv4(t, b, checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1),
+	))
+
+	// Send ACK.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	nep, _, err := c.EP.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			nep, _, err = c.EP.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// An error from either Accept call is fatal: nep is not usable below.
+	if err != nil {
+		t.Fatalf("Accept failed: %v", err)
+	}
+
+	// Make sure we get the same error when calling the original ep and the
+	// new one. This validates that v4-mapped endpoints are still able to
+	// query the V6Only flag, whereas pure v4 endpoints are not.
+	_, expected := c.EP.GetSockOptBool(tcpip.V6OnlyOption)
+	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != expected {
+		t.Fatalf("GetSockOpt returned unexpected value: got %v, want %v", err, expected)
+	}
+
+	// Check the peer address.
+	addr, err := nep.GetRemoteAddress()
+	if err != nil {
+		t.Fatalf("GetRemoteAddress failed: %v", err)
+	}
+
+	if addr.Addr != context.TestAddr {
+		t.Fatalf("Unexpected remote address: got %v, want %v", addr.Addr, context.TestAddr)
+	}
+
+	data := "Don't panic"
+	nep.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
+	b = c.GetPacket()
+	tcp = header.TCP(header.IPv4(b).Payload())
+	if string(tcp.Payload()) != data {
+		t.Fatalf("Unexpected data: got %v, want %v", string(tcp.Payload()), data)
+	}
+}
+
+// TestV4AcceptOnV6 checks that a dual-stack v6 endpoint bound to the wildcard
+// address accepts an incoming IPv4 connection.
+func TestV4AcceptOnV6(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// Wildcard address, stack port; testV4Accept does the listening.
+	if err := tc.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV4Accept(t, tc)
+}
+
+// TestV4AcceptOnBoundToV4MappedWildcard checks that a dual-stack endpoint bound
+// to the v4-mapped wildcard address accepts an incoming IPv4 connection.
+func TestV4AcceptOnBoundToV4MappedWildcard(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// V4-mapped wildcard, stack port; testV4Accept does the listening.
+	if err := tc.EP.Bind(tcpip.FullAddress{Addr: context.V4MappedWildcardAddr, Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV4Accept(t, tc)
+}
+
+// TestV4AcceptOnBoundToV4Mapped checks that a dual-stack endpoint bound to the
+// stack's v4-mapped address accepts an incoming IPv4 connection.
+func TestV4AcceptOnBoundToV4Mapped(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+	tc.CreateV6Endpoint(false)
+
+	// Only bind here; testV4Accept does the listening.
+	if err := tc.EP.Bind(tcpip.FullAddress{Addr: context.StackV4MappedAddr, Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	testV4Accept(t, tc)
+}
+
+// TestV6AcceptOnV6 completes an IPv6 handshake against a dual-stack listener
+// and checks that the accepted endpoint is a v6 socket with the right peer.
+func TestV6AcceptOnV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(false)
+
+	// Bind and listen.
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %v", err)
+	}
+
+	// Send a SYN request.
+	irs := seqnum.Value(789)
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetV6Packet()
+	tcp := header.TCP(header.IPv6(b).Payload())
+	iss := seqnum.Value(tcp.SequenceNumber())
+	checker.IPv6(t, b, checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1),
+	))
+
+	// Send ACK.
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	nep, _, err := c.EP.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			nep, _, err = c.EP.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// An error from either Accept call is fatal: nep is not usable below.
+	if err != nil {
+		t.Fatalf("Accept failed: %v", err)
+	}
+
+	// Make sure we can still query the v6 only status of the new endpoint,
+	// that is, that it is in fact a v6 socket.
+	if _, err := nep.GetSockOptBool(tcpip.V6OnlyOption); err != nil {
+		t.Fatalf("GetSockOpt failed: %v", err)
+	}
+
+	// Check the peer address.
+	addr, err := nep.GetRemoteAddress()
+	if err != nil {
+		t.Fatalf("GetRemoteAddress failed: %v", err)
+	}
+
+	if addr.Addr != context.TestV6Addr {
+		t.Fatalf("Unexpected remote address: got %v, want %v", addr.Addr, context.TestV6Addr)
+	}
+}
+
+// TestV4AcceptOnV4 runs the v4 accept scenario on a pure IPv4 endpoint bound
+// to the wildcard address.
+func TestV4AcceptOnV4(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+
+	// Create an IPv4 TCP endpoint directly on the stack.
+	var err *tcpip.Error
+	tc.EP, err = tc.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &tc.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	// Wildcard address, stack port; testV4Accept does the listening.
+	if err := tc.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+	testV4Accept(t, tc)
+}
+
+// testV4ListenClose accepts one of n syn-cookie connections, then closes it and the listener.
+func testV4ListenClose(t *testing.T, c *context.Context) {
+	// Set the SynRcvd threshold to zero to force a syn cookie based accept
+	// to happen.
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption failed: %s", err)
+	}
+
+	const n = uint16(32)
+
+	// Start listening.
+	if err := c.EP.Listen(int(tcp.SynRcvdCountThreshold + 1)); err != nil {
+		t.Fatalf("Listen failed: %v", err)
+	}
+
+	irs := seqnum.Value(789)
+	for i := uint16(0); i < n; i++ {
+		// Send a SYN request.
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort + i,
+			DstPort: context.StackPort,
+			Flags:   header.TCPFlagSyn,
+			SeqNum:  irs,
+			RcvWnd:  30000,
+		})
+	}
+
+	// Each of these ACK's will cause a syn-cookie based connection to be
+	// accepted and delivered to the listening endpoint.
+	for i := uint16(0); i < n; i++ {
+		b := c.GetPacket()
+		tcp := header.TCP(header.IPv4(b).Payload())
+		iss := seqnum.Value(tcp.SequenceNumber())
+		// Send ACK.
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: tcp.DestinationPort(),
+			DstPort: context.StackPort,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  irs + 1,
+			AckNum:  iss + 1,
+			RcvWnd:  30000,
+		})
+	}
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+	nep, _, err := c.EP.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			nep, _, err = c.EP.Accept()
+		case <-time.After(10 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil {
+		t.Fatalf("Accept failed: %v", err)
+	}
+	nep.Close()
+	c.EP.Close()
+}
+
+// TestV4ListenCloseOnV4 runs the listen-then-close scenario on a pure IPv4
+// endpoint bound to the wildcard address.
+func TestV4ListenCloseOnV4(t *testing.T) {
+	tc := context.New(t, defaultMTU)
+	defer tc.Cleanup()
+
+	// Create an IPv4 TCP endpoint directly on the stack.
+	var err *tcpip.Error
+	tc.EP, err = tc.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &tc.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	// Wildcard address, stack port; testV4ListenClose listens and closes.
+	if err := tc.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %v", err)
+	}
+	testV4ListenClose(t, tc)
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint.go b/pkg/tcpip/transport/tcp/endpoint.go
new file mode 100644
index 000000000..caac6ef57
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/endpoint.go
@@ -0,0 +1,2888 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"encoding/binary"
+	"fmt"
+	"math"
+	"runtime"
+	"strings"
+	"sync/atomic"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/rand"
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// EndpointState represents the state of a TCP endpoint.
+type EndpointState uint32
+
+// Endpoint states. Note that they are represented in a netstack-specific
+// manner and may not be meaningful externally. Specifically, they need to be
+// translated to Linux's representation if presented to userspace.
+const (
+	// Endpoint states internal to netstack. These map to the TCP state CLOSED.
+	StateInitial EndpointState = iota
+	StateBound
+	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
+	StateError
+
+	// TCP protocol states.
+	StateEstablished
+	StateSynSent
+	StateSynRecv
+	StateFinWait1
+	StateFinWait2
+	StateTimeWait
+	StateClose
+	StateCloseWait
+	StateLastAck
+	StateListen
+	StateClosing
+)
+
+// connected reports whether s is one of the states in which the endpoint is
+// connected to a peer: ESTABLISHED, FIN-WAIT1, FIN-WAIT2, TIME-WAIT,
+// CLOSE-WAIT, LAST-ACK or CLOSING.
+func (s EndpointState) connected() bool {
+	switch s {
+	case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
+		return true
+	}
+	return false
+}
+
+// connecting reports whether s describes a connection that is in progress
+// but not yet fully established: StateConnecting, StateSynSent or
+// StateSynRecv.
+func (s EndpointState) connecting() bool {
+	switch s {
+	case StateConnecting, StateSynSent, StateSynRecv:
+		return true
+	}
+	return false
+}
+
+// handshake reports whether s is a state in the middle of a TCP handshake,
+// i.e. StateSynSent or StateSynRecv. Unlike connecting, it does not include
+// StateConnecting.
+func (s EndpointState) handshake() bool {
+	switch s {
+	case StateSynSent, StateSynRecv:
+		return true
+	}
+	return false
+}
+
+// closed reports whether s is a state an endpoint moves to when it is closed
+// or hits an error (StateClose or StateError). A freshly initialized endpoint
+// that was never connected (StateInitial) is deliberately not counted as
+// closed.
+func (s EndpointState) closed() bool {
+	switch s {
+	case StateClose, StateError:
+		return true
+	}
+	return false
+}
+
+// String implements fmt.Stringer.String. It returns an upper-case name for
+// each declared state and panics for any other value.
+func (s EndpointState) String() string {
+	switch s {
+	case StateInitial:
+		return "INITIAL"
+	case StateBound:
+		return "BOUND"
+	case StateConnecting:
+		return "CONNECTING"
+	case StateError:
+		return "ERROR"
+	case StateEstablished:
+		return "ESTABLISHED"
+	case StateSynSent:
+		return "SYN-SENT"
+	case StateSynRecv:
+		return "SYN-RCVD"
+	case StateFinWait1:
+		return "FIN-WAIT1"
+	case StateFinWait2:
+		return "FIN-WAIT2"
+	case StateTimeWait:
+		return "TIME-WAIT"
+	case StateClose:
+		return "CLOSED"
+	case StateCloseWait:
+		return "CLOSE-WAIT"
+	case StateLastAck:
+		return "LAST-ACK"
+	case StateListen:
+		return "LISTEN"
+	case StateClosing:
+		return "CLOSING"
+	}
+	panic("unreachable")
+}
+
+// Reasons for notifying the protocol goroutine. Each reason is a distinct bit.
+const (
+	notifyNonZeroReceiveWindow = 1 << iota
+	notifyReceiveWindowChanged
+	notifyClose
+	notifyMTUChanged
+	notifyDrain
+	notifyReset
+	notifyResetByPeer
+	// notifyAbort is a request for an expedited teardown.
+	notifyAbort
+	notifyKeepaliveChanged
+	notifyMSSChanged
+	// notifyTickleWorker is used to tickle the protocol main loop during a
+	// restore after we update the endpoint state to the correct one. This
+	// ensures the loop terminates if the final state of the endpoint is,
+	// say, TIME_WAIT.
+	notifyTickleWorker
+	notifyError
+)
+
+// SACKInfo holds TCP SACK related information for a given endpoint.
+//
+// +stateify savable
+type SACKInfo struct {
+	// Blocks holds the SACK blocks tracked for the endpoint; at most
+	// MaxSACKBlocks are kept.
+	Blocks [MaxSACKBlocks]header.SACKBlock
+
+	// NumBlocks is the number of valid SACK blocks stored in the
+	// Blocks array above.
+	NumBlocks int
+}
+
+// rcvBufAutoTuneParams holds the state variables used to compute the
+// auto-tuned receive buffer size.
+//
+// +stateify savable
+type rcvBufAutoTuneParams struct {
+	// measureTime is the time at which the current measurement
+	// was started.
+	measureTime time.Time `state:".(unixTime)"`
+
+	// copied is the number of bytes copied out of the receive
+	// buffers since this measurement began.
+	copied int
+
+	// prevCopied is the number of bytes copied out of the receive
+	// buffers in the previous RTT period.
+	prevCopied int
+
+	// rtt is the non-smoothed minimum RTT as measured by observing the time
+	// between when a byte is first acknowledged and the receipt of data
+	// that is at least one window beyond the sequence number that was
+	// acknowledged.
+	rtt time.Duration
+
+	// rttMeasureSeqNumber is the highest acceptable sequence number at the
+	// time this RTT measurement period began.
+	rttMeasureSeqNumber seqnum.Value
+
+	// rttMeasureTime is the absolute time at which the current RTT
+	// measurement period began.
+	rttMeasureTime time.Time `state:".(unixTime)"`
+
+	// disabled is true if an explicit receive buffer is set for the
+	// endpoint.
+	disabled bool
+}
+
+// ReceiveErrors collects segment receive errors within the transport layer.
+type ReceiveErrors struct {
+	tcpip.ReceiveErrors
+
+	// SegmentQueueDropped is the number of segments dropped due to
+	// a full segment queue.
+	SegmentQueueDropped tcpip.StatCounter
+
+	// ChecksumErrors is the number of segments dropped due to bad checksums.
+	ChecksumErrors tcpip.StatCounter
+
+	// ListenOverflowSynDrop is the number of times the listen queue overflowed
+	// and a SYN was dropped.
+	ListenOverflowSynDrop tcpip.StatCounter
+
+	// ListenOverflowAckDrop is the number of times the final ACK
+	// in the handshake was dropped due to overflow.
+	ListenOverflowAckDrop tcpip.StatCounter
+
+	// ZeroRcvWindowState is the number of times we advertised
+	// a zero receive window when rcvList is full.
+	ZeroRcvWindowState tcpip.StatCounter
+}
+
+// SendErrors collects segment send errors within the transport layer.
+type SendErrors struct {
+	tcpip.SendErrors
+
+	// SegmentSendToNetworkFailed is the number of TCP segments that failed
+	// to be sent to the network endpoint.
+	SegmentSendToNetworkFailed tcpip.StatCounter
+
+	// SynSendToNetworkFailed is the number of TCP SYNs that failed to be
+	// sent to the network endpoint.
+	SynSendToNetworkFailed tcpip.StatCounter
+
+	// Retransmits is the number of TCP segments retransmitted.
+	Retransmits tcpip.StatCounter
+
+	// FastRetransmit is the number of segments retransmitted in fast
+	// recovery.
+	FastRetransmit tcpip.StatCounter
+
+	// Timeouts is the number of times the RTO expired.
+	Timeouts tcpip.StatCounter
+}
+
+// Stats holds per-endpoint TCP statistics; *Stats implements tcpip.EndpointStats.
+type Stats struct {
+	// SegmentsReceived is the number of TCP segments received that
+	// the transport layer successfully parsed.
+	SegmentsReceived tcpip.StatCounter
+
+	// SegmentsSent is the number of TCP segments sent.
+	SegmentsSent tcpip.StatCounter
+
+	// FailedConnectionAttempts is the number of times Connect or Accept
+	// resulted in an error.
+	FailedConnectionAttempts tcpip.StatCounter
+
+	// ReceiveErrors collects segment receive errors within the
+	// transport layer.
+	ReceiveErrors ReceiveErrors
+
+	// ReadErrors collects segment read errors from an endpoint read call.
+	ReadErrors tcpip.ReadErrors
+
+	// SendErrors collects segment send errors within the transport layer.
+	SendErrors SendErrors
+
+	// WriteErrors collects segment write errors from an endpoint write call.
+	WriteErrors tcpip.WriteErrors
+}
+
+// IsEndpointStats is a no-op whose only purpose is to make *Stats satisfy
+// the tcpip.EndpointStats marker interface.
+func (*Stats) IsEndpointStats() {}
+
+// EndpointInfo holds useful information about a transport endpoint which
+// can be queried by monitoring tools.
+//
+// +stateify savable
+type EndpointInfo struct {
+	stack.TransportEndpointInfo
+
+	// HardError is meaningful only when state is StateError. It stores the
+	// error to be returned when read/write syscalls are called and the
+	// endpoint is in this state. HardError is protected by endpoint mu.
+	HardError *tcpip.Error `state:".(string)"`
+}
+
+// IsEndpointInfo is a no-op whose only purpose is to make *EndpointInfo
+// satisfy the tcpip.EndpointInfo marker interface.
+func (*EndpointInfo) IsEndpointInfo() {}
+
+// endpoint represents a TCP endpoint. This struct serves as the interface
+// between users of the endpoint and the protocol implementation; it is legal to
+// have concurrent goroutines make calls into the endpoint, they are properly
+// synchronized. The protocol implementation, however, runs in a single
+// goroutine.
+//
+// Each endpoint has a few mutexes:
+//
+// e.mu -> Primary mutex for an endpoint must be held for all operations except
+// in e.Readiness where acquiring it will result in a deadlock in epoll
+// implementation.
+//
+// The following four mutexes can be acquired independent of e.mu but if
+// acquired with e.mu then e.mu must be acquired first.
+//
+// e.acceptMu -> protects acceptedChan.
+// e.rcvListMu -> Protects the rcvList and associated fields.
+// e.sndBufMu -> Protects the sndQueue and associated fields.
+// e.lastErrorMu -> Protects the lastError field.
+//
+// LOCKING/UNLOCKING of the endpoint.  The locking of an endpoint is different
+// based on the context in which the lock is acquired. In the syscall context
+// e.LockUser/e.UnlockUser should be used and when doing background processing
+// e.mu.Lock/e.mu.Unlock should be used. The distinction is described below
+// in brief.
+//
+// The reason for this locking behaviour is to avoid wakeups to handle packets.
+// In cases where the endpoint is already locked the background processor can
+// queue the packet up and go its merry way and the lock owner will eventually
+// process the backlog when releasing the lock. Similarly when acquiring the
+// lock from say a syscall goroutine we can implement a bit of spinning if we
+// know that the lock is not held by another syscall goroutine. Background
+// processors should never hold the lock for long and we can avoid an expensive
+// sleep/wakeup by spinning for a short while.
+//
+// For more details please see the detailed documentation on
+// e.LockUser/e.UnlockUser methods.
+//
+// +stateify savable
+type endpoint struct {
+	EndpointInfo
+
+	// endpointEntry is used to queue endpoints for processing to the
+	// a given tcp processor goroutine.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field..
+	endpointEntry `state:"nosave"`
+
+	// pendingProcessing is true if this endpoint is queued for processing
+	// to a TCP processor.
+	//
+	// Precondition: epQueue.mu must be held to read/write this field..
+	pendingProcessing bool `state:"nosave"`
+
+	// The following fields are initialized at creation time and do not
+	// change throughout the lifetime of the endpoint.
+	stack       *stack.Stack  `state:"manual"`
+	waiterQueue *waiter.Queue `state:"wait"`
+	uniqueID    uint64
+
+	// lastError represents the last error that the endpoint reported;
+	// access to it is protected by the following mutex.
+	lastErrorMu sync.Mutex   `state:"nosave"`
+	lastError   *tcpip.Error `state:".(string)"`
+
+	// The following fields are used to manage the receive queue. The
+	// protocol goroutine adds ready-for-delivery segments to rcvList,
+	// which are returned by Read() calls to users.
+	//
+	// Once the peer has closed its send side, rcvClosed is set to true
+	// to indicate to users that no more data is coming.
+	//
+	// rcvListMu can be taken after the endpoint mu below.
+	rcvListMu     sync.Mutex  `state:"nosave"`
+	rcvList       segmentList `state:"wait"`
+	rcvClosed     bool
+	rcvBufSize    int
+	rcvBufUsed    int
+	rcvAutoParams rcvBufAutoTuneParams
+
+	// mu protects all endpoint fields unless documented otherwise. mu must
+	// be acquired before interacting with the endpoint fields.
+	mu          sync.Mutex `state:"nosave"`
+	ownedByUser uint32
+
+	// state must be read/set using the EndpointState()/setEndpointState()
+	// methods.
+	state EndpointState `state:".(EndpointState)"`
+
+	// origEndpointState is only used during a restore phase to save the
+	// endpoint state at restore time as the socket is moved to it's correct
+	// state.
+	origEndpointState EndpointState `state:"nosave"`
+
+	isPortReserved    bool `state:"manual"`
+	isRegistered      bool `state:"manual"`
+	boundNICID        tcpip.NICID
+	route             stack.Route `state:"manual"`
+	ttl               uint8
+	v6only            bool
+	isConnectNotified bool
+	// TCP should never broadcast but Linux nevertheless supports enabling/
+	// disabling SO_BROADCAST, albeit as a NOOP.
+	broadcast bool
+
+	// portFlags stores the current values of port related flags.
+	portFlags ports.Flags
+
+	// Values used to reserve a port or register a transport endpoint
+	// (which ever happens first).
+	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
+	boundDest         tcpip.FullAddress
+
+	// effectiveNetProtos contains the network protocols actually in use. In
+	// most cases it will only contain "netProto", but in cases like IPv6
+	// endpoints with v6only set to false, this could include multiple
+	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
+	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
+	// address).
+	effectiveNetProtos []tcpip.NetworkProtocolNumber
+
+	// workerRunning specifies if a worker goroutine is running.
+	workerRunning bool
+
+	// workerCleanup specifies if the worker goroutine must perform cleanup
+	// before exiting. This can only be set to true when workerRunning is
+	// also true, and they're both protected by the mutex.
+	workerCleanup bool
+
+	// sendTSOk is used to indicate when the TS Option has been negotiated.
+	// When sendTSOk is true every non-RST segment should carry a TS as per
+	// RFC7323#section-1.1
+	sendTSOk bool
+
+	// recentTS is the timestamp that should be sent in the TSEcr field of
+	// the timestamp for future segments sent by the endpoint. This field is
+	// updated if required when a new segment is received by this endpoint.
+	//
+	// recentTS must be read/written atomically.
+	recentTS uint32
+
+	// tsOffset is a randomized offset added to the value of the
+	// TSVal field in the timestamp option.
+	tsOffset uint32
+
+	// shutdownFlags represent the current shutdown state of the endpoint.
+	shutdownFlags tcpip.ShutdownFlags
+
+	// sackPermitted is set to true if the peer sends the TCPSACKPermitted
+	// option in the SYN/SYN-ACK.
+	sackPermitted bool
+
+	// sack holds TCP SACK related information for this endpoint.
+	sack SACKInfo
+
+	// bindToDevice is set to the NIC on which to bind or disabled if 0.
+	bindToDevice tcpip.NICID
+
+	// delay enables Nagle's algorithm.
+	//
+	// delay is a boolean (0 is false) and must be accessed atomically.
+	delay uint32
+
+	// cork holds back segments until full.
+	//
+	// cork is a boolean (0 is false) and must be accessed atomically.
+	cork uint32
+
+	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
+	scoreboard *SACKScoreboard
+
+	// The options below aren't implemented, but we remember the user
+	// settings because applications expect to be able to set/query these
+	// options.
+
+	// slowAck holds the negated state of quick ack. It is stubbed out and
+	// does nothing.
+	//
+	// slowAck is a boolean (0 is false) and must be accessed atomically.
+	slowAck uint32
+
+	// segmentQueue is used to hand received segments to the protocol
+	// goroutine. Segments are queued as long as the queue is not full,
+	// and dropped when it is.
+	segmentQueue segmentQueue `state:"wait"`
+
+	// synRcvdCount is the number of connections for this endpoint that are
+	// in SYN-RCVD state.
+	synRcvdCount int
+
+	// userMSS if non-zero is the MSS value explicitly set by the user
+	// for this endpoint using the TCP_MAXSEG setsockopt.
+	userMSS uint16
+
+	// maxSynRetries is the maximum number of SYN retransmits that TCP should
+	// send before aborting the attempt to connect. It cannot exceed 255.
+	//
+	// NOTE: This is currently a no-op and does not change the SYN
+	// retransmissions.
+	maxSynRetries uint8
+
+	// windowClamp is used to bound the size of the advertised window to
+	// this value.
+	windowClamp uint32
+
+	// The following fields are used to manage the send buffer. When
+	// segments are ready to be sent, they are added to sndQueue and the
+	// protocol goroutine is signaled via sndWaker.
+	//
+	// When the send side is closed, the protocol goroutine is notified via
+	// sndCloseWaker, and sndClosed is set to true.
+	sndBufMu      sync.Mutex `state:"nosave"`
+	sndBufSize    int
+	sndBufUsed    int
+	sndClosed     bool
+	sndBufInQueue seqnum.Size
+	sndQueue      segmentList `state:"wait"`
+	sndWaker      sleep.Waker `state:"manual"`
+	sndCloseWaker sleep.Waker `state:"manual"`
+
+	// cc stores the name of the Congestion Control algorithm to use for
+	// this endpoint.
+	cc tcpip.CongestionControlOption
+
+	// The following are used when a "packet too big" control packet is
+	// received. They are protected by sndBufMu. They are used to
+	// communicate to the main protocol goroutine how many such control
+	// messages have been received since the last notification was processed
+	// and what was the smallest MTU seen.
+	packetTooBigCount int
+	sndMTU            int
+
+	// newSegmentWaker is used to indicate to the protocol goroutine that
+	// it needs to wake up and handle new segments queued to it.
+	newSegmentWaker sleep.Waker `state:"manual"`
+
+	// notificationWaker is used to indicate to the protocol goroutine that
+	// it needs to wake up and check for notifications.
+	notificationWaker sleep.Waker `state:"manual"`
+
+	// notifyFlags is a bitmask of flags used to indicate to the protocol
+	// goroutine what it was notified; this is only accessed atomically.
+	notifyFlags uint32 `state:"nosave"`
+
+	// keepalive manages TCP keepalive state. When the connection is idle
+	// (no data sent or received) for keepalive.idle, we start sending
+	// keepalives every keepalive.interval. If we send keepalive.count
+	// without hearing a response, the connection is closed.
+	keepalive keepalive
+
+	// userTimeout if non-zero specifies a user specified timeout for
+	// a connection w/ pending data to send. A connection that has pending
+	// unacked data will be forcibly aborted if the timeout is reached
+	// without any data being acked.
+	userTimeout time.Duration
+
+	// deferAccept if non-zero specifies a user specified time during
+	// which the final ACK of a handshake will be dropped provided the
+	// ACK is a bare ACK and carries no data. If the timeout is crossed then
+	// the bare ACK is accepted and the connection is delivered to the
+	// listener.
+	deferAccept time.Duration
+
+	// pendingAccepted is a synchronization primitive used to track number
+	// of connections that are queued up to be delivered to the accepted
+	// channel. We use this to ensure that all goroutines blocked on writing
+	// to the acceptedChan below terminate before we close acceptedChan.
+	pendingAccepted sync.WaitGroup `state:"nosave"`
+
+	// acceptMu protects acceptedChan.
+	acceptMu sync.Mutex `state:"nosave"`
+
+	// acceptCond is a condition variable that can be used to block on when
+	// acceptedChan is full and an endpoint is ready to be delivered.
+	//
+	// This condition variable is required because just blocking on sending
+	// to acceptedChan does not work in cases where endpoint.Listen is
+	// called twice with different backlog values. In such cases the channel
+	// is closed and a new one created. Any pending goroutines blocking on
+	// the write to the channel will panic.
+	//
+	// We use this condition variable to block/unblock goroutines which
+	// tried to deliver an endpoint but couldn't because accept backlog was
+	// full ( See: endpoint.deliverAccepted ).
+	acceptCond *sync.Cond `state:"nosave"`
+
+	// acceptedChan is used by a listening endpoint protocol goroutine to
+	// send newly accepted connections to the endpoint so that they can be
+	// read by Accept() calls.
+	acceptedChan chan *endpoint `state:".([]*endpoint)"`
+
+	// The following are only used from the protocol goroutine, and
+	// therefore don't need locks to protect them.
+	rcv *receiver `state:"wait"`
+	snd *sender   `state:"wait"`
+
+	// The goroutine drain completion notification channel.
+	drainDone chan struct{} `state:"nosave"`
+
+	// The goroutine undrain notification channel. This is currently used as
+	// a way to block the worker goroutines. Today nothing closes/writes
+	// this channel and this causes any goroutines waiting on this to just
+	// block. This is used during save/restore to prevent worker goroutines
+	// from mutating state as it's being saved.
+	undrain chan struct{} `state:"nosave"`
+
+	// probe if not nil is invoked on every received segment. It is passed
+	// a copy of the current state of the endpoint.
+	probe stack.TCPProbeFunc `state:"nosave"`
+
+	// The following are only used to assist the restore run to re-connect.
+	connectingAddress tcpip.Address
+
+	// amss is the advertised MSS to the peer by this endpoint.
+	amss uint16
+
+	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
+	// applied while sending packets. Defaults to 0 as on Linux.
+	sendTOS uint8
+
+	gso *stack.GSO
+
+	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
+	stats Stats `state:"nosave"`
+
+	// tcpLingerTimeout is the maximum amount of time a socket
+	// stays in TIME_WAIT state before being marked
+	// closed.
+	tcpLingerTimeout time.Duration
+
+	// closed indicates that the user has called Close on the
+	// endpoint and at this point the endpoint is only around
+	// to complete the TCP shutdown.
+	closed bool
+
+	// txHash is the transport layer hash to be set on outbound packets
+	// emitted by this endpoint.
+	txHash uint32
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
+}
+
+// UniqueID implements stack.TransportEndpoint.UniqueID by returning e.uniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID // Assigned from s.UniqueID() in newEndpoint.
+}
+
+// calculateAdvertisedMSS calculates the MSS to advertise.
+//
+// If userMSS is non-zero and smaller than the maximum possible MSS for r, it
+// is returned; otherwise the maximum possible MSS (mssForRoute) is returned.
+func calculateAdvertisedMSS(userMSS uint16, r stack.Route) uint16 {
+	// The maximum possible MSS is dependent on the route.
+	maxMSS := mssForRoute(&r)
+	// A zero userMSS is ignored, and userMSS can only lower the result.
+	if userMSS != 0 && userMSS < maxMSS {
+		return userMSS
+	}
+
+	return maxMSS
+}
+
+// LockUser tries to lock e.mu and if it fails it will check if the lock is held
+// by another syscall goroutine. If yes, then it will go to sleep waiting for the
+// lock to be released, if not then it will spin till it acquires the lock or
+// another syscall goroutine acquires it in which case it will go to sleep as
+// described above.
+//
+// The assumption behind spinning here being that background packet processing
+// should not be holding the lock for long and spinning reduces latency as we
+// avoid an expensive sleep/wakeup of the syscall goroutine.
+func (e *endpoint) LockUser() {
+	for {
+		// Try the lock without blocking first. If that fails, check if
+		// it is owned by another user goroutine: if so we sleep on
+		// Lock(), otherwise we spin.
+		if !e.mu.TryLock() {
+			// If socket is owned by the user then just go to sleep
+			// as the lock could be held for a reasonably long time.
+			if atomic.LoadUint32(&e.ownedByUser) == 1 {
+				e.mu.Lock()
+				atomic.StoreUint32(&e.ownedByUser, 1)
+				return
+			}
+			// Spin but yield the processor since the lower half
+			// should yield the lock soon.
+			runtime.Gosched()
+			continue
+		}
+		atomic.StoreUint32(&e.ownedByUser, 1) // Record that a user goroutine now holds e.mu.
+		return
+	}
+}
+
+// UnlockUser will check if there are any segments already queued for processing
+// and process any such segments before unlocking e.mu. This is required because
+// when packets arrive while the endpoint lock is already held, such packets
+// are queued up to be processed. If the lock is held by the endpoint goroutine
+// then it will process these packets but if the lock is instead held by the
+// syscall goroutine then we can have the syscall goroutine process the backlog
+// before unlocking.
+//
+// This avoids an unnecessary wakeup of the endpoint protocol goroutine for the
+// endpoint. It's also required eventually when we get rid of the endpoint
+// protocol goroutine altogether.
+//
+// Precondition: e.LockUser() must have been called before calling e.UnlockUser()
+func (e *endpoint) UnlockUser() {
+	// Lock segment queue before checking so that we avoid a race where
+	// segments can be queued between the time we check if queue is empty
+	// and actually unlock the endpoint mutex.
+	for {
+		e.segmentQueue.mu.Lock()
+		if e.segmentQueue.emptyLocked() {
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			e.segmentQueue.mu.Unlock()
+			return
+		}
+		e.segmentQueue.mu.Unlock()
+		// The queue is not empty: drain it here or hand it to the worker.
+		switch e.EndpointState() {
+		case StateEstablished: // Process the backlog inline, then loop to re-check the queue.
+			if err := e.handleSegments(true /* fastPath */); err != nil {
+				e.notifyProtocolGoroutine(notifyTickleWorker)
+			}
+		default:
+			// Since we are waking the endpoint goroutine here just unlock
+			// and let it process the queued segments.
+			e.newSegmentWaker.Assert()
+			if atomic.SwapUint32(&e.ownedByUser, 0) != 1 {
+				panic("e.UnlockUser() called without calling e.LockUser()")
+			}
+			e.mu.Unlock()
+			return
+		}
+	}
+}
+
+// StopWork halts packet processing. Only to be used in tests.
+func (e *endpoint) StopWork() {
+	e.mu.Lock() // Left locked on return; ResumeWork releases it.
+}
+
+// ResumeWork resumes packet processing. Only to be used in tests.
+func (e *endpoint) ResumeWork() {
+	e.mu.Unlock() // Counterpart of the e.mu.Lock() in StopWork.
+}
+
+// setEndpointState updates the state of the endpoint to state atomically. This
+// method is unexported as the only place we should update the state is in this
+// package but we allow the state to be read freely without holding e.mu.
+//
+// Precondition: e.mu must be held to call this method.
+func (e *endpoint) setEndpointState(state EndpointState) {
+	oldstate := EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+	switch state {
+	case StateEstablished:
+		e.stack.Stats().TCP.CurrentEstablished.Increment()
+		e.stack.Stats().TCP.CurrentConnected.Increment() // Not decremented anywhere in this method.
+	case StateError:
+		fallthrough // StateError is accounted exactly like StateClose.
+	case StateClose:
+		if oldstate == StateCloseWait || oldstate == StateEstablished {
+			e.stack.Stats().TCP.EstablishedResets.Increment()
+		}
+		fallthrough // Also apply the CurrentEstablished accounting below.
+	default:
+		if oldstate == StateEstablished {
+			e.stack.Stats().TCP.CurrentEstablished.Decrement()
+		}
+	}
+	atomic.StoreUint32((*uint32)(&e.state), uint32(state))
+}
+
+// EndpointState atomically loads and returns the current state of the endpoint.
+func (e *endpoint) EndpointState() EndpointState {
+	return EndpointState(atomic.LoadUint32((*uint32)(&e.state)))
+}
+
+// setRecentTimestamp atomically stores the provided value into e.recentTS,
+// the timestamp to be sent in the TSEcr field of future segments.
+func (e *endpoint) setRecentTimestamp(recentTS uint32) {
+	atomic.StoreUint32(&e.recentTS, recentTS)
+}
+
+// recentTimestamp atomically loads and returns the value of e.recentTS.
+func (e *endpoint) recentTimestamp() uint32 {
+	return atomic.LoadUint32(&e.recentTS)
+}
+
+// keepalive bundles an endpoint's TCP keepalive settings and state with an
+// embedded mutex; it is a separate type to appease stateify (see endpoint).
+//
+// +stateify savable
+type keepalive struct {
+	sync.Mutex `state:"nosave"`
+	enabled    bool
+	idle       time.Duration
+	interval   time.Duration
+	count      int
+	unacked    int
+	timer      timer       `state:"nosave"`
+	waker      sleep.Waker `state:"nosave"`
+}
+
+func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
+	e := &endpoint{
+		stack: s,
+		EndpointInfo: EndpointInfo{
+			TransportEndpointInfo: stack.TransportEndpointInfo{
+				NetProto:   netProto,
+				TransProto: header.TCPProtocolNumber,
+			},
+		},
+		waiterQueue: waiterQueue,
+		state:       StateInitial,
+		rcvBufSize:  DefaultReceiveBufferSize,
+		sndBufSize:  DefaultSendBufferSize,
+		sndMTU:      int(math.MaxInt32),
+		keepalive: keepalive{
+			// Linux defaults.
+			idle:     2 * time.Hour,
+			interval: 75 * time.Second,
+			count:    9,
+		},
+		uniqueID:      s.UniqueID(),
+		txHash:        s.Rand().Uint32(),
+		windowClamp:   DefaultReceiveBufferSize,
+		maxSynRetries: DefaultSynRetries,
+	}
+	// Override the defaults above with stack-wide TCP options that can be read.
+	var ss SendBufferSizeOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
+		e.sndBufSize = ss.Default
+	}
+
+	var rs ReceiveBufferSizeOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+		e.rcvBufSize = rs.Default
+	}
+
+	var cs tcpip.CongestionControlOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
+		e.cc = cs
+	}
+
+	var mrb tcpip.ModerateReceiveBufferOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
+		e.rcvAutoParams.disabled = !bool(mrb)
+	}
+
+	var de DelayEnabled
+	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
+		e.SetSockOptBool(tcpip.DelayOption, true)
+	}
+
+	var tcpLT tcpip.TCPLingerTimeoutOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
+		e.tcpLingerTimeout = time.Duration(tcpLT)
+	}
+
+	var synRetries tcpip.TCPSynRetriesOption
+	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
+		e.maxSynRetries = uint8(synRetries)
+	}
+
+	if p := s.GetTCPProbe(); p != nil {
+		e.probe = p
+	}
+
+	e.segmentQueue.setLimit(MaxUnprocessedSegments)
+	e.tsOffset = timeStampOffset()
+	e.acceptCond = sync.NewCond(&e.acceptMu) // acceptCond uses acceptMu as its lock.
+
+	return e
+}
+
+// Readiness returns the current readiness of the endpoint. For example, if
+// waiter.EventIn is set, the endpoint is immediately readable.
+func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	result := waiter.EventMask(0)
+
+	switch e.EndpointState() {
+	case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
+		// Ready for nothing.
+
+	case StateClose, StateError:
+		// Ready for anything: report every requested event.
+		result = mask
+
+	case StateListen:
+		// Check if there's anything in the accepted channel.
+		if (mask & waiter.EventIn) != 0 {
+			e.acceptMu.Lock()
+			if len(e.acceptedChan) > 0 {
+				result |= waiter.EventIn
+			}
+			e.acceptMu.Unlock()
+		}
+	}
+	if e.EndpointState().connected() { // State is loaded again here, separately from the switch above.
+		// Determine if the endpoint is writable if requested.
+		if (mask & waiter.EventOut) != 0 {
+			e.sndBufMu.Lock()
+			if e.sndClosed || e.sndBufUsed < e.sndBufSize {
+				result |= waiter.EventOut
+			}
+			e.sndBufMu.Unlock()
+		}
+
+		// Determine if the endpoint is readable if requested.
+		if (mask & waiter.EventIn) != 0 {
+			e.rcvListMu.Lock()
+			if e.rcvBufUsed > 0 || e.rcvClosed {
+				result |= waiter.EventIn
+			}
+			e.rcvListMu.Unlock()
+		}
+	}
+
+	return result
+}
+
+func (e *endpoint) fetchNotifications() uint32 {
+	return atomic.SwapUint32(&e.notifyFlags, 0) // Atomically return the pending flags and clear them.
+}
+
+func (e *endpoint) notifyProtocolGoroutine(n uint32) {
+	for {
+		v := atomic.LoadUint32(&e.notifyFlags)
+		if v&n == n {
+			// The flags are already set.
+			return
+		}
+		// OR the new flags in with a CAS; retry if notifyFlags changed meanwhile.
+		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
+			if v == 0 {
+				// We are causing a transition from no flags to
+				// at least one flag set, so we must cause the
+				// protocol goroutine to wake up.
+				e.notificationWaker.Assert()
+			}
+			return
+		}
+	}
+}
+
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	// The abort notification is not processed synchronously, so no
+	// synchronization is needed.
+	//
+	// If the endpoint becomes connected after this check, we still close
+	// the endpoint. This worst case results in a slower abort.
+	//
+	// If the endpoint disconnected after the check, nothing needs to be
+	// done, so sending a notification which will potentially be ignored is
+	// fine.
+	//
+	// If the endpoint connecting finishes after the check, the endpoint
+	// is either in a connected state (where we would notifyAbort anyway),
+	// SYN-RECV (where we would also notifyAbort anyway), or in an error
+	// state where nothing is required and the notification can be safely
+	// ignored.
+	//
+	// Endpoints where a Close during connecting or SYN-RECV state would be
+	// problematic are set to state connecting before being registered (and
+	// thus possible to be Aborted). They are never available in initial
+	// state.
+	//
+	// Endpoints transitioning from initial to connecting state may be
+	// safely either closed or sent notifyAbort.
+	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
+		e.notifyProtocolGoroutine(notifyAbort)
+		return
+	}
+	e.Close() // Any other state: close the endpoint inline.
+}
+
+// Close puts the endpoint in a closed state and frees all resources associated
+// with it. It must be called only once and with no other concurrent calls to
+// the endpoint.
+func (e *endpoint) Close() {
+	e.LockUser()
+	defer e.UnlockUser()
+	if e.closed {
+		return // closeNoShutdownLocked already marked the endpoint closed.
+	}
+
+	// Issue a shutdown so that the peer knows we won't send any more data
+	// if we're connected, or stop accepting if we're listening.
+	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
+	e.closeNoShutdownLocked()
+}
+
+// closeNoShutdownLocked closes the endpoint without doing a full shutdown. This
+// is used when a connection needs to be aborted with a RST and we want to skip
+// a full 4 way TCP shutdown.
+func (e *endpoint) closeNoShutdownLocked() {
+	// For listening sockets, we always release ports inline so that they
+	// are immediately available for reuse after Close() is called. If also
+	// registered, we unregister as well otherwise the next user would fail
+	// in Listen() when trying to register.
+	if e.EndpointState() == StateListen && e.isPortReserved {
+		if e.isRegistered {
+			e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
+			e.isRegistered = false
+		}
+
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, e.boundDest)
+		e.isPortReserved = false
+		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
+		e.boundDest = tcpip.FullAddress{}
+	}
+
+	// Mark endpoint as closed.
+	e.closed = true
+
+	switch e.EndpointState() {
+	case StateClose, StateError:
+		return // Already in a terminal state: no cleanup is scheduled here.
+	}
+
+	// Either perform the local cleanup or kick the worker to make sure it
+	// knows it needs to cleanup.
+	if e.workerRunning {
+		e.workerCleanup = true
+		tcpip.AddDanglingEndpoint(e)
+		// Worker will remove the dangling endpoint when the endpoint
+		// goroutine terminates.
+		e.notifyProtocolGoroutine(notifyClose)
+	} else {
+		e.transitionToStateCloseLocked()
+	}
+}
+
+// closePendingAcceptableConnectionsLocked closes all connections that have
+// completed handshake but not yet been delivered to the application.
+func (e *endpoint) closePendingAcceptableConnectionsLocked() {
+	e.acceptMu.Lock()
+	if e.acceptedChan == nil {
+		e.acceptMu.Unlock()
+		return
+	}
+	close(e.acceptedChan)
+	ch := e.acceptedChan // Keep a local reference so the closed channel can be drained below.
+	e.acceptedChan = nil
+	e.acceptCond.Broadcast() // Wake goroutines blocked on acceptCond.
+	e.acceptMu.Unlock()
+
+	// Reset all connections that are waiting to be accepted.
+	for n := range ch {
+		n.notifyProtocolGoroutine(notifyReset)
+	}
+	// Wait for reset of all endpoints that are still waiting to be delivered to
+	// the now closed acceptedChan.
+	e.pendingAccepted.Wait()
+}
+
+// cleanupLocked frees all resources associated with the endpoint. It is called
+// after Close() is called and the worker goroutine (if any) is done with its
+// work.
+func (e *endpoint) cleanupLocked() {
+	// Close all endpoints that might have been accepted by TCP but not by
+	// the client.
+	e.closePendingAcceptableConnectionsLocked()
+
+	e.workerCleanup = false
+
+	if e.isRegistered {
+		e.stack.StartTransportEndpointCleanup(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
+		e.isRegistered = false
+	}
+
+	if e.isPortReserved {
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, e.boundDest)
+		e.isPortReserved = false
+	}
+	e.boundBindToDevice = 0 // The bound* values are reset whether or not a port was reserved.
+	e.boundPortFlags = ports.Flags{}
+	e.boundDest = tcpip.FullAddress{}
+
+	e.route.Release()
+	e.stack.CompleteTransportEndpointCleanup(e)
+	tcpip.DeleteDanglingEndpoint(e)
+}
+
+// initialReceiveWindow returns the initial receive window to advertise in the
+// SYN/SYN-ACK.
+func (e *endpoint) initialReceiveWindow() int {
+	rcvWnd := e.receiveBufferAvailable()
+	if rcvWnd > math.MaxUint16 {
+		rcvWnd = math.MaxUint16
+	}
+
+	// Use the user supplied MSS, if available.
+	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
+	if rcvWnd > routeWnd {
+		rcvWnd = routeWnd
+	}
+	rcvWndScale := e.rcvWndScaleForHandshake()
+
+	// Round down rcvWnd to a multiple of 2^rcvWndScale. This ensures that the
+	// window offered in SYN won't be reduced due to the loss of precision if
+	// window scaling is enabled after the handshake.
+	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)
+
+	// Ensure we can always accept at least 1 byte if the scale specified
+	// was too high for the provided rcvWnd.
+	if rcvWnd == 0 {
+		rcvWnd = 1
+	}
+
+	return rcvWnd
+}
+
+// ModerateRecvBuf adjusts the receive buffer and the advertised window
+// based on the number of bytes copied to userspace.
+func (e *endpoint) ModerateRecvBuf(copied int) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	e.rcvListMu.Lock()
+	if e.rcvAutoParams.disabled {
+		e.rcvListMu.Unlock()
+		return
+	}
+	now := time.Now()
+	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
+		e.rcvAutoParams.copied += copied // rtt is zero or not yet elapsed: only accumulate.
+		e.rcvListMu.Unlock()
+		return
+	}
+	prevRTTCopied := e.rcvAutoParams.copied + copied
+	prevCopied := e.rcvAutoParams.prevCopied
+	rcvWnd := 0
+	if prevRTTCopied > prevCopied {
+		// The minimal receive window based on what was copied by the app
+		// in the immediate preceding RTT and some extra buffer for 16
+		// segments to account for variations.
+		// We multiply by 2 to account for packet losses.
+		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)
+		// NOTE(review): the division below assumes prevCopied is non-zero - confirm.
+		// Scale for slow start based on bytes copied in this RTT vs previous.
+		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied
+
+		// Multiply growth factor by 2 again to account for sender being
+		// in slow-start where the sender grows its congestion window
+		// by 100% per RTT.
+		rcvWnd += grow * 2
+
+		// Make sure auto tuned buffer size can always receive up to 2x
+		// the initial window of 10 segments.
+		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
+			rcvWnd = minRcvWnd
+		}
+
+		// Cap the auto tuned buffer size by the maximum permissible
+		// receive buffer size.
+		if max := e.maxReceiveBufferSize(); rcvWnd > max {
+			rcvWnd = max
+		}
+
+		// We do not adjust downwards as that can cause the receiver to
+		// reject valid data that might already be in flight as the
+		// acceptable window will shrink.
+		if rcvWnd > e.rcvBufSize {
+			availBefore := e.receiveBufferAvailableLocked()
+			e.rcvBufSize = rcvWnd
+			availAfter := e.receiveBufferAvailableLocked()
+			mask := uint32(notifyReceiveWindowChanged)
+			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
+				mask |= notifyNonZeroReceiveWindow
+			}
+			e.notifyProtocolGoroutine(mask)
+		}
+
+		// We only update prevCopied when we grow the buffer because in cases
+		// where prevCopied > prevRTTCopied the existing buffer is already big
+		// enough to handle the current rate and we don't need to do any
+		// adjustments.
+		e.rcvAutoParams.prevCopied = prevRTTCopied
+	}
+	e.rcvAutoParams.measureTime = now
+	e.rcvAutoParams.copied = 0
+	e.rcvListMu.Unlock()
+}
+
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner // Stored without taking e.mu; owner is used to get uid and gid of the packet.
+}
+
+// Read reads data from the endpoint.
+func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	// When in SYN-SENT state, let the caller block on the receive.
+	// An application can initiate a non-blocking connect and then block
+	// on a receive. It can expect to read any data after the handshake
+	// is complete. RFC793, section 3.9, p58.
+	if e.EndpointState() == StateSynSent {
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
+	}
+
+	// The endpoint can be read if it's connected, or if it's already closed
+	// but has some pending unread data. Also note that a RST being received
+	// would cause the state to become StateError so we should allow the
+	// reads to proceed before returning an ECONNRESET.
+	e.rcvListMu.Lock()
+	bufUsed := e.rcvBufUsed
+	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
+		e.rcvListMu.Unlock()
+		he := e.HardError
+		if s == StateError {
+			return buffer.View{}, tcpip.ControlMessages{}, he
+		}
+		e.stats.ReadErrors.NotConnected.Increment()
+		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrNotConnected
+	}
+
+	v, err := e.readLocked() // Called with e.rcvListMu held.
+	e.rcvListMu.Unlock()
+
+	if err == tcpip.ErrClosedForReceive {
+		e.stats.ReadErrors.ReadClosed.Increment()
+	}
+	return v, tcpip.ControlMessages{}, err
+}
+
+func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
+	if e.rcvBufUsed == 0 {
+		if e.rcvClosed || !e.EndpointState().connected() {
+			return buffer.View{}, tcpip.ErrClosedForReceive
+		}
+		return buffer.View{}, tcpip.ErrWouldBlock
+	}
+	// Hand out the next undelivered view of the segment at the front of rcvList.
+	s := e.rcvList.Front()
+	views := s.data.Views()
+	v := views[s.viewToDeliver]
+	s.viewToDeliver++
+
+	if s.viewToDeliver >= len(views) { // Every view of this segment has been delivered.
+		e.rcvList.Remove(s)
+		s.decRef()
+	}
+
+	e.rcvBufUsed -= len(v)
+
+	// If the window was small before this read and if the read freed up
+	// enough buffer space, to either fit an aMSS or half a receive buffer
+	// (whichever smaller), then notify the protocol goroutine to send a
+	// window update.
+	if crossed, above := e.windowCrossedACKThresholdLocked(len(v)); crossed && above {
+		e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
+	}
+
+	return v, nil
+}
+
+// isEndpointWritableLocked checks if a given endpoint is writable
+// and also returns the number of bytes that can be written at this
+// moment. If the endpoint is not writable then it returns an error
+// indicating the reason why it's not writable.
+// Caller must hold e.mu and e.sndBufMu.
+func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
+	// The endpoint cannot be written to if it's not connected.
+	if !e.EndpointState().connected() {
+		switch e.EndpointState() {
+		case StateError:
+			return 0, e.HardError
+		default:
+			return 0, tcpip.ErrClosedForSend
+		}
+	}
+
+	// Check if the connection has already been closed for sends.
+	if e.sndClosed {
+		return 0, tcpip.ErrClosedForSend
+	}
+
+	avail := e.sndBufSize - e.sndBufUsed
+	if avail <= 0 { // Send buffer is full (or over-committed).
+		return 0, tcpip.ErrWouldBlock
+	}
+	return avail, nil
+}
+
+// Write writes data to the endpoint's peer.
+func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	// Linux completely ignores any address passed to sendto(2) for TCP sockets
+	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
+	// and opts.EndOfRecord are also ignored.
+
+	e.LockUser()
+	e.sndBufMu.Lock()
+
+	avail, err := e.isEndpointWritableLocked()
+	if err != nil {
+		e.sndBufMu.Unlock()
+		e.UnlockUser()
+		e.stats.WriteErrors.WriteClosed.Increment() // NOTE(review): counted for every error, incl. ErrWouldBlock - confirm.
+		return 0, nil, err
+	}
+
+	// We can release locks while copying data.
+	//
+	// This is not possible if atomic is set, because we can't allow the
+	// available buffer space to be consumed by some other caller while we
+	// are copying data in.
+	if !opts.Atomic {
+		e.sndBufMu.Unlock()
+		e.UnlockUser()
+	}
+
+	// Fetch data.
+	v, perr := p.Payload(avail)
+	if perr != nil || len(v) == 0 {
+		// Note that perr may be nil if len(v) == 0.
+		if opts.Atomic {
+			e.sndBufMu.Unlock()
+			e.UnlockUser()
+		}
+		return 0, nil, perr
+	}
+
+	queueAndSend := func() (int64, <-chan struct{}, *tcpip.Error) {
+		// Add data to the send queue.
+		s := newSegmentFromView(&e.route, e.ID, v)
+		e.sndBufUsed += len(v)
+		e.sndBufInQueue += seqnum.Size(len(v))
+		e.sndQueue.PushBack(s)
+		e.sndBufMu.Unlock()
+
+		// Do the work inline.
+		e.handleWrite()
+		e.UnlockUser()
+		return int64(len(v)), nil, nil
+	}
+
+	if opts.Atomic {
+		// Locks released in queueAndSend()
+		return queueAndSend()
+	}
+
+	// Since we released locks in between it's possible that the
+	// endpoint transitioned to a CLOSED/ERROR states so make
+	// sure endpoint is still writable before trying to write.
+	e.LockUser()
+	e.sndBufMu.Lock()
+	avail, err = e.isEndpointWritableLocked()
+	if err != nil {
+		e.sndBufMu.Unlock()
+		e.UnlockUser()
+		e.stats.WriteErrors.WriteClosed.Increment()
+		return 0, nil, err
+	}
+
+	// Discard any excess data copied in due to avail being reduced due
+	// to a simultaneous write call to the socket.
+	if avail < len(v) {
+		v = v[:avail]
+	}
+
+	// Locks released in queueAndSend()
+	return queueAndSend()
+}
+
+// Peek reads data without consuming it from the endpoint.
+//
+// This method does not block if there is no data pending.
+func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	// The endpoint can be read if it's connected, or if it's already closed
+	// but has some pending unread data.
+	if s := e.EndpointState(); !s.connected() && s != StateClose {
+		if s == StateError {
+			return 0, tcpip.ControlMessages{}, e.HardError
+		}
+		e.stats.ReadErrors.InvalidEndpointState.Increment()
+		return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
+	}
+
+	e.rcvListMu.Lock()
+	defer e.rcvListMu.Unlock()
+
+	if e.rcvBufUsed == 0 {
+		if e.rcvClosed || !e.EndpointState().connected() {
+			e.stats.ReadErrors.ReadClosed.Increment()
+			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
+		}
+		return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
+	}
+
+	// Make a copy of vec so we can modify the slice headers.
+	vec = append([][]byte(nil), vec...)
+
+	var num int64
+	for s := e.rcvList.Front(); s != nil; s = s.Next() {
+		views := s.data.Views()
+
+		for i := s.viewToDeliver; i < len(views); i++ { // Skip views already delivered by reads.
+			v := views[i]
+
+			for len(v) > 0 {
+				if len(vec) == 0 {
+					return num, tcpip.ControlMessages{}, nil // Destination is full.
+				}
+				if len(vec[0]) == 0 {
+					vec = vec[1:]
+					continue
+				}
+
+				n := copy(vec[0], v)
+				v = v[n:]
+				vec[0] = vec[0][n:]
+				num += int64(n)
+			}
+		}
+	}
+
+	return num, tcpip.ControlMessages{}, nil
+}
+
+// windowCrossedACKThresholdLocked reports whether the available receive
+// window moved across the ACK threshold as a result of a change of
+// deltaBefore bytes in available space.
+//
+// The threshold is the smaller of aMSS and half the receive buffer size. It
+// serves as receive-side silly window syndrome avoidance: an ACK is worth
+// sending once the window grows past the threshold, while a series of small
+// reads must not trigger a flood of tiny window updates. Half the buffer is
+// an arbitrary choice for receive buffers too small for aMSS to be useful.
+//
+// crossed is true if the previous and current available space lie on
+// opposite sides of the threshold. above is true only when crossed is true
+// and the current available space is >= the threshold.
+//
+// Precondition: e.mu and e.rcvListMu must be held.
+func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) {
+	availNow := e.receiveBufferAvailableLocked()
+	availPrev := availNow - deltaBefore
+	if availPrev < 0 {
+		availPrev = 0
+	}
+
+	// limit = min(aMSS, rcvBufSize/2).
+	limit := e.rcvBufSize / 2
+	if mss := int(e.amss); mss <= limit {
+		limit = mss
+	}
+
+	wasAbove := availPrev >= limit
+	isAbove := availNow >= limit
+	if wasAbove == isAbove {
+		return false, false
+	}
+	return true, isAbove
+}
+
+// SetSockOptBool applies the boolean socket option opt with value v.
+//
+// Options that are not listed in the switch are accepted and ignored.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	switch opt {
+	case tcpip.BroadcastOption:
+		e.LockUser()
+		e.broadcast = v
+		e.UnlockUser()
+
+	case tcpip.CorkOption:
+		e.LockUser()
+		if v {
+			atomic.StoreUint32(&e.cork, 1)
+		} else {
+			atomic.StoreUint32(&e.cork, 0)
+
+			// Uncorked: poke the sender so corked data is handled.
+			e.sndWaker.Assert()
+		}
+		e.UnlockUser()
+
+	case tcpip.DelayOption:
+		if !v {
+			atomic.StoreUint32(&e.delay, 0)
+
+			// Delay turned off: poke the sender so delayed data is
+			// handled.
+			e.sndWaker.Assert()
+		} else {
+			atomic.StoreUint32(&e.delay, 1)
+		}
+
+	case tcpip.KeepaliveEnabledOption:
+		e.keepalive.Lock()
+		e.keepalive.enabled = v
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.QuickAckOption:
+		// The endpoint stores the inverse flag: slowAck is 0 when quick
+		// ACKs are requested and 1 otherwise.
+		if v {
+			atomic.StoreUint32(&e.slowAck, 0)
+		} else {
+			atomic.StoreUint32(&e.slowAck, 1)
+		}
+
+	case tcpip.ReuseAddressOption:
+		e.LockUser()
+		e.portFlags.TupleOnly = v
+		e.UnlockUser()
+
+	case tcpip.ReusePortOption:
+		e.LockUser()
+		e.portFlags.LoadBalanced = v
+		e.UnlockUser()
+
+	case tcpip.V6OnlyOption:
+		// Only meaningful on v6 endpoints, and only settable while the
+		// endpoint is still in its initial state.
+		if e.NetProto != header.IPv6ProtocolNumber || e.EndpointState() != StateInitial {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		e.LockUser()
+		e.v6only = v
+		e.UnlockUser()
+	}
+
+	return nil
+}
+
+// SetSockOptInt sets the integer-valued socket option opt to v.
+//
+// Options not handled by the switch are accepted and ignored (nil is
+// returned). Invalid values are either rejected with
+// tcpip.ErrInvalidOptionValue / tcpip.ErrNotSupported or clamped into range,
+// as described on each case.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
+	const inetECNMask = 3
+
+	switch opt {
+	case tcpip.KeepaliveCountOption:
+		e.keepalive.Lock()
+		e.keepalive.count = v
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.IPv4TOSOption:
+		e.LockUser()
+		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
+		// ignore the bits for now.
+		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
+		e.UnlockUser()
+
+	case tcpip.IPv6TrafficClassOption:
+		// Stored in the same sendTOS field as IPv4TOSOption.
+		e.LockUser()
+		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
+		// ignore the bits for now.
+		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
+		e.UnlockUser()
+
+	case tcpip.MaxSegOption:
+		// Reject values outside [TCPMinimumMSS, TCPMaximumMSS] so the
+		// uint16 conversion below cannot truncate.
+		userMSS := v
+		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
+			return tcpip.ErrInvalidOptionValue
+		}
+		e.LockUser()
+		e.userMSS = uint16(userMSS)
+		e.UnlockUser()
+		e.notifyProtocolGoroutine(notifyMSSChanged)
+
+	case tcpip.MTUDiscoverOption:
+		// Return not supported if attempting to set this option to
+		// anything other than path MTU discovery disabled.
+		if v != tcpip.PMTUDiscoveryDont {
+			return tcpip.ErrNotSupported
+		}
+
+	case tcpip.ReceiveBufferSizeOption:
+		// Make sure the receive buffer size is within the min and max
+		// allowed. If the stack-wide limits cannot be read, v is left
+		// unclamped here.
+		var rs ReceiveBufferSizeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+			if v < rs.Min {
+				v = rs.Min
+			}
+			if v > rs.Max {
+				v = rs.Max
+			}
+		}
+
+		mask := uint32(notifyReceiveWindowChanged)
+
+		e.LockUser()
+		e.rcvListMu.Lock()
+
+		// Make sure the receive buffer size allows us to send a
+		// non-zero window size.
+		scale := uint8(0)
+		if e.rcv != nil {
+			scale = e.rcv.rcvWndScale
+		}
+		if v>>scale == 0 {
+			v = 1 << scale
+		}
+
+		// Make sure 2*size doesn't overflow.
+		if v > math.MaxInt32/2 {
+			v = math.MaxInt32 / 2
+		}
+
+		// Sample available space on both sides of the size change so the
+		// threshold check below sees the delta caused by this call.
+		availBefore := e.receiveBufferAvailableLocked()
+		e.rcvBufSize = v
+		availAfter := e.receiveBufferAvailableLocked()
+
+		// An explicit size turns off receive buffer auto-tuning.
+		e.rcvAutoParams.disabled = true
+
+		// Immediately send an ACK to uncork the sender silly window
+		// syndrome prevention, when our available space grows above aMSS
+		// or half receive buffer, whichever smaller.
+		if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above {
+			mask |= notifyNonZeroReceiveWindow
+		}
+
+		// Notify only after both locks are released.
+		e.rcvListMu.Unlock()
+		e.UnlockUser()
+		e.notifyProtocolGoroutine(mask)
+
+	case tcpip.SendBufferSizeOption:
+		// Make sure the send buffer size is within the min and max
+		// allowed. If the stack-wide limits cannot be read, v is stored
+		// as given.
+		var ss SendBufferSizeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
+			if v < ss.Min {
+				v = ss.Min
+			}
+			if v > ss.Max {
+				v = ss.Max
+			}
+		}
+
+		e.sndBufMu.Lock()
+		e.sndBufSize = v
+		e.sndBufMu.Unlock()
+
+	case tcpip.TTLOption:
+		// NOTE(review): v is converted to uint8 without a range check,
+		// so values outside 0-255 wrap. Confirm callers validate.
+		e.LockUser()
+		e.ttl = uint8(v)
+		e.UnlockUser()
+
+	case tcpip.TCPSynCountOption:
+		// Must fit in the uint8 maxSynRetries field and be at least 1.
+		if v < 1 || v > 255 {
+			return tcpip.ErrInvalidOptionValue
+		}
+		e.LockUser()
+		e.maxSynRetries = uint8(v)
+		e.UnlockUser()
+
+	case tcpip.TCPWindowClampOption:
+		// A zero clamp is only accepted while the endpoint is in the
+		// closed or initial state.
+		if v == 0 {
+			e.LockUser()
+			switch e.EndpointState() {
+			case StateClose, StateInitial:
+				e.windowClamp = 0
+				e.UnlockUser()
+				return nil
+			default:
+				e.UnlockUser()
+				return tcpip.ErrInvalidOptionValue
+			}
+		}
+		// Non-zero clamps are raised to at least half the minimum
+		// receive buffer size when that limit can be read.
+		// NOTE(review): if the lookup fails, a negative v is converted
+		// to uint32 below without clamping.
+		var rs ReceiveBufferSizeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+			if v < rs.Min/2 {
+				v = rs.Min / 2
+			}
+		}
+		e.LockUser()
+		e.windowClamp = uint32(v)
+		e.UnlockUser()
+	}
+	return nil
+}
+
+// SetSockOpt sets a socket option whose value is carried by the dynamic type
+// of opt.
+//
+// Option types not handled by the switch are accepted and ignored (nil is
+// returned). tcpip.ErrUnknownDevice is returned when binding to a NIC the
+// stack does not have, and tcpip.ErrNoSuchFile when the requested congestion
+// control algorithm is not among those the stack reports as available.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	switch v := opt.(type) {
+	case tcpip.BindToDeviceOption:
+		// A zero NIC ID clears the binding; any other ID must exist.
+		id := tcpip.NICID(v)
+		if id != 0 && !e.stack.HasNIC(id) {
+			return tcpip.ErrUnknownDevice
+		}
+		e.LockUser()
+		e.bindToDevice = id
+		e.UnlockUser()
+
+	case tcpip.KeepaliveIdleOption:
+		e.keepalive.Lock()
+		e.keepalive.idle = time.Duration(v)
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.KeepaliveIntervalOption:
+		e.keepalive.Lock()
+		e.keepalive.interval = time.Duration(v)
+		e.keepalive.Unlock()
+		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
+
+	case tcpip.OutOfBandInlineOption:
+		// We don't currently support disabling this option.
+
+	case tcpip.TCPUserTimeoutOption:
+		e.LockUser()
+		e.userTimeout = time.Duration(v)
+		e.UnlockUser()
+
+	case tcpip.CongestionControlOption:
+		// Query the available cc algorithms in the stack and
+		// validate that the specified algorithm is actually
+		// supported in the stack.
+		var avail tcpip.AvailableCongestionControlOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
+			return err
+		}
+		availCC := strings.Split(string(avail), " ")
+		for _, cc := range availCC {
+			if v == tcpip.CongestionControlOption(cc) {
+				e.LockUser()
+				state := e.EndpointState()
+				e.cc = v
+				// On an established connection the sender's
+				// congestion control is re-initialized right away.
+				switch state {
+				case StateEstablished:
+					if e.EndpointState() == state {
+						e.snd.cc = e.snd.initCongestionControl(e.cc)
+					}
+				}
+				e.UnlockUser()
+				return nil
+			}
+		}
+
+		// Linux returns ENOENT when an invalid congestion
+		// control algorithm is specified.
+		return tcpip.ErrNoSuchFile
+
+	case tcpip.TCPLingerTimeoutOption:
+		e.LockUser()
+		if v < 0 {
+			// Same as effectively disabling TCPLinger timeout.
+			v = 0
+		}
+		// Cap the value to the stack-wide TCPLinger timeout. When the
+		// stack config cannot be retrieved, fall back to
+		// DefaultTCPLingerTimeout as the cap. The fallback must be
+		// unconditional: leaving the cap at its zero value would clamp
+		// every positive timeout at or below the default down to zero.
+		limit := tcpip.TCPLingerTimeoutOption(DefaultTCPLingerTimeout)
+		var stkTCPLingerTimeout tcpip.TCPLingerTimeoutOption
+		if err := e.stack.TransportProtocolOption(header.TCPProtocolNumber, &stkTCPLingerTimeout); err == nil {
+			limit = stkTCPLingerTimeout
+		}
+		if v > limit {
+			v = limit
+		}
+		e.tcpLingerTimeout = time.Duration(v)
+		e.UnlockUser()
+
+	case tcpip.TCPDeferAcceptOption:
+		// The defer-accept period is capped at MaxRTO.
+		e.LockUser()
+		if time.Duration(v) > MaxRTO {
+			v = tcpip.TCPDeferAcceptOption(MaxRTO)
+		}
+		e.deferAccept = time.Duration(v)
+		e.UnlockUser()
+
+	default:
+		return nil
+	}
+	return nil
+}
+
+// readyReceiveSize reports how many bytes are buffered and ready to be
+// received. It fails with tcpip.ErrInvalidEndpointState on a listening
+// endpoint.
+func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	// A listening endpoint has no receive queue to report on.
+	if e.EndpointState() == StateListen {
+		return 0, tcpip.ErrInvalidEndpointState
+	}
+
+	e.rcvListMu.Lock()
+	used := e.rcvBufUsed
+	e.rcvListMu.Unlock()
+
+	return used, nil
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool.
+//
+// It returns the current value of the boolean option opt, or
+// tcpip.ErrUnknownProtocolOption for options it does not recognize.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	switch opt {
+	case tcpip.BroadcastOption:
+		e.LockUser()
+		defer e.UnlockUser()
+		return e.broadcast, nil
+
+	case tcpip.CorkOption:
+		corked := atomic.LoadUint32(&e.cork) != 0
+		return corked, nil
+
+	case tcpip.DelayOption:
+		delayed := atomic.LoadUint32(&e.delay) != 0
+		return delayed, nil
+
+	case tcpip.KeepaliveEnabledOption:
+		e.keepalive.Lock()
+		defer e.keepalive.Unlock()
+		return e.keepalive.enabled, nil
+
+	case tcpip.QuickAckOption:
+		// Quick ACK is on exactly when slowAck is clear.
+		return atomic.LoadUint32(&e.slowAck) == 0, nil
+
+	case tcpip.ReuseAddressOption:
+		e.LockUser()
+		defer e.UnlockUser()
+		return e.portFlags.TupleOnly, nil
+
+	case tcpip.ReusePortOption:
+		e.LockUser()
+		defer e.UnlockUser()
+		return e.portFlags.LoadBalanced, nil
+
+	case tcpip.V6OnlyOption:
+		// Only v6 endpoints know about this option.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrUnknownProtocolOption
+		}
+
+		e.LockUser()
+		defer e.UnlockUser()
+		return e.v6only, nil
+
+	case tcpip.MulticastLoopOption:
+		return true, nil
+
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
+//
+// It returns the current value of the integer option opt, or -1 together with
+// tcpip.ErrUnknownProtocolOption for options it does not recognize.
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+	switch opt {
+	case tcpip.KeepaliveCountOption:
+		e.keepalive.Lock()
+		defer e.keepalive.Unlock()
+		return e.keepalive.count, nil
+
+	case tcpip.IPv4TOSOption, tcpip.IPv6TrafficClassOption:
+		// Both options are backed by the same sendTOS field.
+		e.LockUser()
+		defer e.UnlockUser()
+		return int(e.sendTOS), nil
+
+	case tcpip.MaxSegOption:
+		// Stubbed out: Linux never reports the user_mss value, it
+		// reports either the default MSS or the actual current MSS.
+		// Netstack always reports the default MSS for now.
+		return header.TCPDefaultMSS, nil
+
+	case tcpip.MTUDiscoverOption:
+		// Path MTU discovery disabled is the only supported setting, so
+		// it is always the answer.
+		return tcpip.PMTUDiscoveryDont, nil
+
+	case tcpip.ReceiveQueueSizeOption:
+		return e.readyReceiveSize()
+
+	case tcpip.SendBufferSizeOption:
+		e.sndBufMu.Lock()
+		defer e.sndBufMu.Unlock()
+		return e.sndBufSize, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		e.rcvListMu.Lock()
+		defer e.rcvListMu.Unlock()
+		return e.rcvBufSize, nil
+
+	case tcpip.TTLOption:
+		e.LockUser()
+		defer e.UnlockUser()
+		return int(e.ttl), nil
+
+	case tcpip.TCPSynCountOption:
+		e.LockUser()
+		defer e.UnlockUser()
+		return int(e.maxSynRetries), nil
+
+	case tcpip.TCPWindowClampOption:
+		e.LockUser()
+		defer e.UnlockUser()
+		return int(e.windowClamp), nil
+
+	case tcpip.MulticastTTLOption:
+		return 1, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
+//
+// For pointer option types the current value is written through opt. For
+// tcpip.ErrorOption the endpoint's last error is returned and cleared.
+// Unrecognized option types yield tcpip.ErrUnknownProtocolOption.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	switch out := opt.(type) {
+	case tcpip.ErrorOption:
+		// Reading the pending error also consumes it.
+		e.lastErrorMu.Lock()
+		defer e.lastErrorMu.Unlock()
+		pending := e.lastError
+		e.lastError = nil
+		return pending
+
+	case *tcpip.BindToDeviceOption:
+		e.LockUser()
+		*out = tcpip.BindToDeviceOption(e.bindToDevice)
+		e.UnlockUser()
+
+	case *tcpip.TCPInfoOption:
+		*out = tcpip.TCPInfoOption{}
+
+		// Grab the sender under the endpoint lock, then read its RTT
+		// estimates under the sender's own rtt lock.
+		e.LockUser()
+		sender := e.snd
+		e.UnlockUser()
+		if sender == nil {
+			break
+		}
+		sender.rtt.Lock()
+		out.RTT = sender.rtt.srtt
+		out.RTTVar = sender.rtt.rttvar
+		sender.rtt.Unlock()
+
+	case *tcpip.KeepaliveIdleOption:
+		e.keepalive.Lock()
+		*out = tcpip.KeepaliveIdleOption(e.keepalive.idle)
+		e.keepalive.Unlock()
+
+	case *tcpip.KeepaliveIntervalOption:
+		e.keepalive.Lock()
+		*out = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
+		e.keepalive.Unlock()
+
+	case *tcpip.TCPUserTimeoutOption:
+		e.LockUser()
+		*out = tcpip.TCPUserTimeoutOption(e.userTimeout)
+		e.UnlockUser()
+
+	case *tcpip.OutOfBandInlineOption:
+		// Disabling this option is not supported, so it always reads
+		// back as set.
+		*out = 1
+
+	case *tcpip.CongestionControlOption:
+		e.LockUser()
+		*out = e.cc
+		e.UnlockUser()
+
+	case *tcpip.TCPLingerTimeoutOption:
+		e.LockUser()
+		*out = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout)
+		e.UnlockUser()
+
+	case *tcpip.TCPDeferAcceptOption:
+		e.LockUser()
+		*out = tcpip.TCPDeferAcceptOption(e.deferAccept)
+		e.UnlockUser()
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+	return nil
+}
+
+// checkV4MappedLocked resolves addr to its canonical form and reports the
+// effective network protocol to use for it, taking the endpoint's v6only
+// setting into account. On failure the zero address and protocol are
+// returned alongside the error.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	canonical, proto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
+	if err == nil {
+		return canonical, proto, nil
+	}
+	return tcpip.FullAddress{}, 0, err
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect. This endpoint does not
+// support it and always returns tcpip.ErrNotSupported.
+func (*endpoint) Disconnect() *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Connect starts connecting the endpoint to the peer at addr, performing a
+// handshake and running the protocol goroutine. Failures are counted in both
+// the stack-wide and per-endpoint FailedConnectionAttempts stats unless the
+// error asks for stats to be ignored.
+func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	err := e.connect(addr, true /* handshake */, true /* run */)
+	if err == nil || err.IgnoreStats() {
+		return err
+	}
+
+	e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
+	e.stats.FailedConnectionAttempts.Increment()
+	return err
+}
+
+// connect connects the endpoint to its peer. In the normal non-S/R case, the
+// new connection is expected to run the main goroutine and perform handshake.
+// In restore of previously connected endpoints, both ends will be passively
+// created (so no new handshaking is done); for stack-accepted connections not
+// yet accepted by the app, they are restored without running the main goroutine
+// here.
+//
+// handshake selects whether a handshake is performed; when false the endpoint
+// is moved straight to StateEstablished. run selects whether the protocol main
+// loop goroutine is started.
+//
+// On the path that sets up a new connection the function returns
+// tcpip.ErrConnectStarted rather than nil. nil is returned only when the
+// endpoint is already connected and the caller has not yet been notified.
+func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	// Remember the address as given by the caller, before it is converted
+	// to canonical form below.
+	connectingAddr := addr.Addr
+
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
+	if e.EndpointState().connected() {
+		// The endpoint is already connected. If caller hasn't been
+		// notified yet, return success.
+		if !e.isConnectNotified {
+			e.isConnectNotified = true
+			return nil
+		}
+		// Otherwise return that it's already connected.
+		return tcpip.ErrAlreadyConnected
+	}
+
+	nicID := addr.NIC
+	switch e.EndpointState() {
+	case StateBound:
+		// If we're already bound to a NIC but the caller is requesting
+		// that we use a different one now, we cannot proceed.
+		if e.boundNICID == 0 {
+			break
+		}
+
+		if nicID != 0 && nicID != e.boundNICID {
+			return tcpip.ErrNoRoute
+		}
+
+		nicID = e.boundNICID
+
+	case StateInitial:
+		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
+		// when we find a route.
+
+	case StateConnecting, StateSynSent, StateSynRecv:
+		// A connection request has already been issued but hasn't completed
+		// yet.
+		return tcpip.ErrAlreadyConnecting
+
+	case StateError:
+		return e.HardError
+
+	default:
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	// Find a route to the desired destination.
+	r, err := e.stack.FindRoute(nicID, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
+	if err != nil {
+		return err
+	}
+	// The endpoint keeps its own clone of the route (see e.route below), so
+	// this reference is released on every return path.
+	defer r.Release()
+
+	// NOTE(review): e.ID is updated here, before registration is attempted;
+	// the error returns below do not restore the previous values.
+	netProtos := []tcpip.NetworkProtocolNumber{netProto}
+	e.ID.LocalAddress = r.LocalAddress
+	e.ID.RemoteAddress = r.RemoteAddress
+	e.ID.RemotePort = addr.Port
+
+	if e.ID.LocalPort != 0 {
+		// The endpoint is bound to a port, attempt to register it.
+		err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
+		if err != nil {
+			return err
+		}
+	} else {
+		// The endpoint doesn't have a local port yet, so try to get
+		// one. Make sure that it isn't one that will result in the same
+		// address/port for both local and remote (otherwise this
+		// endpoint would be trying to connect to itself).
+		sameAddr := e.ID.LocalAddress == e.ID.RemoteAddress
+
+		// Calculate a port offset based on the destination IP/port and
+		// src IP to ensure that for a given tuple (srcIP, destIP,
+		// destPort) the offset used as a starting point is the same to
+		// ensure that we can cycle through the port space effectively.
+		h := jenkins.Sum32(e.stack.Seed())
+		h.Write([]byte(e.ID.LocalAddress))
+		h.Write([]byte(e.ID.RemoteAddress))
+		portBuf := make([]byte, 2)
+		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)
+		h.Write(portBuf)
+		portOffset := h.Sum32()
+
+		// The callback is invoked with candidate ports. Returning
+		// (false, nil) rejects the candidate without aborting;
+		// presumably the picker then moves on to another port.
+		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) {
+			if sameAddr && p == e.ID.RemotePort {
+				return false, nil
+			}
+			// Any reservation failure is treated as "port unavailable".
+			if _, err := e.stack.ReservePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr); err != nil {
+				return false, nil
+			}
+
+			id := e.ID
+			id.LocalPort = p
+			if err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.portFlags, e.bindToDevice); err != nil {
+				// Undo the reservation made above. Only ErrPortInUse
+				// is treated as a rejected candidate; other errors
+				// are returned from the callback.
+				e.stack.ReleasePort(netProtos, ProtocolNumber, e.ID.LocalAddress, p, e.portFlags, e.bindToDevice, addr)
+				if err == tcpip.ErrPortInUse {
+					return false, nil
+				}
+				return false, err
+			}
+
+			// Port picking successful. Save the details of
+			// the selected port.
+			e.ID = id
+			e.isPortReserved = true
+			e.boundBindToDevice = e.bindToDevice
+			e.boundPortFlags = e.portFlags
+			e.boundDest = addr
+			return true, nil
+		}); err != nil {
+			return err
+		}
+	}
+
+	e.isRegistered = true
+	e.setEndpointState(StateConnecting)
+	e.route = r.Clone()
+	e.boundNICID = nicID
+	e.effectiveNetProtos = netProtos
+	e.connectingAddress = connectingAddr
+
+	e.initGSO()
+
+	// Connect in the restore phase does not perform handshake. Restore its
+	// connection setting here.
+	if !handshake {
+		e.segmentQueue.mu.Lock()
+		// Re-point every queued segment at the new endpoint ID and a
+		// fresh clone of the route; the send waker is asserted for each
+		// segment visited.
+		for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
+			for s := l.Front(); s != nil; s = s.Next() {
+				s.id = e.ID
+				s.route = r.Clone()
+				e.sndWaker.Assert()
+			}
+		}
+		e.segmentQueue.mu.Unlock()
+		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
+		e.setEndpointState(StateEstablished)
+	}
+
+	if run {
+		e.workerRunning = true
+		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
+		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
+	}
+
+	return tcpip.ErrConnectStarted
+}
+
+// ConnectEndpoint is not supported by this endpoint; it always fails with
+// tcpip.ErrInvalidEndpointState.
+func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// Shutdown closes the read side, the write side, or both sides of the
+// endpoint's connection to its peer, as selected by flags. It takes the
+// endpoint's user lock and delegates to shutdownLocked.
+func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	return e.shutdownLocked(flags)
+}
+
+// shutdownLocked implements Shutdown. flags are OR-ed into e.shutdownFlags,
+// so the decisions below are based on every direction shut down so far, not
+// only on this call's flags.
+//
+// It returns tcpip.ErrNotConnected when the endpoint is neither connected nor
+// listening, and also when the write side is already closed and the endpoint
+// is in StateTimeWait.
+//
+// Shutdown calls this with the endpoint's user lock held.
+func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) *tcpip.Error {
+	e.shutdownFlags |= flags
+	switch {
+	case e.EndpointState().connected():
+		// Close for read.
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
+			// Mark read side as closed.
+			e.rcvListMu.Lock()
+			e.rcvClosed = true
+			rcvBufUsed := e.rcvBufUsed
+			e.rcvListMu.Unlock()
+
+			// If we're fully closed and we have unread data we need to abort
+			// the connection with a RST.
+			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
+				e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+				// Wake up worker to terminate loop.
+				e.notifyProtocolGoroutine(notifyTickleWorker)
+				return nil
+			}
+		}
+
+		// Close for write.
+		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
+			e.sndBufMu.Lock()
+			if e.sndClosed {
+				// Already closed.
+				e.sndBufMu.Unlock()
+				if e.EndpointState() == StateTimeWait {
+					return tcpip.ErrNotConnected
+				}
+				return nil
+			}
+
+			// Queue fin segment. It carries no payload (nil view) and
+			// is counted as one unit in sndBufInQueue.
+			s := newSegmentFromView(&e.route, e.ID, nil)
+			e.sndQueue.PushBack(s)
+			e.sndBufInQueue++
+			// Mark endpoint as closed.
+			e.sndClosed = true
+			e.sndBufMu.Unlock()
+			e.handleClose()
+		}
+
+		return nil
+	case e.EndpointState() == StateListen:
+		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
+			// Reset all connections from the accept queue and keep the
+			// worker running so that it can continue handling incoming
+			// segments by replying with RST.
+			//
+			// By not removing this endpoint from the demuxer mapping, we
+			// ensure that any other bind to the same port fails, as on Linux.
+			e.rcvListMu.Lock()
+			e.rcvClosed = true
+			e.rcvListMu.Unlock()
+			e.closePendingAcceptableConnectionsLocked()
+			// Notify waiters that the endpoint is shutdown.
+			e.waiterQueue.Notify(waiter.EventIn | waiter.EventOut | waiter.EventHUp | waiter.EventErr)
+		}
+		return nil
+	default:
+		return tcpip.ErrNotConnected
+	}
+}
+
+// Listen switches the endpoint into "listen" mode so that it can accept new
+// connections, using backlog as the size of the accept queue. Failures are
+// counted in both the stack-wide and per-endpoint FailedConnectionAttempts
+// stats unless the error asks for stats to be ignored.
+func (e *endpoint) Listen(backlog int) *tcpip.Error {
+	err := e.listen(backlog)
+	if err == nil || err.IgnoreStats() {
+		return err
+	}
+
+	e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
+	e.stats.FailedConnectionAttempts.Increment()
+	return err
+}
+
+// listen implements Listen. It handles three situations:
+//
+//   - The endpoint is already listening and not closed: the accept queue is
+//     recreated (after a shutdown) or resized to backlog.
+//   - The endpoint is in its initial state: it is first bound to the zero
+//     tcpip.FullAddress.
+//   - The endpoint is bound: it is registered with the stack, moved to
+//     StateListen, and the listen loop goroutine is started.
+//
+// Any other state yields tcpip.ErrInvalidEndpointState.
+func (e *endpoint) listen(backlog int) *tcpip.Error {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	if e.EndpointState() == StateListen && !e.closed {
+		e.acceptMu.Lock()
+		defer e.acceptMu.Unlock()
+		if e.acceptedChan == nil {
+			// listen is called after shutdown.
+			e.acceptedChan = make(chan *endpoint, backlog)
+			e.shutdownFlags = 0
+			e.rcvListMu.Lock()
+			e.rcvClosed = false
+			e.rcvListMu.Unlock()
+		} else {
+			// Adjust the size of the channel iff we can fit
+			// existing pending connections into the new one.
+			if len(e.acceptedChan) > backlog {
+				return tcpip.ErrInvalidEndpointState
+			}
+			if cap(e.acceptedChan) == backlog {
+				return nil
+			}
+			// Closing the old channel lets the range loop below drain
+			// the endpoints still buffered in it and then terminate.
+			origChan := e.acceptedChan
+			e.acceptedChan = make(chan *endpoint, backlog)
+			close(origChan)
+			for ep := range origChan {
+				e.acceptedChan <- ep
+			}
+		}
+
+		// Notify any blocked goroutines that they can attempt to
+		// deliver endpoints again.
+		e.acceptCond.Broadcast()
+
+		return nil
+	}
+
+	if e.EndpointState() == StateInitial {
+		// The listen is called on an unbound socket, the socket is
+		// automatically bound to a random free port with the local
+		// address set to INADDR_ANY.
+		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
+			return err
+		}
+	}
+
+	// Endpoint must be bound before it can transition to listen mode.
+	if e.EndpointState() != StateBound {
+		e.stats.ReadErrors.InvalidEndpointState.Increment()
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	// Register the endpoint.
+	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
+		return err
+	}
+
+	e.isRegistered = true
+	e.setEndpointState(StateListen)
+
+	// The channel may be non-nil when we're restoring the endpoint, and it
+	// may be pre-populated with some previously accepted (but not Accepted)
+	// endpoints.
+	e.acceptMu.Lock()
+	if e.acceptedChan == nil {
+		e.acceptedChan = make(chan *endpoint, backlog)
+	}
+	e.acceptMu.Unlock()
+
+	// The listen loop is handed the receive buffer space available at this
+	// moment.
+	e.workerRunning = true
+	go e.protocolListenLoop( // S/R-SAFE: drained on save.
+		seqnum.Size(e.receiveBufferAvailable()))
+	return nil
+}
+
+// startAcceptedLoop marks the worker as running, releases e.mu, and starts
+// the main protocol loop goroutine for an accepted connection. It returns
+// only after a receive on the channel handed to the loop completes.
+//
+// NOTE(review): e.mu is unlocked here, so callers presumably enter with it
+// held.
+func (e *endpoint) startAcceptedLoop() {
+	e.workerRunning = true
+	e.mu.Unlock()
+
+	initDone := make(chan struct{})
+	go e.protocolMainLoop(false /* handshake */, initDone) // S/R-SAFE: drained on save.
+	<-initDone
+}
+
+// Accept returns a new endpoint if a peer has established a connection
+// to an endpoint previously set to listen mode.
+func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	e.rcvListMu.Lock()
+	rcvClosed := e.rcvClosed
+	e.rcvListMu.Unlock()
+	// Endpoint must be in listen state before it can accept connections.
+	if rcvClosed || e.EndpointState() != StateListen {
+		return nil, nil, tcpip.ErrInvalidEndpointState
+	}
+
+	// Get the new accepted endpoint.
+	e.acceptMu.Lock()
+	defer e.acceptMu.Unlock()
+	var n *endpoint
+	select {
+	case n = <-e.acceptedChan:
+		e.acceptCond.Signal()
+	default:
+		return nil, nil, tcpip.ErrWouldBlock
+	}
+	return n, n.waiterQueue, nil
+}
+
+// Bind binds the endpoint to the local port, and optionally the local
+// address, given in addr. It takes the endpoint's user lock and delegates to
+// bindLocked.
+func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	err = e.bindLocked(addr)
+	return err
+}
+
+// bindLocked implements Bind. It reserves a port with the stack, validates
+// the local address (if one was given), checks that the endpoint could be
+// registered, and then moves the endpoint to StateBound.
+//
+// err is a named result on purpose: the deferred cleanup below inspects it to
+// decide whether the port reservation must be rolled back.
+//
+// Bind calls this with the endpoint's user lock held.
+func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err *tcpip.Error) {
+	// Don't allow binding once endpoint is not in the initial state
+	// anymore. This is because once the endpoint goes into a connected or
+	// listen state, it is already bound.
+	if e.EndpointState() != StateInitial {
+		return tcpip.ErrAlreadyBound
+	}
+
+	// NOTE(review): BindAddr is recorded before validation and is not reset
+	// by the cleanup below if binding fails.
+	e.BindAddr = addr.Addr
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
+	// Expand netProtos to include v4 and v6 if the caller is binding to a
+	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
+	// set to false.
+	netProtos := []tcpip.NetworkProtocolNumber{netProto}
+	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
+		netProtos = []tcpip.NetworkProtocolNumber{
+			header.IPv6ProtocolNumber,
+			header.IPv4ProtocolNumber,
+		}
+	}
+
+	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.portFlags, e.bindToDevice, tcpip.FullAddress{})
+	if err != nil {
+		return err
+	}
+
+	e.boundBindToDevice = e.bindToDevice
+	e.boundPortFlags = e.portFlags
+	e.isPortReserved = true
+	e.effectiveNetProtos = netProtos
+	e.ID.LocalPort = port
+
+	// Any failures beyond this point must remove the port registration.
+	// The flags and device are passed as arguments so the release uses the
+	// values in effect when the port was reserved.
+	defer func(portFlags ports.Flags, bindToDevice tcpip.NICID) {
+		if err != nil {
+			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, portFlags, bindToDevice, tcpip.FullAddress{})
+			e.isPortReserved = false
+			e.effectiveNetProtos = nil
+			e.ID.LocalPort = 0
+			e.ID.LocalAddress = ""
+			e.boundNICID = 0
+			e.boundBindToDevice = 0
+			e.boundPortFlags = ports.Flags{}
+		}
+	}(e.boundPortFlags, e.boundBindToDevice)
+
+	// If an address is specified, we must ensure that it's one of our
+	// local addresses.
+	if len(addr.Addr) != 0 {
+		nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
+		if nic == 0 {
+			return tcpip.ErrBadLocalAddress
+		}
+
+		e.boundNICID = nic
+		e.ID.LocalAddress = addr.Addr
+	}
+
+	// The inner err shadows the named result, but returning it still
+	// assigns the named result, so the deferred cleanup runs.
+	if err := e.stack.CheckRegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e.boundPortFlags, e.boundBindToDevice); err != nil {
+		return err
+	}
+
+	// Mark endpoint as bound.
+	e.setEndpointState(StateBound)
+
+	return nil
+}
+
+// GetLocalAddress returns the address to which the endpoint is bound.
+func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	return tcpip.FullAddress{
+		Addr: e.ID.LocalAddress,
+		Port: e.ID.LocalPort,
+		NIC:  e.boundNICID,
+	}, nil
+}
+
+// GetRemoteAddress returns the address to which the endpoint is connected.
+func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	e.LockUser()
+	defer e.UnlockUser()
+
+	if !e.EndpointState().connected() {
+		return tcpip.FullAddress{}, tcpip.ErrNotConnected
+	}
+
+	return tcpip.FullAddress{
+		Addr: e.ID.RemoteAddress,
+		Port: e.ID.RemotePort,
+		NIC:  e.boundNICID,
+	}, nil
+}
+
+// HandlePacket is intentionally a no-op for TCP endpoints.
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
+	// TCP HandlePacket is not required anymore as inbound packets first
+	// land at the Dispatcher, which can then either deliver them using the
+	// worker goroutine or invoke the TCP processing inline, based on the
+	// state of the endpoint.
+}
+
+// enqueueSegment queues s for the worker goroutine. It reports whether the
+// segment was queued; when the queue refuses it, the segment is dropped and
+// the stack-wide and per-endpoint drop counters are incremented.
+func (e *endpoint) enqueueSegment(s *segment) bool {
+	if e.segmentQueue.enqueue(s) {
+		return true
+	}
+
+	// The queue is full, so the segment is dropped.
+	e.stack.Stats().DroppedPackets.Increment()
+	e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
+	return false
+}
+
+// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
+//
+// Only stack.ControlPacketTooBig is acted upon: the event is counted, the
+// send MTU is lowered to extra if that is smaller than the current value, and
+// the protocol goroutine is told that the MTU changed. Every other control
+// type is ignored.
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	if typ != stack.ControlPacketTooBig {
+		return
+	}
+
+	reported := int(extra)
+
+	e.sndBufMu.Lock()
+	e.packetTooBigCount++
+	if reported < e.sndMTU {
+		e.sndMTU = reported
+	}
+	e.sndBufMu.Unlock()
+
+	e.notifyProtocolGoroutine(notifyMTUChanged)
+}
+
+// updateSndBufferUsage is called by the protocol goroutine when v bytes of
+// send buffer space have been freed.
+//
+// Writers are woken only when usage drops from at-or-above half of the send
+// buffer size to below half. This avoids waking writers just to queue one or
+// two segments before they have to go back to sleep.
+func (e *endpoint) updateSndBufferUsage(v int) {
+	e.sndBufMu.Lock()
+	half := e.sndBufSize >> 1
+	wasAtLeastHalf := e.sndBufUsed >= half
+	e.sndBufUsed -= v
+	wake := wasAtLeastHalf && e.sndBufUsed < half
+	e.sndBufMu.Unlock()
+
+	if !wake {
+		return
+	}
+	e.waiterQueue.Notify(waiter.EventOut)
+}
+
+// readyToRead is invoked by the protocol goroutine either to hand over a
+// segment that is ready to be read, or with a nil s to mark the connection as
+// closed for receiving. Readers are notified in both cases.
+func (e *endpoint) readyToRead(s *segment) {
+	e.rcvListMu.Lock()
+	if s == nil {
+		e.rcvClosed = true
+	} else {
+		s.incRef()
+		e.rcvBufUsed += s.data.Size()
+		// Bump the counter when the receive window drops below the
+		// smaller of the MSS and half the receive buffer size.
+		if crossed, above := e.windowCrossedACKThresholdLocked(-s.data.Size()); crossed && !above {
+			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
+		}
+		e.rcvList.PushBack(s)
+	}
+	e.rcvListMu.Unlock()
+	e.waiterQueue.Notify(waiter.EventIn)
+}
+
+// receiveBufferAvailableLocked reports the number of free bytes left in the
+// receive buffer, never less than zero.
+//
+// Precondition: rcvListMu must be held.
+func (e *endpoint) receiveBufferAvailableLocked() int {
+	free := e.rcvBufSize - e.rcvBufUsed
+	if free < 0 {
+		// Usage can exceed the size after the receive buffer shrinks.
+		return 0
+	}
+	return free
+}
+
+// receiveBufferAvailable takes rcvListMu and reports how many bytes of the
+// receive buffer are still free.
+func (e *endpoint) receiveBufferAvailable() int {
+	e.rcvListMu.Lock()
+	defer e.rcvListMu.Unlock()
+
+	return e.receiveBufferAvailableLocked()
+}
+
+// receiveBufferSize returns the current rcvBufSize, read under rcvListMu.
+func (e *endpoint) receiveBufferSize() int {
+	e.rcvListMu.Lock()
+	defer e.rcvListMu.Unlock()
+
+	return e.rcvBufSize
+}
+
+// maxReceiveBufferSize returns the stack's receive buffer Max, else MaxBufferSize.
+func (e *endpoint) maxReceiveBufferSize() int {
+	var opt ReceiveBufferSizeOption
+	if err := e.stack.TransportProtocolOption(ProtocolNumber, &opt); err == nil {
+		return opt.Max
+	}
+	return MaxBufferSize
+}
+
+// rcvWndScaleForHandshake picks the receive window scale to advertise to the
+// peer during the handshake. With receive buffer auto-tuning disabled the
+// scale is derived from the current receive buffer size; otherwise it is
+// derived from the maximum permissible receive buffer size.
+func (e *endpoint) rcvWndScaleForHandshake() int {
+	current := e.receiveBufferSize()
+
+	e.rcvListMu.Lock()
+	useCurrent := e.rcvAutoParams.disabled
+	e.rcvListMu.Unlock()
+
+	scaleBasis := current
+	if !useCurrent {
+		scaleBasis = e.maxReceiveBufferSize()
+	}
+	return FindWndScale(seqnum.Size(scaleBasis))
+}
+
+// updateRecentTimestamp applies https://tools.ietf.org/html/rfc7323#section-4.3
+func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
+	tsAdvanced := e.sendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal))
+	if tsAdvanced && segSeq.LessThanEq(maxSentAck) {
+		e.setRecentTimestamp(tsVal)
+	}
+}
+
+// maybeEnableTimestamp turns on the timestamp option for this endpoint when
+// synOpts reports it as negotiated, seeding the recent timestamp from TSVal.
+func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
+	if !synOpts.TS {
+		return
+	}
+	e.sendTSOk = true
+	e.setRecentTimestamp(synOpts.TSVal)
+}
+
+// timestamp returns the value to place in the TSVal field of the timestamp
+// option on outgoing TCP segments: tcpTimeStamp applied to e.tsOffset.
+func (e *endpoint) timestamp() uint32 {
+	return tcpTimeStamp(e.tsOffset)
+}
+
+// tcpTimeStamp returns the current time in milliseconds, truncated to 32 bits
+// and shifted by offset. SYN cookies call it before any endpoint exists.
+func tcpTimeStamp(offset uint32) uint32 {
+	t := time.Now()
+	millis := t.Unix()*1000 + int64(t.Nanosecond()/1e6)
+	return uint32(millis) + offset
+}
+
+// timeStampOffset draws four random bytes and assembles them, little-endian,
+// into the offset added to timestamp values sent in the TCP timestamp option.
+//
+// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on why a
+// random offset is required.
+//
+// NOTE: This is not completely to spec, as normally the offset should be
+// initialized in a manner analogous to how sequence numbers are randomized on
+// a per-connection basis. For now this is sufficient.
+func timeStampOffset() uint32 {
+	var raw [4]byte
+	if _, err := rand.Read(raw[:]); err != nil {
+		panic(err)
+	}
+	offset := uint32(raw[0]) | uint32(raw[1])<<8
+	offset |= uint32(raw[2])<<16 | uint32(raw[3])<<24
+	return offset
+}
+
+// maybeEnableSACKPermitted sets e.sackPermitted when both the stack-level
+// SACKEnabled option is on and synOpts reports SACK as permitted. If the stack
+// option cannot be read the endpoint is left untouched.
+func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
+	var stackSACK SACKEnabled
+	err := e.stack.TransportProtocolOption(ProtocolNumber, &stackSACK)
+	switch {
+	case err != nil:
+		// The stack has no SACK setting to consult.
+	case bool(stackSACK) && synOpts.SACKPermitted:
+		e.sackPermitted = true
+	}
+}
+
+// maxOptionSize returns the length of the options built by makeOptions for
+// header.TCPMaxSACKBlocks SACK blocks, then hands them back via putOptions.
+func (e *endpoint) maxOptionSize() (size int) {
+	var blocks [header.TCPMaxSACKBlocks]header.SACKBlock
+	opts := e.makeOptions(blocks[:])
+	defer putOptions(opts)
+
+	return len(opts)
+}
+
+// completeState snapshots the endpoint into a stack.TCPEndpointState and
+// returns it. This is used before invoking the probe. The snapshot may not be
+// fully consistent if syscalls intervene while the state is being copied.
+func (e *endpoint) completeState() stack.TCPEndpointState {
+	var s stack.TCPEndpointState
+	s.SegTime = time.Now()
+
+	// Copy the endpoint ID.
+	s.ID = stack.TCPEndpointID(e.ID)
+
+	// Copy receive buffer and auto-tuning state under rcvListMu.
+	e.rcvListMu.Lock()
+	s.RcvBufSize = e.rcvBufSize
+	s.RcvBufUsed = e.rcvBufUsed
+	s.RcvClosed = e.rcvClosed
+	s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
+	s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
+	s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
+	s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
+	s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
+	s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
+	s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
+	e.rcvListMu.Unlock()
+
+	// Copy TCP option state (timestamps and SACK); no lock is taken here.
+	s.SendTSOk = e.sendTSOk
+	s.RecentTS = e.recentTimestamp()
+	s.TSOffset = e.tsOffset
+	s.SACKPermitted = e.sackPermitted
+	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
+	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
+	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
+
+	// Copy send buffer state under sndBufMu.
+	e.sndBufMu.Lock()
+	s.SndBufSize = e.sndBufSize
+	s.SndBufUsed = e.sndBufUsed
+	s.SndClosed = e.sndClosed
+	s.SndBufInQueue = e.sndBufInQueue
+	s.PacketTooBigCount = e.packetTooBigCount
+	s.SndMTU = e.sndMTU
+	e.sndBufMu.Unlock()
+
+	// Copy receiver state from e.rcv.
+	s.Receiver = stack.TCPReceiverState{
+		RcvNxt:         e.rcv.rcvNxt,
+		RcvAcc:         e.rcv.rcvAcc,
+		RcvWndScale:    e.rcv.rcvWndScale,
+		PendingBufUsed: e.rcv.pendingBufUsed,
+		PendingBufSize: e.rcv.pendingBufSize,
+	}
+
+	// Copy sender state from e.snd; SRTT is filled in below under its lock.
+	s.Sender = stack.TCPSenderState{
+		LastSendTime: e.snd.lastSendTime,
+		DupAckCount:  e.snd.dupAckCount,
+		FastRecovery: stack.TCPFastRecoveryState{
+			Active:    e.snd.fr.active,
+			First:     e.snd.fr.first,
+			Last:      e.snd.fr.last,
+			MaxCwnd:   e.snd.fr.maxCwnd,
+			HighRxt:   e.snd.fr.highRxt,
+			RescueRxt: e.snd.fr.rescueRxt,
+		},
+		SndCwnd:          e.snd.sndCwnd,
+		Ssthresh:         e.snd.sndSsthresh,
+		SndCAAckCount:    e.snd.sndCAAckCount,
+		Outstanding:      e.snd.outstanding,
+		SndWnd:           e.snd.sndWnd,
+		SndUna:           e.snd.sndUna,
+		SndNxt:           e.snd.sndNxt,
+		RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
+		RTTMeasureTime:   e.snd.rttMeasureTime,
+		Closed:           e.snd.closed,
+		RTO:              e.snd.rto,
+		MaxPayloadSize:   e.snd.maxPayloadSize,
+		SndWndScale:      e.snd.sndWndScale,
+		MaxSentAck:       e.snd.maxSentAck,
+	}
+	e.snd.rtt.Lock()
+	s.Sender.SRTT = e.snd.rtt.srtt
+	s.Sender.SRTTInited = e.snd.rtt.srttInited
+	e.snd.rtt.Unlock()
+	// Copy CUBIC state only when the congestion control in use is *cubicState.
+	if cubic, ok := e.snd.cc.(*cubicState); ok {
+		s.Sender.Cubic = stack.TCPCubicState{
+			WMax:                    cubic.wMax,
+			WLastMax:                cubic.wLastMax,
+			T:                       cubic.t,
+			TimeSinceLastCongestion: time.Since(cubic.t),
+			C:                       cubic.c,
+			K:                       cubic.k,
+			Beta:                    cubic.beta,
+			WC:                      cubic.wC,
+			WEst:                    cubic.wEst,
+		}
+	}
+	return s
+}
+
+// initHardwareGSO configures e.gso for hardware GSO on the current route.
+func (e *endpoint) initHardwareGSO() {
+	gso := &stack.GSO{NeedsCsum: true, CsumOffset: header.TCPChecksumOffset}
+	switch e.route.NetProto {
+	case header.IPv4ProtocolNumber:
+		gso.Type = stack.GSOTCPv4
+		gso.L3HdrLen = header.IPv4MinimumSize
+	case header.IPv6ProtocolNumber:
+		gso.Type = stack.GSOTCPv6
+		gso.L3HdrLen = header.IPv6MinimumSize
+	default:
+		// Report the protocol actually switched on, not e.NetProto.
+		panic(fmt.Sprintf("Unknown netProto: %v", e.route.NetProto))
+	}
+	gso.MaxSize = e.route.GSOMaxSize()
+	e.gso = gso
+}
+
+// initGSO sets up e.gso from the route capabilities, preferring hardware GSO
+// over software GSO. e.gso is left untouched when the route offers neither.
+func (e *endpoint) initGSO() {
+	switch {
+	case e.route.Capabilities()&stack.CapabilityHardwareGSO != 0:
+		e.initHardwareGSO()
+	case e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0:
+		// Software GSO: only the maximum size and the type are populated.
+		e.gso = &stack.GSO{MaxSize: e.route.GSOMaxSize(), Type: stack.GSOSW, NeedsCsum: false}
+	}
+}
+
+// State implements tcpip.Endpoint.State. It exposes the endpoint's protocol
+// state, converted to uint32, for diagnostics.
+func (e *endpoint) State() uint32 {
+	return uint32(e.EndpointState())
+}
+
+// Info returns a pointer to a private copy of the endpoint info, taken while
+// holding the user lock.
+func (e *endpoint) Info() tcpip.EndpointInfo {
+	e.LockUser()
+	defer e.UnlockUser()
+	snapshot := e.EndpointInfo
+	return &snapshot
+}
+
+// Stats returns a pointer to the endpoint's own stats (e.stats), not a copy.
+func (e *endpoint) Stats() tcpip.EndpointStats {
+	return &e.stats
+}
+
+// Wait implements stack.TransportEndpoint.Wait. It blocks until the worker is
+// no longer running, re-checking after every EventHUp notification.
+func (e *endpoint) Wait() {
+	entry, hupCh := waiter.NewChannelEntry(nil)
+	e.waiterQueue.EventRegister(&entry, waiter.EventHUp)
+	defer e.waiterQueue.EventUnregister(&entry)
+	workerRunning := func() bool {
+		e.LockUser()
+		defer e.UnlockUser()
+		return e.workerRunning
+	}
+	for workerRunning() {
+		<-hupCh
+	}
+}
+
+func mssForRoute(r *stack.Route) uint16 {
+	// TODO(b/143359391): Respect TCP Min and Max size; the value is unclamped.
+	return uint16(r.MTU() - header.TCPMinimumSize)
+}
diff --git a/pkg/tcpip/transport/tcp/endpoint_state.go b/pkg/tcpip/transport/tcp/endpoint_state.go
new file mode 100644
index 000000000..abf1ac5c9
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/endpoint_state.go
@@ -0,0 +1,348 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"fmt"
+	"sync/atomic"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// drainSegmentLocked notifies the protocol goroutine with notifyDrain and then
+// blocks on drainDone. e.mu must be held; it is released while waiting.
+func (e *endpoint) drainSegmentLocked() {
+	// Drain at most once: a non-nil drainDone means a drain was already run.
+	if e.drainDone != nil {
+		return
+	}
+	e.drainDone = make(chan struct{})
+	e.undrain = make(chan struct{})
+	e.mu.Unlock()
+
+	e.notifyProtocolGoroutine(notifyDrain)
+	<-e.drainDone
+	e.mu.Lock()
+}
+
+// beforeSave is invoked by stateify. It quiesces the endpoint per its state.
+func (e *endpoint) beforeSave() {
+	// Stop incoming packets by dropping the segment queue limit to zero.
+	e.segmentQueue.setLimit(0)
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	epState := e.EndpointState()
+	switch {
+	case epState == StateInitial || epState == StateBound: // Nothing to do.
+	case epState.connected() || epState.handshake():
+		if e.route.Capabilities()&stack.CapabilitySaveRestore == 0 {
+			if e.route.Capabilities()&stack.CapabilityDisconnectOk == 0 {
+				panic(tcpip.ErrSaveRejection{fmt.Errorf("endpoint cannot be saved in connected state: local %v:%d, remote %v:%d", e.ID.LocalAddress, e.ID.LocalPort, e.ID.RemoteAddress, e.ID.RemotePort)})
+			}
+			e.resetConnectionLocked(tcpip.ErrConnectionAborted)
+			e.mu.Unlock()
+			e.Close()
+			e.mu.Lock()
+		}
+		if !e.workerRunning {
+			// Without a worker the endpoint must either be sitting in
+			// acceptedChan or have just been disconnected and closed.
+			break
+		}
+		fallthrough
+	case epState == StateListen || epState == StateConnecting:
+		e.drainSegmentLocked()
+		// Refresh epState, since drainSegmentLocked may have changed it.
+		epState = e.EndpointState()
+		if !epState.closed() {
+			if !e.workerRunning {
+				panic("endpoint has no worker running in listen, connecting, or connected state")
+			}
+		}
+	case epState.closed():
+		for e.workerRunning { // Poll with e.mu released between checks.
+			e.mu.Unlock()
+			time.Sleep(100 * time.Millisecond)
+			e.mu.Lock()
+		}
+		if e.workerRunning { // NOTE(review): looks unreachable after the loop above.
+			panic(fmt.Sprintf("endpoint: %+v still has worker running in closed or error state", e.ID))
+		}
+	default:
+		panic(fmt.Sprintf("endpoint in unknown state %v", e.EndpointState()))
+	}
+	// Saving while the waiter queue is non-empty is treated as fatal.
+	if e.waiterQueue != nil && !e.waiterQueue.IsEmpty() {
+		panic("endpoint still has waiters upon save")
+	}
+}
+
+// saveAcceptedChan is invoked by stateify. It drains acceptedChan into a slice
+// with the channel's capacity, then refills the channel in the same order.
+func (e *endpoint) saveAcceptedChan() []*endpoint {
+	if e.acceptedChan == nil {
+		return nil
+	}
+	pending := make([]*endpoint, len(e.acceptedChan), cap(e.acceptedChan))
+	for i := range pending {
+		select {
+		case pending[i] = <-e.acceptedChan:
+		default:
+			panic("endpoint acceptedChan buffer got consumed by background context")
+		}
+	}
+	for _, ep := range pending {
+		select {
+		case e.acceptedChan <- ep:
+		default:
+			panic("endpoint acceptedChan buffer got populated by background context")
+		}
+	}
+	return pending
+}
+
+// loadAcceptedChan is invoked by stateify to rebuild acceptedChan from a save.
+func (e *endpoint) loadAcceptedChan(acceptedEndpoints []*endpoint) {
+	if backlog := cap(acceptedEndpoints); backlog > 0 {
+		e.acceptedChan = make(chan *endpoint, backlog)
+		for i := range acceptedEndpoints {
+			e.acceptedChan <- acceptedEndpoints[i]
+		}
+	}
+}
+
+// saveState is invoked by stateify. It returns the current EndpointState.
+func (e *endpoint) saveState() EndpointState {
+	return e.EndpointState()
+}
+
+// Endpoint loading must be done in the following ordering by their state, to
+// avoid dangling connecting w/o listening peer, and to avoid conflicts in port
+// reservation. Bound endpoint loading happens last.
+var (
+	connectedLoading  sync.WaitGroup
+	listenLoading     sync.WaitGroup
+	connectingLoading sync.WaitGroup
+)
+
+// loadState is invoked by stateify. It registers the endpoint with the loading
+// wait group(s) matching epState and then records epState on the endpoint.
+func (e *endpoint) loadState(epState EndpointState) {
+	// Register up front so the wait groups include every applicable endpoint
+	// before any asynchronous Wait() call. For restore purposes TimeWait is
+	// treated like a connected state.
+	if epState.connected() || epState == StateTimeWait {
+		connectedLoading.Add(1)
+	}
+	if epState == StateListen {
+		listenLoading.Add(1)
+	} else if epState.connecting() {
+		connectingLoading.Add(1)
+	}
+	// Store the state directly instead of going through e.setEndpointState:
+	// the endpoint is still being loaded and its stack reference is not yet
+	// initialized.
+	atomic.StoreUint32((*uint32)(&e.state), uint32(epState))
+}
+
+// afterLoad is invoked by stateify. It stashes the loaded state for Resume.
+func (e *endpoint) afterLoad() {
+	e.origEndpointState = e.state
+	// Park the endpoint in StateInitial for now; it is moved on to its
+	// origEndpointState during Resume.
+	e.state = StateInitial
+	// Condition variables and mutexes are not saved/restored, so rebuild
+	// acceptCond on top of e.acceptMu.
+	e.acceptCond = sync.NewCond(&e.acceptMu)
+	stack.StackFromEnv.RegisterRestoredEndpoint(e)
+}
+
+// Resume implements tcpip.ResumableEndpoint.Resume using origEndpointState.
+func (e *endpoint) Resume(s *stack.Stack) {
+	e.stack = s
+	e.segmentQueue.setLimit(MaxUnprocessedSegments)
+	epState := e.origEndpointState
+	switch epState {
+	case StateInitial, StateBound, StateListen, StateConnecting, StateEstablished:
+		var ss SendBufferSizeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
+			if e.sndBufSize < ss.Min || e.sndBufSize > ss.Max {
+				panic(fmt.Sprintf("endpoint.sndBufSize %d is outside the min and max allowed [%d, %d]", e.sndBufSize, ss.Min, ss.Max))
+			}
+		}
+
+		var rs ReceiveBufferSizeOption
+		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
+			if e.rcvBufSize < rs.Min || e.rcvBufSize > rs.Max {
+				panic(fmt.Sprintf("endpoint.rcvBufSize %d is outside the min and max allowed [%d, %d]", e.rcvBufSize, rs.Min, rs.Max))
+			}
+		}
+	}
+	// bind re-reserves the endpoint's port tuple and marks it StateBound.
+	bind := func() {
+		addr, _, err := e.checkV4MappedLocked(tcpip.FullAddress{Addr: e.BindAddr, Port: e.ID.LocalPort})
+		if err != nil {
+			panic("unable to parse BindAddr: " + err.String())
+		}
+		if ok := e.stack.ReserveTuple(e.effectiveNetProtos, ProtocolNumber, addr.Addr, addr.Port, e.boundPortFlags, e.boundBindToDevice, e.boundDest); !ok {
+			panic(fmt.Sprintf("unable to re-reserve tuple (%v, %q, %d, %+v, %d, %v)", e.effectiveNetProtos, addr.Addr, addr.Port, e.boundPortFlags, e.boundBindToDevice, e.boundDest))
+		}
+		e.isPortReserved = true
+
+		// Mark the endpoint as bound.
+		e.setEndpointState(StateBound)
+	}
+	// Restore per saved state; async cases wait on the earlier wait groups.
+	switch {
+	case epState.connected():
+		bind()
+		if len(e.connectingAddress) == 0 {
+			e.connectingAddress = e.ID.RemoteAddress
+			// This endpoint was accepted by netstack but not yet by
+			// the app. If the endpoint is IPv6 but the remote
+			// address is IPv4, connect as IPv6 (v4-mapped prefix) so
+			// that dual-stack mode can be properly activated.
+			if e.NetProto == header.IPv6ProtocolNumber && len(e.ID.RemoteAddress) != header.IPv6AddressSize {
+				e.connectingAddress = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + e.ID.RemoteAddress
+			}
+		}
+		// Reset the scoreboard to reinitialize the SACK information, as
+		// SACK information is not restored.
+		e.scoreboard.Reset()
+		if err := e.connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.ID.RemotePort}, false, e.workerRunning); err != tcpip.ErrConnectStarted {
+			panic("endpoint connecting failed: " + err.String())
+		}
+		e.mu.Lock()
+		e.state = e.origEndpointState
+		closed := e.closed
+		e.mu.Unlock()
+		e.notifyProtocolGoroutine(notifyTickleWorker)
+		if epState == StateFinWait2 && closed {
+			// The endpoint had been closed, so notify the worker to make
+			// sure the FIN_WAIT2 timer is started after a restore.
+			e.notifyProtocolGoroutine(notifyClose)
+		}
+		connectedLoading.Done()
+	case epState == StateListen:
+		tcpip.AsyncLoading.Add(1)
+		go func() {
+			connectedLoading.Wait() // Listeners load after connected endpoints.
+			bind()
+			backlog := cap(e.acceptedChan)
+			if err := e.Listen(backlog); err != nil {
+				panic("endpoint listening failed: " + err.String())
+			}
+			e.LockUser()
+			if e.shutdownFlags != 0 {
+				e.shutdownLocked(e.shutdownFlags)
+			}
+			e.UnlockUser()
+			listenLoading.Done()
+			tcpip.AsyncLoading.Done()
+		}()
+	case epState.connecting():
+		tcpip.AsyncLoading.Add(1)
+		go func() {
+			connectedLoading.Wait()
+			listenLoading.Wait() // Connecting endpoints load after listeners.
+			bind()
+			if err := e.Connect(tcpip.FullAddress{NIC: e.boundNICID, Addr: e.connectingAddress, Port: e.ID.RemotePort}); err != tcpip.ErrConnectStarted {
+				panic("endpoint connecting failed: " + err.String())
+			}
+			connectingLoading.Done()
+			tcpip.AsyncLoading.Done()
+		}()
+	case epState == StateBound:
+		tcpip.AsyncLoading.Add(1)
+		go func() {
+			connectedLoading.Wait()
+			listenLoading.Wait()
+			connectingLoading.Wait() // Bound endpoints load last.
+			bind()
+			tcpip.AsyncLoading.Done()
+		}()
+	case epState == StateClose:
+		e.isPortReserved = false
+		e.state = StateClose
+		e.stack.CompleteTransportEndpointCleanup(e)
+		tcpip.DeleteDanglingEndpoint(e)
+	case epState == StateError:
+		e.state = StateError
+		e.stack.CompleteTransportEndpointCleanup(e)
+		tcpip.DeleteDanglingEndpoint(e)
+	}
+}
+
+// saveLastError is invoked by stateify. It encodes lastError as its String()
+// form, using the empty string to stand for a nil error.
+func (e *endpoint) saveLastError() string {
+	if err := e.lastError; err != nil {
+		return err.String()
+	}
+	return ""
+}
+
+// loadLastError is invoked by stateify. It restores lastError from the saved
+// string form via tcpip.StringToError. An empty string leaves lastError
+// untouched.
+func (e *endpoint) loadLastError(s string) {
+	if s != "" {
+		e.lastError = tcpip.StringToError(s)
+	}
+}
+
+// saveHardError is invoked by stateify. It encodes HardError as its String()
+// form, using the empty string to stand for a nil error.
+func (e *EndpointInfo) saveHardError() string {
+	if err := e.HardError; err != nil {
+		return err.String()
+	}
+	return ""
+}
+
+// loadHardError is invoked by stateify. It restores HardError from the saved
+// string form via tcpip.StringToError. An empty string leaves HardError
+// untouched.
+func (e *EndpointInfo) loadHardError(s string) {
+	if s != "" {
+		e.HardError = tcpip.StringToError(s)
+	}
+}
+
+// saveMeasureTime is invoked by stateify; it saves (seconds, sub-second nanos).
+func (r *rcvBufAutoTuneParams) saveMeasureTime() unixTime {
+	return unixTime{r.measureTime.Unix(), int64(r.measureTime.Nanosecond())} // time.Unix adds nsec to sec on load.
+}
+
+// loadMeasureTime is invoked by stateify. It rebuilds measureTime via time.Unix.
+func (r *rcvBufAutoTuneParams) loadMeasureTime(unix unixTime) {
+	r.measureTime = time.Unix(unix.second, unix.nano) // NOTE(review): nano is added on top of second.
+}
+
+// saveRttMeasureTime is invoked by stateify; it saves (seconds, sub-second nanos).
+func (r *rcvBufAutoTuneParams) saveRttMeasureTime() unixTime {
+	return unixTime{r.rttMeasureTime.Unix(), int64(r.rttMeasureTime.Nanosecond())} // time.Unix adds nsec to sec on load.
+}
+
+// loadRttMeasureTime is invoked by stateify. It rebuilds rttMeasureTime via time.Unix.
+func (r *rcvBufAutoTuneParams) loadRttMeasureTime(unix unixTime) {
+	r.rttMeasureTime = time.Unix(unix.second, unix.nano) // NOTE(review): nano is added on top of second.
+}
diff --git a/pkg/tcpip/transport/tcp/forwarder.go b/pkg/tcpip/transport/tcp/forwarder.go
new file mode 100644
index 000000000..070b634b4
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/forwarder.go
@@ -0,0 +1,169 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Forwarder is a connection request forwarder, which lets clients decide what
+// to do with a connection request, for example: ignore it, send a RST, or
+// attempt to complete the 3-way handshake.
+//
+// The canonical way of using it is to pass the Forwarder.HandlePacket function
+// to stack.SetTransportProtocolHandler.
+type Forwarder struct {
+	maxInFlight int
+	handler     func(*ForwarderRequest)
+
+	mu       sync.Mutex
+	inFlight map[stack.TransportEndpointID]struct{} // Requests not yet completed; accessed with mu held.
+	listen   *listenContext
+}
+
+// NewForwarder builds a Forwarder that passes connection requests to handler
+// and tracks at most maxInFlight of them at a time; requests arriving beyond
+// that limit are ignored. A zero rcvWnd selects DefaultReceiveBufferSize
+// instead.
+func NewForwarder(s *stack.Stack, rcvWnd, maxInFlight int, handler func(*ForwarderRequest)) *Forwarder {
+	if rcvWnd == 0 {
+		rcvWnd = DefaultReceiveBufferSize
+	}
+	f := &Forwarder{
+		handler:     handler,
+		maxInFlight: maxInFlight,
+		inFlight:    make(map[stack.TransportEndpointID]struct{}),
+	}
+	f.listen = newListenContext(s, nil /* listenEP */, seqnum.Size(rcvWnd), true, 0)
+	return f
+}
+
+// HandlePacket inspects a packet on behalf of the forwarder. It returns true
+// when the packet is a well-formed SYN, which the forwarder then considers
+// handled, and false for anything else, which is left unhandled.
+//
+// This function is expected to be passed as an argument to the
+// stack.SetTransportProtocolHandler function.
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+	seg := newSegment(r, id, pkt)
+	defer seg.decRef()
+
+	// Anything other than a well-formed, pure SYN is not ours to handle.
+	if isSyn := seg.parse() && seg.csumValid && seg.flags == header.TCPFlagSyn; !isSyn {
+		return false
+	}
+
+	synOpts := parseSynSegmentOptions(seg)
+
+	f.mu.Lock()
+	defer f.mu.Unlock()
+
+	// Every well-formed SYN is reported as handled, acted on or not.
+	_, duplicate := f.inFlight[id]
+	switch {
+	case duplicate:
+		// A request for this id is already in flight; ignore this one.
+	case len(f.inFlight) >= f.maxInFlight:
+		// The in-flight limit has been reached; ignore the segment.
+	default:
+		// Track the request and hand it to the handler on a new goroutine.
+		// The reference taken here is dropped by ForwarderRequest.Complete.
+		f.inFlight[id] = struct{}{}
+		seg.incRef()
+		go f.handler(&ForwarderRequest{ // S/R-SAFE: not used by Sentry.
+			forwarder:  f,
+			segment:    seg,
+			synOptions: synOpts,
+		})
+	}
+
+	return true
+}
+
+// ForwarderRequest represents a connection request received by the forwarder
+// and passed to the client. Clients must eventually call Complete() on it, and
+// may optionally create an endpoint to represent it via CreateEndpoint.
+type ForwarderRequest struct {
+	mu         sync.Mutex
+	forwarder  *Forwarder
+	segment    *segment // Set to nil by Complete.
+	synOptions header.TCPSynOptions
+}
+
+// ID returns the 4-tuple (src address, src port, dst address, dst port) of the
+// request. NOTE(review): r.segment is read unlocked and is nil after Complete.
+func (r *ForwarderRequest) ID() stack.TransportEndpointID {
+	return r.segment.id
+}
+
+// Complete finishes the request: it drops the request from the forwarder's
+// in-flight set, sends a RST back to the sender when sendReset is true, and
+// releases the segment. It panics if the request was already completed.
+func (r *ForwarderRequest) Complete(sendReset bool) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	seg, fwd := r.segment, r.forwarder
+	if seg == nil {
+		panic("Completing already completed forwarder request")
+	}
+
+	// Stop tracking the request in the forwarder.
+	fwd.mu.Lock()
+	delete(fwd.inFlight, seg.id)
+	fwd.mu.Unlock()
+
+	if sendReset {
+		replyWithReset(seg, stack.DefaultTOS, seg.route.DefaultTTL())
+	}
+
+	// Drop our reference and mark the request as completed.
+	seg.decRef()
+	r.segment, r.forwarder = nil, nil
+}
+
+// CreateEndpoint builds a TCP endpoint for the connection request, performing
+// the 3-way handshake in the process, and starts its protocol goroutine. It
+// returns tcpip.ErrInvalidEndpointState if the request was already completed.
+func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+
+	if r.segment == nil {
+		return nil, tcpip.ErrInvalidEndpointState
+	}
+
+	synOpts := header.TCPSynOptions{
+		MSS:           r.synOptions.MSS,
+		WS:            r.synOptions.WS,
+		TS:            r.synOptions.TS,
+		TSVal:         r.synOptions.TSVal,
+		TSEcr:         r.synOptions.TSEcr,
+		SACKPermitted: r.synOptions.SACKPermitted,
+	}
+	ep, err := r.forwarder.listen.createEndpointAndPerformHandshake(r.segment, &synOpts, queue, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// The handshake is done; start the protocol goroutine.
+	ep.startAcceptedLoop()
+	return ep, nil
+}
diff --git a/pkg/tcpip/transport/tcp/protocol.go b/pkg/tcpip/transport/tcp/protocol.go
new file mode 100644
index 000000000..b34e47bbd
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/protocol.go
@@ -0,0 +1,541 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tcp contains the implementation of the TCP transport protocol. To use
+// it in the networking stack, this package must be added to the project, and
+// activated on the stack by passing tcp.NewProtocol() as one of the
+// transport protocols when calling stack.New(). Then endpoints can be created
+// by passing tcp.ProtocolNumber as the transport protocol number when calling
+// Stack.NewEndpoint().
+package tcp
+
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// ProtocolNumber is the tcp protocol number.
+	ProtocolNumber = header.TCPProtocolNumber
+
+	// MinBufferSize is the smallest size of a receive or send buffer.
+	MinBufferSize = 4 << 10 // 4096 bytes.
+
+	// DefaultSendBufferSize is the default size of the send buffer for
+	// an endpoint.
+	DefaultSendBufferSize = 1 << 20 // 1 MiB
+
+	// DefaultReceiveBufferSize is the default size of the receive buffer
+	// for an endpoint.
+	DefaultReceiveBufferSize = 1 << 20 // 1 MiB
+
+	// MaxBufferSize is the largest size a receive/send buffer can grow to.
+	MaxBufferSize = 4 << 20 // 4 MiB
+
+	// MaxUnprocessedSegments is the maximum number of unprocessed segments
+	// that can be queued for a given endpoint.
+	MaxUnprocessedSegments = 300
+
+	// DefaultTCPLingerTimeout is the amount of time that sockets linger in
+	// FIN_WAIT_2 state before being marked closed.
+	DefaultTCPLingerTimeout = 60 * time.Second
+
+	// DefaultTCPTimeWaitTimeout is the amount of time that sockets linger
+	// in TIME_WAIT state before being marked closed.
+	DefaultTCPTimeWaitTimeout = 60 * time.Second
+
+	// DefaultSynRetries is the default value for the number of SYN retransmits
+	// before a connect is aborted.
+	DefaultSynRetries = 6
+)
+
+const (
+	ccReno  = "reno"  // congestion control name; the default chosen by NewProtocol
+	ccCubic = "cubic" // congestion control name; listed as available by NewProtocol
+)
+
+// SACKEnabled is used by stack.(*Stack).TransportProtocolOption to
+// enable/disable SACK support in TCP. See: https://tools.ietf.org/html/rfc2018.
+type SACKEnabled bool
+
+// DelayEnabled is used by stack.(*Stack).TransportProtocolOption to
+// enable/disable Nagle's algorithm in TCP.
+type DelayEnabled bool
+
+// SendBufferSizeOption is used by stack.(*Stack).TransportProtocolOption to get/set the
+// default, min and max TCP send buffer sizes. SetOption requires 0 < Min <= Default <= Max.
+type SendBufferSizeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+// ReceiveBufferSizeOption is used by
+// stack.(*Stack).TransportProtocolOption to get/set the default, min and max
+// TCP receive buffer sizes. SetOption requires 0 < Min <= Default <= Max.
+type ReceiveBufferSizeOption struct {
+	Min     int
+	Default int
+	Max     int
+}
+
+// synRcvdCounter tracks the number of endpoints in the SYN-RCVD state. The
+// value is protected by a mutex so that we can increment only when it's
+// guaranteed not to go above a threshold.
+type synRcvdCounter struct {
+	sync.Mutex
+	value     uint64         // current count; changed only by inc and dec
+	pending   sync.WaitGroup // Add(1) on every successful inc, Done on every dec
+	threshold uint64         // limit that inc will not let value exceed
+}
+
+// inc attempts to count one more endpoint in the SYN-RCVD state. It returns
+// true when the count was below the threshold and has been incremented, and
+// false, leaving the count untouched, when the threshold was already reached.
+func (s *synRcvdCounter) inc() bool {
+	s.Lock()
+	defer s.Unlock()
+
+	belowThreshold := s.value < s.threshold
+	if belowThreshold {
+		// Register the pending entry, then count it.
+		s.pending.Add(1)
+		s.value++
+	}
+	return belowThreshold
+}
+
+// dec decrements the number of endpoints in SYN-RCVD state while holding the
+// counter's lock. It must only be called if a previous call to inc succeeded.
+func (s *synRcvdCounter) dec() {
+	s.Lock()
+	defer s.Unlock()
+	s.value--        // no underflow check here; relies on the pairing with inc
+	s.pending.Done() // balances the pending.Add(1) performed by inc
+}
+
+// synCookiesInUse returns true if the number of endpoints in SYN-RCVD state
+// has reached the configured threshold, i.e. value >= threshold.
+func (s *synRcvdCounter) synCookiesInUse() bool {
+	s.Lock()
+	defer s.Unlock()
+	return s.value >= s.threshold
+}
+
+// SetThreshold replaces the counter's threshold with the new value while holding its lock.
+func (s *synRcvdCounter) SetThreshold(threshold uint64) {
+	s.Lock()
+	defer s.Unlock()
+	s.threshold = threshold
+}
+
+// Threshold returns the counter's current threshold, read while holding its lock.
+func (s *synRcvdCounter) Threshold() uint64 {
+	s.Lock()
+	defer s.Unlock()
+	return s.threshold
+}
+
+type protocol struct {
+	mu                         sync.RWMutex // write-locked by SetOption, read-locked by Option
+	sackEnabled                bool
+	delayEnabled               bool
+	sendBufferSize             SendBufferSizeOption
+	recvBufferSize             ReceiveBufferSizeOption
+	congestionControl          string
+	availableCongestionControl []string
+	moderateReceiveBuffer      bool
+	tcpLingerTimeout           time.Duration
+	tcpTimeWaitTimeout         time.Duration
+	minRTO                     time.Duration
+	maxRTO                     time.Duration
+	maxRetries                 uint32
+	synRcvdCount               synRcvdCounter
+	synRetries                 uint8
+	dispatcher                 dispatcher // initialised by NewProtocol; used by QueuePacket, Close and Wait
+}
+
+// Number returns ProtocolNumber, the tcp protocol number.
+func (*protocol) Number() tcpip.TransportProtocolNumber {
+	return ProtocolNumber
+}
+
+// NewEndpoint creates a new tcp endpoint via newEndpoint; the returned error is always nil.
+func (p *protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newEndpoint(stack, netProto, waiterQueue), nil
+}
+
+// NewRawEndpoint creates a new raw TCP endpoint by delegating to
+// raw.NewEndpoint. It implements stack.TransportProtocol.NewRawEndpoint.
+func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return raw.NewEndpoint(stack, netProto, header.TCPProtocolNumber, waiterQueue)
+}
+
+// MinimumPacketSize returns the minimum valid tcp packet size, header.TCPMinimumSize.
+func (*protocol) MinimumPacketSize() int {
+	return header.TCPMinimumSize
+}
+
+// ParsePorts returns the source and destination ports stored in the given tcp
+// packet. It does not check the length of v itself and always returns a nil error.
+func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
+	h := header.TCP(v)
+	return h.SourcePort(), h.DestinationPort(), nil
+}
+
+// QueuePacket hands the packet to the protocol's dispatcher, which queues packets
+// targeted at an endpoint after hashing the packet to a specific processing queue.
+// Each queue is serviced by its own processor goroutine which is responsible for
+// dequeuing and doing full TCP dispatch of the packet.
+func (p *protocol) QueuePacket(r *stack.Route, ep stack.TransportEndpoint, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
+	p.dispatcher.queuePacket(r, ep, id, pkt)
+}
+
+// HandleUnknownDestinationPacket handles packets targeted at this protocol but
+// that don't match any existing endpoint. It returns false only when the
+// segment cannot be parsed or its checksum is invalid, and true otherwise.
+//
+// RFC 793, page 36, states that "If the connection does not exist (CLOSED) then
+// a reset is sent in response to any incoming segment except another reset. In
+// particular, SYNs addressed to a non-existent connection are rejected by this
+// means."
+func (*protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+	seg := newSegment(r, id, pkt)
+	defer seg.decRef()
+
+	switch {
+	case !seg.parse() || !seg.csumValid:
+		return false
+	case seg.flagIsSet(header.TCPFlagRst):
+		// A reset is never answered with another reset.
+		return true
+	}
+
+	replyWithReset(seg, stack.DefaultTOS, seg.route.DefaultTTL())
+	return true
+}
+
+// replyWithReset answers the given segment with a reset segment, choosing the
+// sequence and ACK numbers as RFC 793 page 35 (Reset Generation) requires:
+//
+//   1.  If the connection does not exist (CLOSED) then a reset is sent
+//   in response to any incoming segment except another reset.  In
+//   particular, SYNs addressed to a non-existent connection are rejected
+//   by this means.
+//
+//   If the incoming segment has an ACK field, the reset takes its
+//   sequence number from the ACK field of the segment, otherwise the
+//   reset has sequence number zero and the ACK field is set to the sum
+//   of the sequence number and segment length of the incoming segment.
+//   The connection remains in the CLOSED state.
+func replyWithReset(s *segment, tos, ttl uint8) {
+	// Start from a bare RST; seq and ack stay zero unless set below.
+	fields := tcpFields{
+		id:     s.id,
+		ttl:    ttl,
+		tos:    tos,
+		flags:  byte(header.TCPFlagRst),
+		rcvWnd: 0,
+	}
+	if s.flagIsSet(header.TCPFlagAck) {
+		// The incoming ACK number becomes the reset's sequence number.
+		fields.seq = s.ackNumber
+	} else {
+		// Nothing to echo: acknowledge everything the segment occupied instead.
+		fields.flags |= header.TCPFlagAck
+		fields.ack = s.sequenceNumber.Add(s.logicalLen())
+	}
+
+	sendTCP(&s.route, fields, buffer.VectorisedView{}, nil /* gso */, nil /* PacketOwner */)
+}
+
+// SetOption implements stack.TransportProtocol.SetOption; unsupported option types yield ErrUnknownProtocolOption.
+func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case SACKEnabled:
+		p.mu.Lock()
+		p.sackEnabled = bool(v)
+		p.mu.Unlock()
+		return nil
+
+	case DelayEnabled:
+		p.mu.Lock()
+		p.delayEnabled = bool(v)
+		p.mu.Unlock()
+		return nil
+
+	case SendBufferSizeOption:
+		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
+			return tcpip.ErrInvalidOptionValue
+		}
+		p.mu.Lock()
+		p.sendBufferSize = v
+		p.mu.Unlock()
+		return nil
+
+	case ReceiveBufferSizeOption:
+		if v.Min <= 0 || v.Default < v.Min || v.Default > v.Max {
+			return tcpip.ErrInvalidOptionValue
+		}
+		p.mu.Lock()
+		p.recvBufferSize = v
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.CongestionControlOption:
+		for _, c := range p.availableCongestionControl {
+			if string(v) == c {
+				p.mu.Lock()
+				p.congestionControl = string(v)
+				p.mu.Unlock()
+				return nil
+			}
+		}
+		// Linux returns ENOENT when an invalid congestion control
+		// is specified.
+		return tcpip.ErrNoSuchFile
+
+	case tcpip.ModerateReceiveBufferOption:
+		p.mu.Lock()
+		p.moderateReceiveBuffer = bool(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPLingerTimeoutOption:
+		if v < 0 { // negative timeouts are stored as zero
+			v = 0
+		}
+		p.mu.Lock()
+		p.tcpLingerTimeout = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPTimeWaitTimeoutOption:
+		if v < 0 { // negative timeouts are stored as zero
+			v = 0
+		}
+		p.mu.Lock()
+		p.tcpTimeWaitTimeout = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPMinRTOOption:
+		if v < 0 { // negative values fall back to MinRTO
+			v = tcpip.TCPMinRTOOption(MinRTO)
+		}
+		p.mu.Lock()
+		p.minRTO = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPMaxRTOOption:
+		if v < 0 { // negative values fall back to MaxRTO
+			v = tcpip.TCPMaxRTOOption(MaxRTO)
+		}
+		p.mu.Lock()
+		p.maxRTO = time.Duration(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPMaxRetriesOption:
+		p.mu.Lock()
+		p.maxRetries = uint32(v)
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPSynRcvdCountThresholdOption:
+		p.mu.Lock()
+		p.synRcvdCount.SetThreshold(uint64(v))
+		p.mu.Unlock()
+		return nil
+
+	case tcpip.TCPSynRetriesOption:
+		if v < 1 || v > 255 {
+			return tcpip.ErrInvalidOptionValue
+		}
+		p.mu.Lock()
+		p.synRetries = uint8(v)
+		p.mu.Unlock()
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// Option implements stack.TransportProtocol.Option; option is a pointer through which the current value is stored.
+func (p *protocol) Option(option interface{}) *tcpip.Error {
+	switch v := option.(type) {
+	case *SACKEnabled:
+		p.mu.RLock()
+		*v = SACKEnabled(p.sackEnabled)
+		p.mu.RUnlock()
+		return nil
+
+	case *DelayEnabled:
+		p.mu.RLock()
+		*v = DelayEnabled(p.delayEnabled)
+		p.mu.RUnlock()
+		return nil
+
+	case *SendBufferSizeOption:
+		p.mu.RLock()
+		*v = p.sendBufferSize
+		p.mu.RUnlock()
+		return nil
+
+	case *ReceiveBufferSizeOption:
+		p.mu.RLock()
+		*v = p.recvBufferSize
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.CongestionControlOption:
+		p.mu.RLock()
+		*v = tcpip.CongestionControlOption(p.congestionControl)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.AvailableCongestionControlOption:
+		p.mu.RLock()
+		*v = tcpip.AvailableCongestionControlOption(strings.Join(p.availableCongestionControl, " "))
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.ModerateReceiveBufferOption:
+		p.mu.RLock()
+		*v = tcpip.ModerateReceiveBufferOption(p.moderateReceiveBuffer)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPLingerTimeoutOption:
+		p.mu.RLock()
+		*v = tcpip.TCPLingerTimeoutOption(p.tcpLingerTimeout)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPTimeWaitTimeoutOption:
+		p.mu.RLock()
+		*v = tcpip.TCPTimeWaitTimeoutOption(p.tcpTimeWaitTimeout)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPMinRTOOption:
+		p.mu.RLock()
+		*v = tcpip.TCPMinRTOOption(p.minRTO)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPMaxRTOOption:
+		p.mu.RLock()
+		*v = tcpip.TCPMaxRTOOption(p.maxRTO)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPMaxRetriesOption:
+		p.mu.RLock()
+		*v = tcpip.TCPMaxRetriesOption(p.maxRetries)
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPSynRcvdCountThresholdOption:
+		p.mu.RLock()
+		*v = tcpip.TCPSynRcvdCountThresholdOption(p.synRcvdCount.Threshold())
+		p.mu.RUnlock()
+		return nil
+
+	case *tcpip.TCPSynRetriesOption:
+		p.mu.RLock()
+		*v = tcpip.TCPSynRetriesOption(p.synRetries)
+		p.mu.RUnlock()
+		return nil
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// Close implements stack.TransportProtocol.Close by closing the dispatcher.
+func (p *protocol) Close() {
+	p.dispatcher.close()
+}
+
+// Wait implements stack.TransportProtocol.Wait by waiting on the dispatcher.
+func (p *protocol) Wait() {
+	p.dispatcher.wait()
+}
+
+// SynRcvdCounter returns a pointer to the synRcvdCount of this protocol
+// instance; callers share the counter (and its lock) with the protocol.
+func (p *protocol) SynRcvdCounter() *synRcvdCounter {
+	return &p.synRcvdCount
+}
+
+// Parse implements stack.TransportProtocol.Parse: it moves the TCP header from pkt.Data into pkt.TransportHeader.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	hdr, ok := pkt.Data.PullUp(header.TCPMinimumSize)
+	if !ok {
+		return false
+	}
+
+	// If the header has options that fit in pkt.Data, pull those up as well; otherwise only the minimum header is kept.
+	if offset := int(header.TCP(hdr).DataOffset()); offset > header.TCPMinimumSize && offset <= pkt.Data.Size() {
+		hdr, ok = pkt.Data.PullUp(offset)
+		if !ok {
+			panic(fmt.Sprintf("There should be at least %d bytes in pkt.Data.", offset))
+		}
+	}
+
+	pkt.TransportHeader = hdr
+	pkt.Data.TrimFront(len(hdr))
+	return true
+}
+
+// NewProtocol returns a TCP transport protocol. Its options start out at the
+// package defaults (Reno congestion control, default buffer sizes, timeouts and
+// retry limits) and its dispatcher is initialised before it is returned.
+func NewProtocol() stack.TransportProtocol {
+	// Both buffer options share the same minimum and maximum.
+	sendBuf := SendBufferSizeOption{Min: MinBufferSize, Default: DefaultSendBufferSize, Max: MaxBufferSize}
+	recvBuf := ReceiveBufferSizeOption{Min: MinBufferSize, Default: DefaultReceiveBufferSize, Max: MaxBufferSize}
+
+	p := &protocol{
+		sendBufferSize:             sendBuf,
+		recvBufferSize:             recvBuf,
+		congestionControl:          ccReno,
+		availableCongestionControl: []string{ccReno, ccCubic},
+		tcpLingerTimeout:           DefaultTCPLingerTimeout,
+		tcpTimeWaitTimeout:         DefaultTCPTimeWaitTimeout,
+		synRcvdCount:               synRcvdCounter{threshold: SynRcvdCountThreshold},
+		synRetries:                 DefaultSynRetries,
+		minRTO:                     MinRTO,
+		maxRTO:                     MaxRTO,
+		maxRetries:                 MaxRetries,
+	}
+
+	// The dispatcher is initialised with the current GOMAXPROCS value.
+	p.dispatcher.init(runtime.GOMAXPROCS(0))
+	return p
+}
diff --git a/pkg/tcpip/transport/tcp/rcv.go b/pkg/tcpip/transport/tcp/rcv.go
new file mode 100644
index 000000000..dd89a292a
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv.go
@@ -0,0 +1,475 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"container/heap"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+// receiver holds the state necessary to receive TCP segments and turn them
+// into a stream of bytes.
+//
+// +stateify savable
+type receiver struct {
+	ep *endpoint
+
+	rcvNxt seqnum.Value // sequence number the receiver expects to consume next
+
+	// rcvAcc is one beyond the last acceptable sequence number. That is,
+	// the "largest" sequence value that the receiver has announced to
+	// its peer that it's willing to accept. This may be different than
+	// rcvNxt + rcvWnd if the receive window is reduced; in that case we
+	// have to reduce the window as we receive more data instead of
+	// shrinking it.
+	rcvAcc seqnum.Value
+
+	// rcvWnd is the non-scaled receive window last advertised to the peer.
+	rcvWnd seqnum.Size
+
+	rcvWndScale uint8
+
+	closed bool // set by consumeSegment once a FIN has been consumed
+
+	pendingRcvdSegments segmentHeap
+	pendingBufUsed      seqnum.Size
+	pendingBufSize      seqnum.Size
+
+	// Time when the last ack was received.
+	lastRcvdAckTime time.Time `state:".(unixTime)"`
+}
+
+// newReceiver returns a receiver for ep, positioned relative to the sequence
+// number irs and stamped with the current time as its last-ACK time.
+func newReceiver(ep *endpoint, irs seqnum.Value, rcvWnd seqnum.Size, rcvWndScale uint8, pendingBufSize seqnum.Size) *receiver {
+	r := &receiver{ep: ep, rcvWnd: rcvWnd, rcvWndScale: rcvWndScale, pendingBufSize: pendingBufSize}
+	// rcvNxt starts one past irs, and rcvAcc lies rcvWnd beyond rcvNxt, so
+	// the initial window [rcvNxt, rcvAcc) is exactly rcvWnd wide.
+	r.rcvNxt = irs + 1
+	r.rcvAcc = irs.Add(rcvWnd + 1)
+	r.lastRcvdAckTime = time.Now()
+	return r
+}
+
+// acceptable checks if the segment sequence number range is acceptable
+// according to the table on page 26 of RFC 793.
+func (r *receiver) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
+	// r.rcvWnd could be much larger than the window size we advertised in our
+	// outgoing packets, we should use what we have advertised for acceptability
+	// test.
+	scaledWindowSize := r.rcvWnd >> r.rcvWndScale
+	if scaledWindowSize > 0xffff {
+		// This is what we actually put in the Window field.
+		scaledWindowSize = 0xffff
+	}
+	advertisedWindowSize := scaledWindowSize << r.rcvWndScale
+	return header.Acceptable(segSeq, segLen, r.rcvNxt, r.rcvNxt.Add(advertisedWindowSize))
+}
+
+// getSendParams returns the next expected sequence number and the scaled
+// receive window, which the sender needs when building segments to send.
+func (r *receiver) getSendParams() (rcvNxt seqnum.Value, rcvWnd seqnum.Size) {
+	// Derive the window's right edge from the available receive buffer
+	// space, but never pull back an edge that is already further out.
+	avail := seqnum.Size(r.ep.receiveBufferAvailable())
+	if edge := r.rcvNxt.Add(avail); r.rcvAcc.LessThan(edge) {
+		r.rcvAcc = edge
+	}
+	// Stash away the non-scaled receive window as we use it for measuring
+	// receiver's estimated RTT.
+	r.rcvWnd = r.rcvNxt.Size(r.rcvAcc)
+	return r.rcvNxt, r.rcvWnd >> r.rcvWndScale
+}
+
+// nonZeroWindow is called when the receive window grows from zero to nonzero;
+// in such cases we may need to send an ack to indicate to our peer that it can
+// resume sending data. It does so unconditionally.
+func (r *receiver) nonZeroWindow() {
+	// Immediately send an ack through the endpoint's sender.
+	r.ep.snd.sendAck()
+}
+
+// consumeSegment attempts to consume a segment that was received by r. The
+// segment may have just been received or may have been received earlier but
+// wasn't ready to be consumed then.
+//
+// Returns true if the segment was consumed (advancing r.rcvNxt past it), false
+// if it cannot be consumed yet because of a missing segment.
+func (r *receiver) consumeSegment(s *segment, segSeq seqnum.Value, segLen seqnum.Size) bool {
+	if segLen > 0 {
+		// If the segment doesn't include the seqnum we're expecting to
+		// consume now, we're missing a segment. We cannot proceed until
+		// we receive that segment though.
+		if !r.rcvNxt.InWindow(segSeq, segLen) {
+			return false
+		}
+
+		// Trim segment to eliminate already acknowledged data.
+		if segSeq.LessThan(r.rcvNxt) {
+			diff := segSeq.Size(r.rcvNxt)
+			segLen -= diff
+			segSeq.UpdateForward(diff)
+			s.sequenceNumber.UpdateForward(diff)
+			s.data.TrimFront(int(diff))
+		}
+
+		// Move segment to ready-to-deliver list. Wakeup any waiters.
+		r.ep.readyToRead(s)
+
+	} else if segSeq != r.rcvNxt {
+		return false
+	}
+
+	// Update the segment that we're expecting to consume.
+	r.rcvNxt = segSeq.Add(segLen)
+
+	// In cases of a misbehaving sender which could send more than the
+	// advertised window, we could end up in a situation where we get a
+	// segment that exceeds the window advertised. Instead of partially
+	// accepting the segment and discarding bytes beyond the advertised
+	// window, we accept the whole segment and make sure r.rcvAcc is moved
+	// forward to match r.rcvNxt to indicate that the window is now closed.
+	//
+	// In absence of this check the r.acceptable() check fails and accepts
+	// segments that should be dropped because rcvWnd is calculated as
+	// the size of the interval (rcvNxt, rcvAcc] which becomes extremely
+	// large if rcvAcc is ever less than rcvNxt.
+	if r.rcvAcc.LessThan(r.rcvNxt) {
+		r.rcvAcc = r.rcvNxt
+	}
+
+	// Trim SACK Blocks to remove any SACK information that covers
+	// sequence numbers that have been consumed.
+	TrimSACKBlockList(&r.ep.sack, r.rcvNxt)
+
+	// Handle FIN or FIN-ACK.
+	if s.flagIsSet(header.TCPFlagFin) {
+		r.rcvNxt++
+
+		// Send ACK immediately.
+		r.ep.snd.sendAck()
+
+		// Tell any readers that no more data will come.
+		r.closed = true
+		r.ep.readyToRead(nil)
+
+		// We just received a FIN, our next state depends on whether we sent a
+		// FIN already or not.
+		switch r.ep.EndpointState() {
+		case StateEstablished:
+			r.ep.setEndpointState(StateCloseWait)
+		case StateFinWait1:
+			if s.flagIsSet(header.TCPFlagAck) {
+				// FIN-ACK, transition to TIME-WAIT.
+				r.ep.setEndpointState(StateTimeWait)
+			} else {
+				// Simultaneous close, expecting a final ACK.
+				r.ep.setEndpointState(StateClosing)
+			}
+		case StateFinWait2:
+			r.ep.setEndpointState(StateTimeWait)
+		}
+
+		// Flush out any pending segments, except the very first one if
+		// it happens to be the one we're handling now because the
+		// caller is using it.
+		first := 0
+		if len(r.pendingRcvdSegments) != 0 && r.pendingRcvdSegments[0] == s {
+			first = 1
+		}
+
+		for i := first; i < len(r.pendingRcvdSegments); i++ {
+			r.pendingRcvdSegments[i].decRef()
+			// Note that slice truncation does not allow garbage collection of
+			// truncated items, thus truncated items must be set to nil to avoid
+			// memory leaks.
+			r.pendingRcvdSegments[i] = nil
+		}
+		r.pendingRcvdSegments = r.pendingRcvdSegments[:first]
+
+		return true
+	}
+
+	// Handle ACK (not FIN-ACK, which we handled above) during one of the
+	// shutdown states.
+	if s.flagIsSet(header.TCPFlagAck) && s.ackNumber == r.ep.snd.sndNxt {
+		switch r.ep.EndpointState() {
+		case StateFinWait1:
+			r.ep.setEndpointState(StateFinWait2)
+			// Notify protocol goroutine that we have received an
+			// ACK to our FIN so that it can start the FIN_WAIT2
+			// timer to abort connection if the other side does
+			// not close within 2MSL.
+			r.ep.notifyProtocolGoroutine(notifyClose)
+		case StateClosing:
+			r.ep.setEndpointState(StateTimeWait)
+		case StateLastAck:
+			r.ep.transitionToStateCloseLocked()
+		}
+	}
+
+	return true
+}
+
+// updateRTT updates the receiver RTT measurement based on the sequence number
+// of the received segment.
+func (r *receiver) updateRTT() {
+	// From: https://public.lanl.gov/radiant/pubs/drs/sc2001-poster.pdf
+	//
+	// A system that is only transmitting acknowledgements can still
+	// estimate the round-trip time by observing the time between when a byte
+	// is first acknowledged and the receipt of data that is at least one
+	// window beyond the sequence number that was acknowledged.
+	r.ep.rcvListMu.Lock()
+	defer r.ep.rcvListMu.Unlock()
+
+	switch {
+	case r.ep.rcvAutoParams.rttMeasureTime.IsZero():
+		// No measurement is in progress yet; just start one below.
+	case r.rcvNxt.LessThan(r.ep.rcvAutoParams.rttMeasureSeqNumber):
+		// rcvNxt has not reached the sequence number being waited for.
+		return
+	default:
+		// We only store the minimum observed RTT here as this is only used in
+		// absence of a SRTT available from either timestamps or a sender
+		// measurement of RTT.
+		rtt := time.Since(r.ep.rcvAutoParams.rttMeasureTime)
+		if r.ep.rcvAutoParams.rtt == 0 || rtt < r.ep.rcvAutoParams.rtt {
+			r.ep.rcvAutoParams.rtt = rtt
+		}
+	}
+
+	// Begin the next measurement, targeting one window past rcvNxt.
+	r.ep.rcvAutoParams.rttMeasureTime = time.Now()
+	r.ep.rcvAutoParams.rttMeasureSeqNumber = r.rcvNxt.Add(r.rcvWnd)
+}
+
+// handleRcvdSegmentClosing runs the extra checks handleRcvdSegment needs when the endpoint is
+// not in StateEstablished. drop reports that the segment must not be processed further; err is
+// ErrConnectionAborted for segments that, per the comments below, should trigger a RST.
+func (r *receiver) handleRcvdSegmentClosing(s *segment, state EndpointState, closed bool) (drop bool, err *tcpip.Error) {
+	r.ep.rcvListMu.Lock()
+	rcvClosed := r.ep.rcvClosed || r.closed
+	r.ep.rcvListMu.Unlock()
+
+	// If we are in one of the shutdown states then we need to do
+	// additional checks before we try and process the segment.
+	switch state {
+	case StateCloseWait:
+		// If the ACK acks something not yet sent then we send an ACK.
+		if r.ep.snd.sndNxt.LessThan(s.ackNumber) {
+			r.ep.snd.sendAck()
+			return true, nil
+		}
+		fallthrough
+	case StateClosing, StateLastAck:
+		if !s.sequenceNumber.LessThanEq(r.rcvNxt) {
+			// Just drop the segment as we have already received a FIN
+			// and this segment is after the sequence number for the FIN.
+			return true, nil
+		}
+		fallthrough
+	case StateFinWait1:
+		fallthrough
+	case StateFinWait2:
+		// If we are closed for reads (either due to an incoming FIN or
+		// the user calling shutdown(.., SHUT_RD)) then any data past
+		// rcvNxt should trigger a RST.
+		endDataSeq := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
+		if state != StateCloseWait && rcvClosed && r.rcvNxt.LessThan(endDataSeq) {
+			return true, tcpip.ErrConnectionAborted
+		}
+		if state == StateFinWait1 {
+			break
+		}
+
+		// If it's a retransmission of an old data segment
+		// or a pure ACK then allow it.
+		if s.sequenceNumber.Add(s.logicalLen()).LessThanEq(r.rcvNxt) ||
+			s.logicalLen() == 0 {
+			break
+		}
+
+		// In FIN-WAIT2 if the socket is fully
+		// closed (not owned by the application on our
+		// end) then the only acceptable segment is a
+		// FIN. Since FIN can technically also carry
+		// data we verify that the segment carrying a
+		// FIN ends at exactly r.rcvNxt+1.
+		//
+		// From RFC793 page 25.
+		//
+		// For sequence number purposes, the SYN is
+		// considered to occur before the first actual
+		// data octet of the segment in which it occurs,
+		// while the FIN is considered to occur after
+		// the last actual data octet in a segment in
+		// which it occurs.
+		if closed && (!s.flagIsSet(header.TCPFlagFin) || s.sequenceNumber.Add(s.logicalLen()) != r.rcvNxt+1) {
+			return true, tcpip.ErrConnectionAborted
+		}
+	}
+
+	// We don't care about receive processing anymore if the receive side
+	// is closed.
+	//
+	// NOTE: We still want to permit a FIN as it's possible only our
+	// end has closed and the peer is yet to send a FIN. Hence we
+	// compare only the payload.
+	segEnd := s.sequenceNumber.Add(seqnum.Size(s.data.Size()))
+	if rcvClosed && !segEnd.LessThanEq(r.rcvNxt) {
+		return true, nil
+	}
+	return false, nil
+}
+
+// handleRcvdSegment handles TCP segments directed at the connection managed by
+// r as they arrive. It is called by the protocol main loop. drop is true when the
+// segment was rejected (unacceptable range, or refused by handleRcvdSegmentClosing).
+func (r *receiver) handleRcvdSegment(s *segment) (drop bool, err *tcpip.Error) {
+	state := r.ep.EndpointState()
+	closed := r.ep.closed
+
+	segLen := seqnum.Size(s.data.Size())
+	segSeq := s.sequenceNumber
+
+	// If the sequence number range is outside the acceptable range, just send
+	// an ACK and stop further processing of the segment (RFC 793, page 68).
+	if !r.acceptable(segSeq, segLen) {
+		r.ep.snd.sendAck()
+		return true, nil
+	}
+
+	if state != StateEstablished {
+		drop, err := r.handleRcvdSegmentClosing(s, state, closed)
+		if drop || err != nil {
+			return drop, err
+		}
+	}
+
+	// Store the time of the last ack.
+	r.lastRcvdAckTime = time.Now()
+
+	// Defer segment processing if it can't be consumed now.
+	if !r.consumeSegment(s, segSeq, segLen) {
+		if segLen > 0 || s.flagIsSet(header.TCPFlagFin) {
+			// We only store the segment if it's within our buffer
+			// size limit.
+			if r.pendingBufUsed < r.pendingBufSize {
+				r.pendingBufUsed += s.logicalLen()
+				s.incRef()
+				heap.Push(&r.pendingRcvdSegments, s)
+				UpdateSACKBlocks(&r.ep.sack, segSeq, segSeq.Add(segLen), r.rcvNxt)
+			}
+
+			// Immediately send an ack so that the peer knows it may
+			// have to retransmit.
+			r.ep.snd.sendAck()
+		}
+		return false, nil
+	}
+
+	// Since we consumed a segment update the receiver's RTT estimate
+	// if required.
+	if segLen > 0 {
+		r.updateRTT()
+	}
+
+	// By consuming the current segment, we may have filled a gap in the
+	// sequence number domain that allows pending segments to be consumed
+	// now. So try to do it.
+	for !r.closed && r.pendingRcvdSegments.Len() > 0 {
+		s := r.pendingRcvdSegments[0]
+		segLen := seqnum.Size(s.data.Size())
+		segSeq := s.sequenceNumber
+
+		// Skip segment altogether if it has already been acknowledged.
+		if !segSeq.Add(segLen-1).LessThan(r.rcvNxt) &&
+			!r.consumeSegment(s, segSeq, segLen) {
+			break
+		}
+
+		heap.Pop(&r.pendingRcvdSegments)
+		r.pendingBufUsed -= s.logicalLen()
+		s.decRef()
+	}
+	return false, nil
+}
+
+// handleTimeWaitSegment handles inbound segments received when the endpoint
+// has entered the TIME_WAIT state.
+func (r *receiver) handleTimeWaitSegment(s *segment) (resetTimeWait bool, newSyn bool) {
+	segSeq := s.sequenceNumber
+	segLen := seqnum.Size(s.data.Size())
+
+	// Just silently drop any RST packets in TIME_WAIT. We do not support
+	// TIME_WAIT assassination; as a result we conform with fix 1 as described
+	// in https://tools.ietf.org/html/rfc1337#section-3.
+	if s.flagIsSet(header.TCPFlagRst) {
+		return false, false
+	}
+
+	// If it's a SYN and the sequence number is higher than any seen before
+	// for this connection then try and redirect it to a listening endpoint
+	// if available.
+	//
+	// RFC 1122:
+	//   "When a connection is [...] on TIME-WAIT state [...]
+	//   [a TCP] MAY accept a new SYN from the remote TCP to
+	//   reopen the connection directly, if it:
+	//
+	//    (1) assigns its initial sequence number for the new
+	//     connection to be larger than the largest sequence
+	//     number it used on the previous connection incarnation,
+	//     and
+	//
+	//    (2) returns to TIME-WAIT state if the SYN turns out
+	//      to be an old duplicate".
+	if s.flagIsSet(header.TCPFlagSyn) && r.rcvNxt.LessThan(segSeq) {
+		// Report it through newSyn; nothing else is done with the segment here.
+		return false, true
+	}
+
+	// Drop the segment if it does not contain an ACK.
+	if !s.flagIsSet(header.TCPFlagAck) {
+		return false, false
+	}
+
+	// Update Timestamp if required. See RFC7323, section-4.3.
+	if r.ep.sendTSOk && s.parsedOptions.TS {
+		r.ep.updateRecentTimestamp(s.parsedOptions.TSVal, r.ep.snd.maxSentAck, segSeq)
+	}
+
+	if segSeq.Add(1) == r.rcvNxt && s.flagIsSet(header.TCPFlagFin) {
+		// If it's a FIN-ACK then resetTimeWait and send an ACK, as it
+		// indicates our final ACK could have been lost.
+		r.ep.snd.sendAck()
+		return true, false
+	}
+
+	// If the sequence number range is outside the acceptable range or
+	// carries data then just send an ACK. This is according to RFC 793,
+	// page 37.
+	//
+	// NOTE: In TIME_WAIT the only acceptable sequence number is rcvNxt.
+	if segSeq != r.rcvNxt || segLen != 0 {
+		r.ep.snd.sendAck()
+	}
+	return false, false
+}
diff --git a/pkg/tcpip/transport/tcp/rcv_state.go b/pkg/tcpip/transport/tcp/rcv_state.go
new file mode 100644
index 000000000..2bf21a2e7
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_state.go
@@ -0,0 +1,29 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+)
+
+// saveLastRcvdAckTime is invoked by stateify; nano is the sub-second part only.
+func (r *receiver) saveLastRcvdAckTime() unixTime {
+	return unixTime{r.lastRcvdAckTime.Unix(), int64(r.lastRcvdAckTime.Nanosecond())}
+}
+
+// loadLastRcvdAckTime is invoked by stateify to rebuild lastRcvdAckTime.
+func (r *receiver) loadLastRcvdAckTime(unix unixTime) {
+	r.lastRcvdAckTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/rcv_test.go b/pkg/tcpip/transport/tcp/rcv_test.go
new file mode 100644
index 000000000..8a026ec46
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/rcv_test.go
@@ -0,0 +1,74 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package rcv_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+func TestAcceptable(t *testing.T) {
+	for _, tt := range []struct {
+		segSeq         seqnum.Value
+		segLen         seqnum.Size
+		rcvNxt, rcvAcc seqnum.Value
+		want           bool
+	}{
+		// The segment (2 bytes) is smaller than the window (4 bytes).
+		{105, 2, 100, 104, false},
+		{105, 2, 101, 105, true},
+		{105, 2, 102, 106, true},
+		{105, 2, 103, 107, true},
+		{105, 2, 104, 108, true},
+		{105, 2, 105, 109, true},
+		{105, 2, 106, 110, true},
+		{105, 2, 107, 111, false},
+
+		// The segment (4 bytes) is larger than the window (2 bytes).
+		{105, 4, 103, 105, true},
+		{105, 4, 104, 106, true},
+		{105, 4, 105, 107, true},
+		{105, 4, 106, 108, true},
+		{105, 4, 107, 109, true},
+		{105, 4, 108, 110, true},
+		{105, 4, 109, 111, false},
+		{105, 4, 110, 112, false},
+
+		// The segment has no width (the window is 2 bytes).
+		{105, 0, 100, 102, false},
+		{105, 0, 101, 103, false},
+		{105, 0, 102, 104, false},
+		{105, 0, 103, 105, true},
+		{105, 0, 104, 106, true},
+		{105, 0, 105, 107, true},
+		{105, 0, 106, 108, false},
+		{105, 0, 107, 109, false},
+
+		// The receive window has no width (rcvNxt == rcvAcc).
+		{105, 2, 103, 103, false},
+		{105, 2, 104, 104, false},
+		{105, 2, 105, 105, false},
+		{105, 2, 106, 106, false},
+		{105, 2, 107, 107, false},
+		{105, 2, 108, 108, false},
+		{105, 2, 109, 109, false},
+	} {
+		if got := header.Acceptable(tt.segSeq, tt.segLen, tt.rcvNxt, tt.rcvAcc); got != tt.want {
+			t.Errorf("header.Acceptable(%d, %d, %d, %d) = %t, want %t", tt.segSeq, tt.segLen, tt.rcvNxt, tt.rcvAcc, got, tt.want)
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/reno.go b/pkg/tcpip/transport/tcp/reno.go
new file mode 100644
index 000000000..f83ebc717
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/reno.go
@@ -0,0 +1,103 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+// renoState holds the state of the TCP New Reno congestion control
+// algorithm: the sender whose congestion variables it updates.
+//
+// +stateify savable
+type renoState struct {
+	s *sender
+}
+
+// newRenoCC returns NewReno congestion control state operating on sender s.
+func newRenoCC(s *sender) *renoState {
+	return &renoState{s: s}
+}
+
+// updateSlowStart grows the congestion window according to NewReno's
+// slow-start rules, capping it at the slow-start threshold. It returns
+// how many of the acked packets were not used up and therefore have to
+// be consumed in congestion avoidance mode.
+func (r *renoState) updateSlowStart(packetsAcked int) int {
+	snd := r.s
+	// Cap the window at ssthresh, restarting the CA ack count on arrival.
+	cwnd := snd.sndCwnd + packetsAcked
+	if cwnd >= snd.sndSsthresh {
+		cwnd = snd.sndSsthresh
+		snd.sndCAAckCount = 0
+	}
+	// Whatever did not go into growing the window is handed back.
+	remaining := packetsAcked - (cwnd - snd.sndCwnd)
+	snd.sndCwnd = cwnd
+	return remaining
+}
+
+// updateCongestionAvoidance grows the congestion window while in congestion
+// avoidance mode, as described in RFC 5681 section 3.1.
+func (r *renoState) updateCongestionAvoidance(packetsAcked int) {
+	snd := r.s
+	snd.sndCAAckCount += packetsAcked
+	if snd.sndCAAckCount >= snd.sndCwnd {
+		snd.sndCwnd += snd.sndCAAckCount / snd.sndCwnd
+		snd.sndCAAckCount %= snd.sndCwnd // remainder against the enlarged window
+	}
+}
+
+// reduceSlowStartThreshold sets ssthresh to half of the outstanding packets
+// (but at least 2) per RFC 5681, page 6, eq. 4. It runs on detected congestion.
+func (r *renoState) reduceSlowStartThreshold() {
+	ssthresh := r.s.outstanding / 2
+	if ssthresh < 2 {
+		ssthresh = 2
+	}
+	r.s.sndSsthresh = ssthresh
+}
+
+// Update implements congestionControl.Update.
+//
+// It feeds the newly acknowledged packets to slow start first (when the window
+// is below ssthresh) and hands any leftover to congestion avoidance.
+func (r *renoState) Update(packetsAcked int) {
+	if r.s.sndCwnd < r.s.sndSsthresh {
+		if packetsAcked = r.updateSlowStart(packetsAcked); packetsAcked == 0 {
+			return
+		}
+	}
+	r.updateCongestionAvoidance(packetsAcked)
+}
+
+// HandleNDupAcks implements congestionControl.HandleNDupAcks.
+func (r *renoState) HandleNDupAcks() {
+	// Hitting nDupAckThreshold triggered a retransmit, so lower our
+	// slow start threshold.
+	r.reduceSlowStartThreshold()
+}
+
+// HandleRTOExpired implements congestionControl.HandleRTOExpired.
+func (r *renoState) HandleRTOExpired() {
+	// Re-enter slow-start with a congestion window of 1. Per RFC 5681,
+	// page 7, the window must be 1 no matter what the initial congestion
+	// window was.
+	r.s.sndCwnd = 1
+
+	// A packet was lost, so ssthresh has to come down as well.
+	r.reduceSlowStartThreshold()
+}
+
+// PostRecovery implements congestionControl.PostRecovery.
+func (r *renoState) PostRecovery() {
+	// Intentionally a no-op.
+}
diff --git a/pkg/tcpip/transport/tcp/sack.go b/pkg/tcpip/transport/tcp/sack.go
new file mode 100644
index 000000000..7be86d68e
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/sack.go
@@ -0,0 +1,105 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+const (
+	// MaxSACKBlocks is the maximum number of SACK blocks kept on the
+	// receiver side.
+	MaxSACKBlocks = 6
+)
+
+// UpdateSACKBlocks updates the list of SACK blocks to include the segment
+// specified by segStart->segEnd. If the segment happens to be an out of order
+// delivery then the first block in sack.Blocks always includes the
+// segment identified by segStart->segEnd.
+func UpdateSACKBlocks(sack *SACKInfo, segStart seqnum.Value, segEnd seqnum.Value, rcvNxt seqnum.Value) {
+	newSB := header.SACKBlock{Start: segStart, End: segEnd}
+
+	// Ignore empty or inverted blocks and blocks that end before rcvNxt,
+	// as those bytes have already been acked.
+	if newSB.End.LessThanEq(newSB.Start) || newSB.End.LessThan(rcvNxt) {
+		return
+	}
+
+	if sack.NumBlocks == 0 {
+		sack.Blocks[0] = newSB
+		sack.NumBlocks = 1
+		return
+	}
+	var n = 0
+	for i := 0; i < sack.NumBlocks; i++ {
+		start, end := sack.Blocks[i].Start, sack.Blocks[i].End
+		if end.LessThanEq(rcvNxt) {
+			// Discard any sack blocks that end at or before rcvNxt
+			// as those have already been acked.
+			continue
+		}
+		if newSB.Start.LessThanEq(end) && start.LessThanEq(newSB.End) {
+			// This block overlaps or touches newSB: widen newSB to
+			// cover it and drop the block itself.
+			if start.LessThan(newSB.Start) {
+				newSB.Start = start
+			}
+			if newSB.End.LessThan(end) {
+				newSB.End = end
+			}
+		} else {
+			// Keep this block, compacting it towards the front.
+			sack.Blocks[n] = sack.Blocks[i]
+			n++
+		}
+	}
+	if rcvNxt.LessThan(newSB.Start) {
+		// If this was an out of order segment then make sure that the
+		// first SACK block is the one that includes the segment.
+		//
+		// See the first bullet point in
+		// https://tools.ietf.org/html/rfc2018#section-4
+		if n == MaxSACKBlocks {
+			// If the number of SACK blocks is equal to
+			// MaxSACKBlocks then discard the last SACK block.
+			n--
+		}
+		for i := n - 1; i >= 0; i-- {
+			sack.Blocks[i+1] = sack.Blocks[i]
+		}
+		sack.Blocks[0] = newSB
+		n++
+	}
+	sack.NumBlocks = n
+}
+
+// TrimSACKBlockList drops every SACK block that ends at or before rcvNxt and
+// moves the start of any block that begins before rcvNxt up to rcvNxt.
+func TrimSACKBlockList(sack *SACKInfo, rcvNxt seqnum.Value) {
+	kept := 0
+	for i := 0; i < sack.NumBlocks; i++ {
+		blk := &sack.Blocks[i]
+		if blk.End.LessThanEq(rcvNxt) {
+			continue
+		}
+		if blk.Start.LessThan(rcvNxt) {
+			blk.Start = rcvNxt // shrink this SACK block
+		}
+		sack.Blocks[kept] = *blk
+		kept++
+	}
+	sack.NumBlocks = kept
+}
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard.go b/pkg/tcpip/transport/tcp/sack_scoreboard.go
new file mode 100644
index 000000000..7ef2df377
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard.go
@@ -0,0 +1,306 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/google/btree"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+const (
+	// maxSACKBlocks is the maximum number of distinct SACKBlocks the
+	// scoreboard will track. Once that many blocks are held, Insert
+	// returns without recording anything.
+	maxSACKBlocks = 100
+
+	// defaultBtreeDegree is set to 2 as btree.New(2) results in a 2-3-4
+	// tree.
+	defaultBtreeDegree = 2
+)
+
+// SACKScoreboard stores a set of disjoint SACK ranges.
+//
+// +stateify savable
+type SACKScoreboard struct {
+	// smss is defined in RFC 5681 as follows:
+	//
+	//    The SMSS is the size of the largest segment that the sender can
+	//    transmit.  This value can be based on the maximum transmission unit
+	//    of the network, the path MTU discovery [RFC1191, RFC4821] algorithm,
+	//    RMSS (see next item), or other factors.  The size does not include
+	//    the TCP/IP headers and options.
+	smss      uint16
+	maxSACKED seqnum.Value
+	sacked    seqnum.Size  `state:"nosave"`
+	ranges    *btree.BTree `state:"nosave"`
+}
+
+// NewSACKScoreboard returns an empty scoreboard with maxSACKED set to iss.
+func NewSACKScoreboard(smss uint16, iss seqnum.Value) *SACKScoreboard {
+	return &SACKScoreboard{
+		smss:      smss,
+		ranges:    btree.New(defaultBtreeDegree),
+		maxSACKED: iss,
+	}
+}
+
+// Reset drops all ranges and the sacked byte count; maxSACKED is kept.
+func (s *SACKScoreboard) Reset() {
+	s.sacked = 0
+	s.ranges = btree.New(defaultBtreeDegree)
+}
+
+// Insert merges r into the scoreboard; ignored once maxSACKBlocks ranges exist.
+func (s *SACKScoreboard) Insert(r header.SACKBlock) {
+	if s.ranges.Len() >= maxSACKBlocks {
+		return
+	}
+
+	// Check if we can merge the new range with a range before or after it.
+	var toDelete []btree.Item
+	if s.maxSACKED.LessThan(r.End - 1) {
+		s.maxSACKED = r.End - 1
+	}
+	s.ranges.AscendGreaterOrEqual(r, func(i btree.Item) bool {
+		if i == r {
+			return true
+		}
+		sacked := i.(header.SACKBlock)
+		// There is a hole between these two SACK blocks, so we can't
+		// merge anymore.
+		if r.End.LessThan(sacked.Start) {
+			return false
+		}
+		// There is some overlap at this point, merge the blocks and
+		// delete the other one.
+		//
+		// ----sS--------sE
+		// r.S---------------rE
+		//               -------sE
+		if sacked.End.LessThan(r.End) {
+			// sacked is contained in the newly inserted range.
+			// Delete this block.
+			toDelete = append(toDelete, i)
+			return true
+		}
+		// sacked covers a range past the end of the newly inserted
+		// block, so extend r to sacked's end.
+		r.End = sacked.End
+		toDelete = append(toDelete, i)
+		return true
+	})
+
+	s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool {
+		if i == r {
+			return true
+		}
+		sacked := i.(header.SACKBlock)
+		// sA------sE
+		//            rA----rE
+		if sacked.End.LessThan(r.Start) {
+			return false
+		}
+		// The previous range extends into the current block. Merge it
+		// into the newly inserted range and delete the other one.
+		//
+		//   <-rA---rE----<---rE--->
+		// sA--------------sE
+		r.Start = sacked.Start
+		// Extend r to cover sacked if sacked extends past r.
+		if r.End.LessThan(sacked.End) {
+			r.End = sacked.End
+		}
+		toDelete = append(toDelete, i)
+		return true
+	})
+	for _, i := range toDelete {
+		if sb := s.ranges.Delete(i); sb != nil {
+			sb := i.(header.SACKBlock)
+			s.sacked -= sb.Start.Size(sb.End)
+		}
+	}
+
+	replaced := s.ranges.ReplaceOrInsert(r)
+	if replaced == nil {
+		s.sacked += r.Start.Size(r.End)
+	}
+}
+
+// IsSACKED reports whether the whole range of sequence numbers denoted by r
+// is already covered by SACK information in the scoreboard.
+func (s *SACKScoreboard) IsSACKED(r header.SACKBlock) bool {
+	if s.Empty() {
+		return false
+	}
+
+	covered := false
+	s.ranges.DescendLessOrEqual(r, func(item btree.Item) bool {
+		blk := item.(header.SACKBlock)
+		switch {
+		case blk.End.LessThan(r.Start):
+			return false
+		case blk.Contains(r):
+			covered = true
+			return false
+		}
+		return true
+	})
+	return covered
+}
+
+// String returns a human-readable dump of the ranges in the scoreboard.
+func (s *SACKScoreboard) String() string {
+	var str strings.Builder
+	str.WriteString("SACKScoreboard: {")
+	s.ranges.Ascend(func(i btree.Item) bool {
+		fmt.Fprintf(&str, "%v,", i) // format straight into the builder
+		return true
+	})
+	str.WriteString("}\n")
+	return str.String()
+}
+
+// Delete removes all SACK information prior to seq.
+func (s *SACKScoreboard) Delete(seq seqnum.Value) {
+	if s.Empty() {
+		return
+	}
+	toDelete := []btree.Item{}
+	toInsert := []btree.Item{}
+	r := header.SACKBlock{seq, seq.Add(1)}
+	s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool {
+		if i == r {
+			return true
+		}
+		sb := i.(header.SACKBlock)
+		toDelete = append(toDelete, i)
+		if sb.End.LessThanEq(seq) {
+			s.sacked -= sb.Start.Size(sb.End)
+		} else {
+			newSB := header.SACKBlock{seq, sb.End}
+			toInsert = append(toInsert, newSB)
+			s.sacked -= sb.Start.Size(seq)
+		}
+		return true
+	})
+	for _, sb := range toDelete {
+		s.ranges.Delete(sb)
+	}
+	for _, sb := range toInsert {
+		s.ranges.ReplaceOrInsert(sb)
+	}
+}
+
+// Copy returns the scoreboard's blocks (in Ascend order) and its maxSACKED.
+func (s *SACKScoreboard) Copy() (sackBlocks []header.SACKBlock, maxSACKED seqnum.Value) {
+	s.ranges.Ascend(func(i btree.Item) bool {
+		sackBlocks = append(sackBlocks, i.(header.SACKBlock))
+		return true
+	})
+	return sackBlocks, s.maxSACKED
+}
+
+// IsRangeLost implements the IsLost(SeqNum) operation defined in RFC 6675
+// section 4 for a range of sequence numbers: it returns true if at least
+// nDupAckThreshold SACK blocks, or at least (nDupAckThreshold-1)*s.smss
+// SACKED bytes, have sequence numbers greater than the range being checked.
+func (s *SACKScoreboard) IsRangeLost(r header.SACKBlock) bool {
+	if s.Empty() {
+		return false
+	}
+	nDupSACK := 0
+	nDupSACKBytes := seqnum.Size(0)
+	isLost := false
+
+	// We need to check if the immediate lower (if any) sacked
+	// range contains or partially overlaps with r.
+	searchMore := true
+	s.ranges.DescendLessOrEqual(r, func(i btree.Item) bool {
+		sacked := i.(header.SACKBlock)
+		if sacked.Contains(r) {
+			searchMore = false
+			return false
+		}
+		if sacked.End.LessThanEq(r.Start) {
+			// All sequence numbers covered by sacked are below
+			// r, so there is nothing to adjust here.
+			return false
+		}
+		// There is a partial overlap. In this case r.Start is
+		// between sacked.Start & sacked.End and r.End extends beyond
+		// sacked.End.
+		// Move r.Start to sacked.End and continue searching blocks
+		// above r.Start.
+		r.Start = sacked.End
+		return false
+	})
+
+	if !searchMore {
+		return isLost
+	}
+
+	s.ranges.AscendGreaterOrEqual(r, func(i btree.Item) bool {
+		sacked := i.(header.SACKBlock)
+		if sacked.Contains(r) {
+			return false
+		}
+		nDupSACKBytes += sacked.Start.Size(sacked.End)
+		nDupSACK++
+		// Widen to seqnum.Size first: in uint16 the product can wrap around.
+		if nDupSACK >= nDupAckThreshold || nDupSACKBytes >= seqnum.Size(nDupAckThreshold-1)*seqnum.Size(s.smss) {
+			isLost = true
+			return false
+		}
+		return true
+	})
+	return isLost
+}
+
+// IsLost implements the IsLost(SeqNum) operation defined in RFC 6675 section
+// 4 (the successor of RFC 3517).
+//
+// This routine returns whether the given sequence number is considered to be
+// lost. It delegates to IsRangeLost with the one-byte range starting at seq,
+// which reports true when either nDupAckThreshold SACKed blocks have arrived
+// above seq or at least (nDupAckThreshold-1) * SMSS bytes with sequence
+// numbers greater than seq have been SACKed. Otherwise it returns false.
+func (s *SACKScoreboard) IsLost(seq seqnum.Value) bool {
+	return s.IsRangeLost(header.SACKBlock{seq, seq.Add(1)})
+}
+
+// Empty reports whether the SACK scoreboard holds no ranges at all.
+func (s *SACKScoreboard) Empty() bool {
+	return s.ranges.Len() == 0
+}
+
+// Sacked returns how many bytes the SACK scoreboard currently holds.
+func (s *SACKScoreboard) Sacked() seqnum.Size {
+	return s.sacked
+}
+
+// MaxSACKED returns the highest sequence number recorded by Insert so far,
+// or the iss the scoreboard was created with if nothing higher was seen.
+func (s *SACKScoreboard) MaxSACKED() seqnum.Value {
+	return s.maxSACKED
+}
+
+// SMSS returns the sender's maximum segment size held by the scoreboard.
+func (s *SACKScoreboard) SMSS() uint16 {
+	return s.smss
+}
diff --git a/pkg/tcpip/transport/tcp/sack_scoreboard_test.go b/pkg/tcpip/transport/tcp/sack_scoreboard_test.go
new file mode 100644
index 000000000..b4e5ba0df
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/sack_scoreboard_test.go
@@ -0,0 +1,249 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+)
+
+const smss = 1500
+
+func initScoreboard(blocks []header.SACKBlock, iss seqnum.Value) *tcp.SACKScoreboard {
+	board := tcp.NewSACKScoreboard(smss, iss)
+	for i := range blocks {
+		board.Insert(blocks[i]) // inserted in the order given
+	}
+	return board
+}
+
+func TestSACKScoreboardIsSACKED(t *testing.T) {
+	type blockTest struct {
+		block  header.SACKBlock
+		sacked bool
+	}
+	tests := []struct {
+		comment          string
+		scoreboardBlocks []header.SACKBlock
+		blockTests       []blockTest
+		iss              seqnum.Value
+	}{
+		{
+			"Test holes and unsacked SACK blocks in SACKed ranges and insertion of overlapping SACK blocks",
+			[]header.SACKBlock{{10, 20}, {10, 30}, {30, 40}, {41, 50}, {5, 10}, {1, 50}, {111, 120}, {101, 110}, {52, 120}},
+			[]blockTest{
+				{header.SACKBlock{15, 21}, true},
+				{header.SACKBlock{200, 201}, false},
+				{header.SACKBlock{50, 51}, false},
+				{header.SACKBlock{53, 120}, true},
+			},
+			0,
+		},
+		{
+			"Test disjoint SACKBlocks",
+			[]header.SACKBlock{{2288624809, 2288810057}, {2288811477, 2288838565}},
+			[]blockTest{
+				{header.SACKBlock{2288624809, 2288810057}, true},
+				{header.SACKBlock{2288811477, 2288838565}, true},
+				{header.SACKBlock{2288810057, 2288811477}, false},
+			},
+			2288624809,
+		},
+		{
+			"Test sequence number wrap around",
+			[]header.SACKBlock{{4294254144, 225652}, {5340409, 5350509}},
+			[]blockTest{
+				{header.SACKBlock{4294254144, 4294254145}, true},
+				{header.SACKBlock{4294254143, 4294254144}, false},
+				{header.SACKBlock{4294254144, 1}, true},
+				{header.SACKBlock{225652, 5350509}, false},
+				{header.SACKBlock{5340409, 5350509}, true},
+				{header.SACKBlock{5350509, 5350609}, false},
+			},
+			4294254144,
+		},
+		{
+			"Test disjoint SACKBlocks out of order",
+			[]header.SACKBlock{{827450276, 827454536}, {827426028, 827428868}},
+			[]blockTest{
+				{header.SACKBlock{827426028, 827428867}, true},
+				{header.SACKBlock{827450168, 827450275}, false},
+			},
+			827426000,
+		},
+	}
+	for _, tt := range tests {
+		board := initScoreboard(tt.scoreboardBlocks, tt.iss)
+		for _, bt := range tt.blockTests {
+			if got, want := board.IsSACKED(bt.block), bt.sacked; got != want {
+				t.Errorf("%s: s.IsSACKED(%v) = %v, want %v", tt.comment, bt.block, got, want)
+			}
+		}
+	}
+}
+
+func TestSACKScoreboardIsRangeLost(t *testing.T) {
+	s := tcp.NewSACKScoreboard(10, 0)
+	s.Insert(header.SACKBlock{1, 25})
+	s.Insert(header.SACKBlock{25, 50})
+	s.Insert(header.SACKBlock{51, 100})
+	s.Insert(header.SACKBlock{111, 120})
+	s.Insert(header.SACKBlock{101, 110})
+	s.Insert(header.SACKBlock{121, 141})
+	s.Insert(header.SACKBlock{145, 146})
+	s.Insert(header.SACKBlock{147, 148})
+	s.Insert(header.SACKBlock{149, 150})
+	s.Insert(header.SACKBlock{153, 154})
+	s.Insert(header.SACKBlock{155, 156})
+	testCases := []struct {
+		block header.SACKBlock
+		lost  bool
+	}{
+		// Block is not covered by any SACK block and has more than
+		// nDupAckThreshold discontiguous SACK blocks after it as well
+		// as (nDupAckThreshold-1) * 10 (smss) bytes that have been
+		// SACKED above the sequence number covered by this block.
+		{block: header.SACKBlock{0, 1}, lost: true},
+
+		// These blocks have all been SACKed and should not be
+		// considered lost.
+		{block: header.SACKBlock{1, 2}, lost: false},
+		{block: header.SACKBlock{25, 26}, lost: false},
+		{block: header.SACKBlock{1, 45}, lost: false},
+
+		// Same as the first case above.
+		{block: header.SACKBlock{50, 51}, lost: true},
+
+		// This block has been SACKed and should not be considered lost.
+		{block: header.SACKBlock{119, 120}, lost: false},
+
+		// This one should return true because there are >
+		// (nDupAckThreshold-1) * 10 (smss) bytes that have been
+		// sacked above this sequence number.
+		{block: header.SACKBlock{120, 121}, lost: true},
+
+		// This block has been SACKed and should not be considered lost.
+		{block: header.SACKBlock{125, 126}, lost: false},
+
+		// This block has not been SACKed and there are nDupAckThreshold
+		// number of SACKed blocks after it.
+		{block: header.SACKBlock{141, 145}, lost: true},
+
+		// This block has not been SACKed and there are fewer than
+		// nDupAckThreshold SACKed sequences after it.
+		{block: header.SACKBlock{151, 152}, lost: false},
+	}
+	for _, tc := range testCases {
+		if want, got := tc.lost, s.IsRangeLost(tc.block); got != want {
+			t.Errorf("s.IsRangeLost(%v) = %v, want %v", tc.block, got, want)
+		}
+	}
+}
+
+func TestSACKScoreboardIsLost(t *testing.T) {
+	s := tcp.NewSACKScoreboard(10, 0)
+	s.Insert(header.SACKBlock{1, 25})
+	s.Insert(header.SACKBlock{25, 50})
+	s.Insert(header.SACKBlock{51, 100})
+	s.Insert(header.SACKBlock{111, 120})
+	s.Insert(header.SACKBlock{101, 110})
+	s.Insert(header.SACKBlock{121, 141})
+	s.Insert(header.SACKBlock{121, 141})
+	s.Insert(header.SACKBlock{145, 146})
+	s.Insert(header.SACKBlock{147, 148})
+	s.Insert(header.SACKBlock{149, 150})
+	s.Insert(header.SACKBlock{153, 154})
+	s.Insert(header.SACKBlock{155, 156})
+	testCases := []struct {
+		seq  seqnum.Value
+		lost bool
+	}{
+		// Sequence number is not covered by any SACK block and has more
+		// than nDupAckThreshold discontiguous SACK blocks after it as
+		// well as (nDupAckThreshold-1) * 10 (smss) bytes that have been
+		// SACKED above the sequence number.
+		{seq: 0, lost: true},
+
+		// These sequence numbers have all been SACKed and should not be
+		// considered lost.
+		{seq: 1, lost: false},
+		{seq: 25, lost: false},
+		{seq: 45, lost: false},
+
+		// Same as first case above.
+		{seq: 50, lost: true},
+
+		// This sequence number has been SACKed and should not be lost.
+		{seq: 119, lost: false},
+
+		// This one should return true because there are >
+		// (nDupAckThreshold-1) * 10 (smss) bytes that have been
+		// sacked above this sequence number.
+		{seq: 120, lost: true},
+
+		// This sequence number has been SACKed and should not be
+		// considered lost.
+		{seq: 125, lost: false},
+
+		// This sequence number has not been SACKed and there are
+		// nDupAckThreshold number of SACKed blocks after it.
+		{seq: 141, lost: true},
+
+		// This sequence number has not been SACKed and there are fewer
+		// than nDupAckThreshold SACKed sequences after it.
+		{seq: 151, lost: false},
+	}
+	for _, tc := range testCases {
+		if want, got := tc.lost, s.IsLost(tc.seq); got != want {
+			t.Errorf("s.IsLost(%v) = %v, want %v", tc.seq, got, want)
+		}
+	}
+}
+
+func TestSACKScoreboardDelete(t *testing.T) {
+	blocks := []header.SACKBlock{{4294254144, 225652}, {5340409, 5350509}}
+	s := initScoreboard(blocks, 4294254143)
+	s.Delete(5340408) // everything below the second block goes away
+	if s.Empty() {
+		t.Fatal("s.Empty() = true, want false")
+	}
+	if got, want := s.Sacked(), blocks[1].Start.Size(blocks[1].End); got != want {
+		t.Fatalf("incorrect sacked bytes in scoreboard got: %v, want: %v", got, want)
+	}
+	s.Delete(5340410) // trims the head of the remaining block
+	if s.Empty() {
+		t.Fatal("s.Empty() = true, want false")
+	}
+	newSB := header.SACKBlock{5340410, 5350509}
+	if !s.IsSACKED(newSB) {
+		t.Fatalf("s.IsSACKED(%v) = false, want true, scoreboard: %v", newSB, s)
+	}
+	s.Delete(5350509)
+	lastOctet := header.SACKBlock{5350508, 5350509}
+	if s.IsSACKED(lastOctet) { // the last octet must be gone by now
+		t.Fatalf("s.IsSACKED(%v) = true, want false", lastOctet)
+	}
+
+	s.Delete(5350510)
+	if !s.Empty() {
+		t.Fatal("s.Empty() = false, want true")
+	}
+	if got, want := s.Sacked(), seqnum.Size(0); got != want {
+		t.Fatalf("incorrect sacked bytes in scoreboard got: %v, want: %v", got, want)
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/segment.go b/pkg/tcpip/transport/tcp/segment.go
new file mode 100644
index 000000000..0280892a8
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/segment.go
@@ -0,0 +1,194 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"sync/atomic"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// segment represents a TCP segment. It holds the payload and parsed TCP segment
+// information, and can be added to intrusive lists.
+// segment is mostly immutable; the only field allowed to change is viewToDeliver.
+//
+// +stateify savable
+type segment struct {
+	segmentEntry
+	refCnt int32
+	id     stack.TransportEndpointID `state:"manual"`
+	route  stack.Route               `state:"manual"`
+	data   buffer.VectorisedView     `state:".(buffer.VectorisedView)"`
+	hdr    header.TCP
+	// views is the backing array passed as the buffer for data when a
+	// segment is created or cloned.
+	views [8]buffer.View `state:"nosave"`
+	// viewToDeliver keeps track of the next View that should be
+	// delivered by the Read endpoint.
+	viewToDeliver  int
+	sequenceNumber seqnum.Value
+	ackNumber      seqnum.Value
+	flags          uint8
+	window         seqnum.Size
+	// csum is only populated for received segments.
+	csum uint16
+	// csumValid is true if the csum in the received segment is valid.
+	csumValid bool
+
+	// parsedOptions stores the parsed values from the options in the segment.
+	parsedOptions  header.TCPOptions
+	options        []byte `state:".([]byte)"`
+	hasNewSACKInfo bool
+	rcvdTime       time.Time `state:".(unixTime)"`
+	// xmitTime is the last transmit time of this segment.
+	xmitTime  time.Time `state:".(unixTime)"`
+	xmitCount uint32
+}
+
+func newSegment(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) *segment {
+	seg := &segment{
+		refCnt: 1,
+		id:     id,
+		route:  r.Clone(),
+		hdr:    header.TCP(pkt.TransportHeader),
+	}
+	seg.data = pkt.Data.Clone(seg.views[:]) // seg.views backs the cloned payload
+	seg.rcvdTime = time.Now()
+	return seg
+}
+
+func newSegmentFromView(r *stack.Route, id stack.TransportEndpointID, v buffer.View) *segment {
+	seg := &segment{
+		refCnt:   1,
+		id:       id,
+		route:    r.Clone(),
+		rcvdTime: time.Now(),
+	}
+	if len(v) > 0 { // an empty view leaves seg.data at its zero value
+		seg.views[0] = v
+		seg.data = buffer.NewVectorisedView(len(v), seg.views[:1])
+	}
+	return seg
+}
+
+func (s *segment) clone() *segment {
+	c := &segment{
+		refCnt:         1, // the copy starts with its own single reference
+		id:             s.id,
+		route:          s.route.Clone(),
+		sequenceNumber: s.sequenceNumber,
+		ackNumber:      s.ackNumber,
+		flags:          s.flags,
+		window:         s.window,
+		viewToDeliver:  s.viewToDeliver,
+		rcvdTime:       s.rcvdTime,
+		xmitTime:       s.xmitTime,
+		xmitCount:      s.xmitCount,
+	}
+	c.data = s.data.Clone(c.views[:])
+	return c
+}
+
+// flagIsSet reports whether at least one of the bits in flags is set in s.flags.
+func (s *segment) flagIsSet(flags uint8) bool {
+	return s.flags&flags != 0
+}
+
+// flagsAreSet reports whether every bit in flags is set in s.flags.
+func (s *segment) flagsAreSet(flags uint8) bool {
+	return s.flags&flags == flags
+}
+
+func (s *segment) decRef() {
+	if remaining := atomic.AddInt32(&s.refCnt, -1); remaining == 0 {
+		s.route.Release() // last reference dropped
+	}
+}
+
+func (s *segment) incRef() {
+	atomic.AddInt32(&s.refCnt, 1)
+}
+
+// logicalLen returns the length the segment occupies in the sequence number
+// space: its data length plus one for each of the SYN and FIN bits set.
+func (s *segment) logicalLen() seqnum.Size {
+	n := seqnum.Size(s.data.Size())
+	if s.flags&header.TCPFlagSyn != 0 {
+		n++
+	}
+	if s.flags&header.TCPFlagFin != 0 {
+		n++
+	}
+	return n
+}
+
+// parse populates the sequence & ack numbers, flags, window and options
+// fields of the segment from the TCP header stored in s.hdr; the payload in
+// s.data is not modified.
+//
+// Returns boolean indicating if the parsing was successful.
+//
+// If checksum verification is not offloaded then parse also verifies the
+// TCP checksum and stores the checksum and result of checksum verification in
+// the csum and csumValid fields of the segment.
+func (s *segment) parse() bool {
+	// s.hdr is the TCP header of the segment. We check that the offset to
+	// the data respects the following constraints:
+	// 1. That it's at least the minimum header size; if we don't do this
+	//    then part of the header would be delivered to user.
+	// 2. That the header fits within the buffer; if we don't do this, we
+	//    would panic when we tried to access data beyond the buffer.
+	//
+	// N.B. The segment has already been validated as having at least the
+	//      minimum TCP size before reaching here, so it's safe to read the
+	//      fields.
+	offset := int(s.hdr.DataOffset())
+	if offset < header.TCPMinimumSize || offset > len(s.hdr) {
+		return false
+	}
+
+	s.options = []byte(s.hdr[header.TCPMinimumSize:])
+	s.parsedOptions = header.ParseTCPOptions(s.options)
+
+	// Query the link capabilities to decide if checksum validation is
+	// required; with RX checksum offload the checksum is trusted as valid.
+	verifyChecksum := true
+	if s.route.Capabilities()&stack.CapabilityRXChecksumOffload != 0 {
+		s.csumValid = true
+		verifyChecksum = false
+	}
+	if verifyChecksum {
+		s.csum = s.hdr.Checksum()
+		xsum := s.route.PseudoHeaderChecksum(ProtocolNumber, uint16(s.data.Size()+len(s.hdr)))
+		xsum = s.hdr.CalculateChecksum(xsum)
+		xsum = header.ChecksumVV(s.data, xsum)
+		s.csumValid = xsum == 0xffff
+	}
+
+	s.sequenceNumber = seqnum.Value(s.hdr.SequenceNumber())
+	s.ackNumber = seqnum.Value(s.hdr.AckNumber())
+	s.flags = s.hdr.Flags()
+	s.window = seqnum.Size(s.hdr.WindowSize())
+	return true
+}
+
+// sackBlock returns the SACKBlock spanning this segment's logical length.
+func (s *segment) sackBlock() header.SACKBlock {
+	return header.SACKBlock{s.sequenceNumber, s.sequenceNumber.Add(s.logicalLen())}
+}
diff --git a/pkg/tcpip/transport/tcp/segment_heap.go b/pkg/tcpip/transport/tcp/segment_heap.go
new file mode 100644
index 000000000..8d3ddce4b
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/segment_heap.go
@@ -0,0 +1,51 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import "container/heap"
+
+// segmentHeap is a slice of segments that implements heap.Interface. Its Less
+// method compares segments by sequence number.
+type segmentHeap []*segment
+
+// Compile-time check that *segmentHeap satisfies heap.Interface.
+var _ heap.Interface = (*segmentHeap)(nil)
+
+// Len reports how many segments h currently holds.
+func (h *segmentHeap) Len() int {
+	segs := *h
+	return len(segs)
+}
+
+// Less reports whether the segment at index i has a sequence number that is
+// less than that of the segment at index j.
+func (h *segmentHeap) Less(i, j int) bool {
+	segs := *h
+	a, b := segs[i], segs[j]
+	return a.sequenceNumber.LessThan(b.sequenceNumber)
+}
+
+// Swap exchanges the segments stored at indices i and j of h.
+func (h *segmentHeap) Swap(i, j int) {
+	segs := *h
+	segs[i], segs[j] = segs[j], segs[i]
+}
+
+// Push appends x, which must be a *segment, to the end of h. It panics if x
+// has any other dynamic type.
+func (h *segmentHeap) Push(x interface{}) {
+	seg := x.(*segment)
+	*h = append(*h, seg)
+}
+
+// Pop detaches the last element of h and returns it. The vacated slot is set
+// to nil so the backing array no longer references the removed segment.
+func (h *segmentHeap) Pop() interface{} {
+	segs := *h
+	last := len(segs) - 1
+	top := segs[last]
+	segs[last] = nil
+	*h = segs[:last]
+	return top
+}
diff --git a/pkg/tcpip/transport/tcp/segment_queue.go b/pkg/tcpip/transport/tcp/segment_queue.go
new file mode 100644
index 000000000..48a257137
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/segment_queue.go
@@ -0,0 +1,85 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// segmentQueue is a bounded, thread-safe queue of TCP segments.
+//
+// mu serializes access to the other fields. list holds the queued segments in
+// FIFO order (enqueue pushes to the back, dequeue removes from the front),
+// limit is the number of segments at which enqueue starts rejecting new ones
+// and used is the number of segments currently queued.
+//
+// +stateify savable
+type segmentQueue struct {
+	mu    sync.Mutex  `state:"nosave"`
+	list  segmentList `state:"wait"`
+	limit int
+	used  int
+}
+
+// emptyLocked reports whether the queue currently holds no segments.
+//
+// Preconditions: q.mu must be held.
+func (q *segmentQueue) emptyLocked() bool {
+	if q.used != 0 {
+		return false
+	}
+	return true
+}
+
+// empty reports whether the queue currently holds no segments. It takes q.mu
+// for the duration of the check.
+func (q *segmentQueue) empty() bool {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	return q.emptyLocked()
+}
+
+// setLimit replaces the queue's limit. Segments that are already queued stay
+// queued even if the new limit is lower than the current occupancy.
+func (q *segmentQueue) setLimit(limit int) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	q.limit = limit
+}
+
+// enqueue appends s to the back of the queue if there is room for it.
+//
+// It returns true when the segment was queued, in which case ownership of the
+// reference passes to the queue. It returns false when the queue already
+// holds limit (or more) segments, in which case the caller keeps ownership.
+func (q *segmentQueue) enqueue(s *segment) bool {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	if q.used >= q.limit {
+		return false
+	}
+	q.list.PushBack(s)
+	q.used++
+	return true
+}
+
+// dequeue detaches the segment at the front of the queue and returns it, or
+// returns nil when the queue is empty. Ownership passes to the caller, who is
+// responsible for decrementing the ref count when done.
+func (q *segmentQueue) dequeue() *segment {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	head := q.list.Front()
+	if head == nil {
+		return nil
+	}
+	q.list.Remove(head)
+	q.used--
+	return head
+}
diff --git a/pkg/tcpip/transport/tcp/segment_state.go b/pkg/tcpip/transport/tcp/segment_state.go
new file mode 100644
index 000000000..7dc2741a6
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/segment_state.go
@@ -0,0 +1,82 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+)
+
+// saveData is invoked by stateify. It returns a VectorisedView with the same
+// size and views as s.data but backed by a freshly allocated view slice.
+func (s *segment) saveData() buffer.VectorisedView {
+	// s.data cannot be saved as is: its view slice may alias s.views, and
+	// the state framework does not allow in-struct pointers.
+	src := s.data.Views()
+	out := make([]buffer.View, len(src))
+	delivered := s.viewToDeliver
+
+	// Views that were already delivered may have been sliced and saved
+	// elsewhere (e.g., readViews), so their bytes are copied.
+	for i := 0; i < delivered; i++ {
+		out[i] = append([]byte(nil), src[i]...)
+	}
+	// The remaining views are saved as they are.
+	for i := delivered; i < len(out); i++ {
+		out[i] = src[i]
+	}
+	return buffer.NewVectorisedView(s.data.Size(), out)
+}
+
+// loadData is invoked by stateify. It installs the restored payload as
+// s.data without copying it.
+func (s *segment) loadData(data buffer.VectorisedView) {
+	// NOTE: We cannot do the s.data = data.Clone(s.views[:]) optimization
+	// here because data.views is not guaranteed to be loaded by now. Plus,
+	// data.views will be allocated anyway, so there is little point in
+	// reusing s.views for data.views.
+	s.data = data
+}
+
+// saveOptions is invoked by stateify. It returns a copy of s.options with the
+// same length and capacity.
+func (s *segment) saveOptions() []byte {
+	// s.options cannot be saved as is: it may point to s.data's trimmed
+	// tail, and the state framework does not allow in-struct pointers.
+	saved := make([]byte, len(s.options), cap(s.options))
+	copy(saved, s.options)
+	return saved
+}
+
+// loadOptions is invoked by stateify. It installs the restored option bytes
+// as s.options without copying them.
+func (s *segment) loadOptions(options []byte) {
+	// NOTE: We cannot point s.options back into s.data's trimmed tail. But
+	// that is OK as they do not need to be aliased. Plus, options is
+	// already allocated so there is no cost here.
+	s.options = options
+}
+
+// saveRcvdTime is invoked by stateify.
+//
+// It encodes s.rcvdTime as whole seconds since the Unix epoch plus the
+// nanosecond offset within that second. That is the (sec, nsec) pair that
+// loadRcvdTime hands to time.Unix. Storing UnixNano() in the nano field would
+// make time.Unix add the entire timestamp a second time on load.
+func (s *segment) saveRcvdTime() unixTime {
+	return unixTime{s.rcvdTime.Unix(), int64(s.rcvdTime.Nanosecond())}
+}
+
+// loadRcvdTime is invoked by stateify. It rebuilds s.rcvdTime by passing the
+// saved second and nano fields to time.Unix.
+func (s *segment) loadRcvdTime(unix unixTime) {
+	sec, nsec := unix.second, unix.nano
+	s.rcvdTime = time.Unix(sec, nsec)
+}
+
+// saveXmitTime is invoked by stateify.
+//
+// It encodes s.xmitTime (not s.rcvdTime) as whole seconds since the Unix
+// epoch plus the nanosecond offset within that second, which is the
+// (sec, nsec) pair loadXmitTime hands to time.Unix. Storing UnixNano() in the
+// nano field would make time.Unix add the entire timestamp a second time.
+func (s *segment) saveXmitTime() unixTime {
+	return unixTime{s.xmitTime.Unix(), int64(s.xmitTime.Nanosecond())}
+}
+
+// loadXmitTime is invoked by stateify. It restores s.xmitTime from the saved
+// second and nano fields, leaving s.rcvdTime to loadRcvdTime.
+func (s *segment) loadXmitTime(unix unixTime) {
+	s.xmitTime = time.Unix(unix.second, unix.nano)
+}
diff --git a/pkg/tcpip/transport/tcp/snd.go b/pkg/tcpip/transport/tcp/snd.go
new file mode 100644
index 000000000..5862c32f2
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/snd.go
@@ -0,0 +1,1487 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"fmt"
+	"math"
+	"sync/atomic"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+const (
+	// MinRTO is the minimum allowed value for the retransmit timeout.
+	MinRTO = 200 * time.Millisecond
+
+	// MaxRTO is the maximum allowed value for the retransmit timeout.
+	MaxRTO = 120 * time.Second
+
+	// InitialCwnd is the initial congestion window, in packets. It is the
+	// value initCongestionControl assigns to sender.sndCwnd.
+	InitialCwnd = 10
+
+	// nDupAckThreshold is the number of duplicate ACKs required
+	// before fast-retransmit is entered.
+	nDupAckThreshold = 3
+
+	// MaxRetries is the maximum number of probe retries the sender does
+	// before timing out the connection.
+	// Linux default TCP_RETR2, net.ipv4.tcp_retries2.
+	MaxRetries = 15
+)
+
+// ccState indicates the current congestion control state for this sender.
+type ccState int
+
+const (
+	// Open indicates that the sender is receiving acks in order and
+	// no loss or dupACKs etc. have been detected. It is the zero value
+	// of ccState.
+	Open ccState = iota
+	// RTORecovery indicates that an RTO has occurred and the sender
+	// has entered an RTO based recovery phase.
+	RTORecovery
+	// FastRecovery indicates that the sender has entered FastRecovery
+	// based on receiving nDupAckThreshold duplicate ACKs. This state is
+	// entered only when SACK is not in use.
+	FastRecovery
+	// SACKRecovery indicates that the sender has entered SACK based
+	// recovery.
+	SACKRecovery
+	// Disorder indicates the sender either received some SACK blocks
+	// or dupACKs.
+	Disorder
+)
+
+// congestionControl is an interface that must be implemented by any supported
+// congestion control algorithm.
+type congestionControl interface {
+	// HandleNDupAcks is invoked when sender.dupAckCount >= nDupAckThreshold
+	// just before entering fast retransmit.
+	HandleNDupAcks()
+
+	// HandleRTOExpired is invoked when the retransmit timer expires.
+	HandleRTOExpired()
+
+	// Update is invoked when processing inbound acks. It's passed the
+	// number of packets that were acked by the most recent cumulative
+	// acknowledgement.
+	Update(packetsAcked int)
+
+	// PostRecovery is invoked when the sender is exiting a fast retransmit/
+	// recovery phase. This provides congestion control algorithms a way
+	// to adjust their state when exiting recovery.
+	PostRecovery()
+}
+
+// sender holds the state necessary to send TCP segments.
+//
+// +stateify savable
+type sender struct {
+	// ep is the endpoint this sender belongs to.
+	ep *endpoint
+
+	// lastSendTime is the timestamp when the last packet was sent.
+	lastSendTime time.Time `state:".(unixTime)"`
+
+	// dupAckCount is the number of duplicated acks received. It is used for
+	// fast retransmit.
+	dupAckCount int
+
+	// fr holds state related to fast recovery.
+	fr fastRecovery
+
+	// sndCwnd is the congestion window, in packets.
+	sndCwnd int
+
+	// sndSsthresh is the threshold between slow start and congestion
+	// avoidance.
+	sndSsthresh int
+
+	// sndCAAckCount is the number of packets acknowledged during congestion
+	// avoidance. When enough packets have been ack'd (typically cwnd
+	// packets), the congestion window is incremented by one.
+	sndCAAckCount int
+
+	// outstanding is the number of outstanding packets, that is, packets
+	// that have been sent but not yet acknowledged.
+	outstanding int
+
+	// sndWnd is the send window size.
+	sndWnd seqnum.Size
+
+	// sndUna is the next unacknowledged sequence number.
+	sndUna seqnum.Value
+
+	// sndNxt is the sequence number of the next segment to be sent.
+	sndNxt seqnum.Value
+
+	// rttMeasureSeqNum is the sequence number being used for the latest RTT
+	// measurement.
+	rttMeasureSeqNum seqnum.Value
+
+	// rttMeasureTime is the time when the rttMeasureSeqNum was sent.
+	rttMeasureTime time.Time `state:".(unixTime)"`
+
+	// firstRetransmittedSegXmitTime is the original transmit time of
+	// the first segment that was retransmitted due to RTO expiration.
+	firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"`
+
+	// zeroWindowProbing is set if the sender is currently probing
+	// for zero receive window.
+	zeroWindowProbing bool `state:"nosave"`
+
+	// unackZeroWindowProbes is the number of unacknowledged zero
+	// window probes.
+	unackZeroWindowProbes uint32 `state:"nosave"`
+
+	// writeList holds the segments queued for (re)transmission and
+	// writeNext is the next segment in it to send. resendTimer is the
+	// retransmit timer; newSender initializes it with resendWaker.
+	// NOTE(review): closed is not referenced in this part of the file;
+	// presumably it records that the send side was closed. Confirm.
+	closed      bool
+	writeNext   *segment
+	writeList   segmentList
+	resendTimer timer       `state:"nosave"`
+	resendWaker sleep.Waker `state:"nosave"`
+
+	// rtt.srtt, rtt.rttvar, and rto are the "smoothed round-trip time",
+	// "round-trip time variation" and "retransmit timeout", as defined in
+	// section 2 of RFC 6298.
+	rtt rtt
+	rto time.Duration
+
+	// minRTO is the minimum permitted value for sender.rto.
+	minRTO time.Duration
+
+	// maxRTO is the maximum permitted value for sender.rto.
+	maxRTO time.Duration
+
+	// maxRetries is the maximum permitted retransmissions.
+	maxRetries uint32
+
+	// maxPayloadSize is the maximum size of the payload of a given segment.
+	// It is initialized on demand.
+	maxPayloadSize int
+
+	// gso is set if generic segmentation offload is enabled.
+	gso bool
+
+	// sndWndScale is the number of bits to shift left when reading the send
+	// window size from a segment.
+	sndWndScale uint8
+
+	// maxSentAck is the maximum acknowledgement actually sent.
+	maxSentAck seqnum.Value
+
+	// state is the current state of congestion control for this endpoint.
+	state ccState
+
+	// cc is the congestion control algorithm in use for this sender.
+	cc congestionControl
+}
+
+// rtt is a synchronization wrapper used to appease stateify. See the comment
+// in sender, where it is used.
+//
+// The embedded mutex is what updateRTO holds while it reads and writes the
+// fields below.
+//
+// +stateify savable
+type rtt struct {
+	sync.Mutex `state:"nosave"`
+
+	// srtt is the smoothed round-trip time and rttvar the round-trip time
+	// variation. srttInited is false until the first RTT sample has been
+	// used to seed them.
+	srtt       time.Duration
+	rttvar     time.Duration
+	srttInited bool
+}
+
+// fastRecovery holds information related to fast recovery from a packet loss.
+//
+// +stateify savable
+type fastRecovery struct {
+	// active indicates whether the endpoint is in fast recovery. The
+	// following fields are only meaningful when active is true.
+	active bool
+
+	// first and last represent the inclusive sequence number range being
+	// recovered.
+	first seqnum.Value
+	last  seqnum.Value
+
+	// maxCwnd is the maximum value the congestion window may be inflated to
+	// due to duplicate acks. This exists to avoid attacks where the
+	// receiver intentionally sends duplicate acks to artificially inflate
+	// the sender's cwnd.
+	maxCwnd int
+
+	// highRxt is the highest sequence number which has been retransmitted
+	// during the current loss recovery phase.
+	// See: RFC 6675 Section 2 for details.
+	highRxt seqnum.Value
+
+	// rescueRxt is the highest sequence number which has been
+	// optimistically retransmitted to prevent stalling of the ACK clock
+	// when there is loss at the end of the window and no new data is
+	// available for transmission.
+	// See: RFC 6675 Section 2 for details.
+	rescueRxt seqnum.Value
+}
+
+// newSender creates the sender for endpoint ep.
+//
+// iss and irs are the initial send and receive sequence numbers: sndUna,
+// sndNxt and rttMeasureSeqNum start at iss+1 and maxSentAck at irs+1. sndWnd
+// is the initial send window. The initial maximum payload size is mss minus
+// the endpoint's maximum option size, possibly lowered further based on the
+// route MTU. sndWndScale is stored only when it is positive.
+//
+// newSender panics if the stack-wide minRTO, maxRTO or maxRetries TCP options
+// cannot be read.
+func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender {
+	// The sender MUST reduce the TCP data length to account for any IP or
+	// TCP options that it is including in the packets that it sends.
+	// See: https://tools.ietf.org/html/rfc6691#section-2
+	payloadSize := int(mss) - ep.maxOptionSize()
+	firstSeq := iss + 1
+
+	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1.
+	recovery := fastRecovery{
+		last:      iss,
+		highRxt:   iss,
+		rescueRxt: iss,
+	}
+
+	s := &sender{
+		ep:               ep,
+		sndWnd:           sndWnd,
+		sndUna:           firstSeq,
+		sndNxt:           firstSeq,
+		rto:              time.Second,
+		rttMeasureSeqNum: firstSeq,
+		lastSendTime:     time.Now(),
+		maxPayloadSize:   payloadSize,
+		maxSentAck:       irs + 1,
+		fr:               recovery,
+		gso:              ep.gso != nil,
+	}
+
+	if s.gso {
+		s.ep.gso.MSS = uint16(payloadSize)
+	}
+
+	s.cc = s.initCongestionControl(ep.cc)
+
+	// Only a positive scale is stored; for a zero or negative value the
+	// field keeps its zero value, i.e. no scaling.
+	if sndWndScale > 0 {
+		s.sndWndScale = uint8(sndWndScale)
+	}
+
+	s.resendTimer.init(&s.resendWaker)
+
+	s.updateMaxPayloadSize(int(ep.route.MTU()), 0)
+
+	// The SACK scoreboard is created only after the max payload size has
+	// been updated, because maxPayloadSize is used as the smss when
+	// determining if a segment is lost etc.
+	s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss)
+
+	// Fetch the stack-wide configuration.
+	var (
+		minRTO     tcpip.TCPMinRTOOption
+		maxRTO     tcpip.TCPMaxRTOOption
+		maxRetries tcpip.TCPMaxRetriesOption
+	)
+
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil {
+		panic(fmt.Sprintf("unable to get minRTO from stack: %s", err))
+	}
+	s.minRTO = time.Duration(minRTO)
+
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil {
+		panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err))
+	}
+	s.maxRTO = time.Duration(maxRTO)
+
+	if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil {
+		panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err))
+	}
+	s.maxRetries = uint32(maxRetries)
+
+	return s
+}
+
+// initCongestionControl resets sndCwnd and sndSsthresh to their initial
+// values and returns a new instance of the congestion control algorithm named
+// by congestionControlName. Any name other than ccCubic, including ccReno,
+// selects Reno.
+func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl {
+	s.sndCwnd = InitialCwnd
+	s.sndSsthresh = math.MaxInt64
+
+	if congestionControlName == ccCubic {
+		return newCubicCC(s)
+	}
+	// Reno is both an explicit choice and the fallback.
+	return newRenoCC(s)
+}
+
+// updateMaxPayloadSize lowers the maximum payload size to fit the given MTU.
+// The payload size is never raised by this method.
+//
+// A non-zero count means the call is a reaction to "packet too big" control
+// packets. In that case the number of outstanding packets is reduced by
+// count, writeNext is rewound to the first already-passed segment that no
+// longer fits, and sending is attempted again.
+func (s *sender) updateMaxPayloadSize(mtu, count int) {
+	newSize := mtu - header.TCPMinimumSize - s.ep.maxOptionSize()
+
+	// We don't adjust up for now.
+	if newSize >= s.maxPayloadSize {
+		return
+	}
+
+	// Make sure we can transmit at least one byte.
+	if newSize <= 0 {
+		newSize = 1
+	}
+
+	s.maxPayloadSize = newSize
+	if s.gso {
+		s.ep.gso.MSS = uint16(newSize)
+	}
+
+	// newSender also calls this method with a zero count, when there is
+	// no data to send yet; nothing more to do in that case.
+	if count == 0 {
+		return
+	}
+
+	// Keep the scoreboard's smss in sync with the lowered payload size.
+	s.ep.scoreboard.smss = uint16(newSize)
+
+	if s.outstanding -= count; s.outstanding < 0 {
+		s.outstanding = 0
+	}
+
+	// Walk the segments that precede writeNext looking for the first one
+	// that exceeds the new size. If one exists, rewind writeNext to it so
+	// that it is retransmitted; otherwise leave writeNext untouched.
+	for seg := s.writeList.Front(); seg != nil && seg != s.writeNext; seg = seg.Next() {
+		if seg.data.Size() > newSize {
+			s.writeNext = seg
+			break
+		}
+	}
+
+	// Since we likely reduced the number of outstanding packets, we may be
+	// ready to send some more.
+	s.sendData()
+}
+
+// sendAck sends an ACK segment.
+func (s *sender) sendAck() {
+	s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.sndNxt)
+}
+
+// updateRTO updates the retransmit timeout when a new round-trip time is
+// available. This is done in accordance with section 2 of RFC 6298.
+//
+// rtt is the new round-trip time sample. s.rtt is locked while srtt and
+// rttvar are updated; the resulting s.rto is raised to s.minRTO if it falls
+// below it. When timestamps are in use and nothing is outstanding, the sample
+// is ignored and s.rto is left unchanged.
+func (s *sender) updateRTO(rtt time.Duration) {
+	s.rtt.Lock()
+	if !s.rtt.srttInited {
+		// First sample: seed srtt and rttvar directly from it.
+		s.rtt.rttvar = rtt / 2
+		s.rtt.srtt = rtt
+		s.rtt.srttInited = true
+	} else {
+		// diff is |srtt - rtt|.
+		diff := s.rtt.srtt - rtt
+		if diff < 0 {
+			diff = -diff
+		}
+		// Use RFC6298 standard algorithm to update rttvar and srtt when
+		// no timestamps are available.
+		if !s.ep.sendTSOk {
+			s.rtt.rttvar = (3*s.rtt.rttvar + diff) / 4
+			s.rtt.srtt = (7*s.rtt.srtt + rtt) / 8
+		} else {
+			// When we are taking RTT measurements of every ACK then
+			// we need to use a modified method as specified in
+			// https://tools.ietf.org/html/rfc7323#appendix-G
+			if s.outstanding == 0 {
+				// Bail out before dividing by expectedSamples,
+				// which would be zero here.
+				s.rtt.Unlock()
+				return
+			}
+			// Netstack measures congestion window/inflight all in
+			// terms of packets and not bytes. This is similar to
+			// how linux also does cwnd and inflight. In practice
+			// this approximation works as expected.
+			expectedSamples := math.Ceil(float64(s.outstanding) / 2)
+
+			// alpha & beta values are the original values as recommended in
+			// https://tools.ietf.org/html/rfc6298#section-2.3.
+			const alpha = 0.125
+			const beta = 0.25
+
+			alphaPrime := alpha / expectedSamples
+			betaPrime := beta / expectedSamples
+			rttVar := (1-betaPrime)*s.rtt.rttvar.Seconds() + betaPrime*diff.Seconds()
+			srtt := (1-alphaPrime)*s.rtt.srtt.Seconds() + alphaPrime*rtt.Seconds()
+			s.rtt.rttvar = time.Duration(rttVar * float64(time.Second))
+			s.rtt.srtt = time.Duration(srtt * float64(time.Second))
+		}
+	}
+
+	s.rto = s.rtt.srtt + 4*s.rtt.rttvar
+	s.rtt.Unlock()
+	// Enforce the lower bound; no upper bound is applied here.
+	if s.rto < s.minRTO {
+		s.rto = s.minRTO
+	}
+}
+
+// resendSegment retransmits the segment at the front of the write list, i.e.
+// the first unacknowledged one, and does nothing beyond resetting the RTT
+// measurement point when the write list is empty.
+func (s *sender) resendSegment() {
+	// Segments that were already sent may have been affected by packet
+	// loss, so none of them is used for measuring RTT.
+	s.rttMeasureSeqNum = s.sndNxt
+
+	seg := s.writeList.Front()
+	if seg == nil {
+		return
+	}
+
+	// Trim the segment down to the current maximum payload size first.
+	if seg.data.Size() > s.maxPayloadSize {
+		s.splitSeg(seg, s.maxPayloadSize)
+	}
+
+	// See: RFC 6675 section 5 Step 4.3
+	//
+	// To prevent retransmission, set both the HighRXT and RescueRXT
+	// to the highest sequence number in the retransmitted segment.
+	lastSeq := seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1
+	s.fr.highRxt = lastSeq
+	s.fr.rescueRxt = lastSeq
+
+	s.sendSegment(seg)
+	s.ep.stack.Stats().TCP.FastRetransmit.Increment()
+	s.ep.stats.SendErrors.FastRetransmit.Increment()
+
+	// Run SetPipe() as per RFC 6675 section 5 Step 4.4
+	s.SetPipe()
+}
+
+// retransmitTimerExpired is called when the retransmit timer expires, and
+// unacknowledged segments are assumed lost, and thus need to be resent.
+// Returns true if the connection is still usable, or false if the connection
+// is deemed lost.
+//
+// The connection is deemed lost when the user timeout (if set) has run out,
+// when s.maxRetries zero window probes are unacknowledged, or when the first
+// unacknowledged segment has been transmitted more than s.maxRetries times.
+func (s *sender) retransmitTimerExpired() bool {
+	// Check if the timer actually expired or if it's a spurious wake due
+	// to a previously orphaned runtime timer.
+	if !s.resendTimer.checkExpiration() {
+		return true
+	}
+
+	// TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases
+	// when writeList is empty. Remove this once we have a proper fix for this
+	// issue.
+	if s.writeList.Front() == nil {
+		return true
+	}
+
+	s.ep.stack.Stats().TCP.Timeouts.Increment()
+	s.ep.stats.SendErrors.Timeouts.Increment()
+
+	// Work out how much time remains before giving up. Without a user
+	// timeout the budget is simply s.maxRTO; with one, it is the user
+	// timeout minus the time elapsed since the first retransmitted
+	// segment was originally sent.
+	uto := s.ep.userTimeout
+
+	if s.firstRetransmittedSegXmitTime.IsZero() {
+		// We store the original xmitTime of the segment that we are
+		// about to retransmit as the retransmission time. This is
+		// required as by the time the retransmitTimer has expired the
+		// segment has already been sent and unacked for the RTO at the
+		// time the segment was sent.
+		s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime
+	}
+
+	elapsed := time.Since(s.firstRetransmittedSegXmitTime)
+	remaining := s.maxRTO
+	if uto != 0 {
+		// Cap to the user specified timeout if one is specified.
+		remaining = uto - elapsed
+	}
+
+	// Always honor the user-timeout irrespective of whether the zero
+	// window probes were acknowledged.
+	// net/ipv4/tcp_timer.c::tcp_probe_timer()
+	if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries {
+		return false
+	}
+
+	// Set new timeout. The timer will be restarted by the call to sendData
+	// below.
+	s.rto *= 2
+	// Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5
+	if s.rto > s.maxRTO {
+		s.rto = s.maxRTO
+	}
+
+	// Cap RTO to remaining time.
+	if s.rto > remaining {
+		s.rto = remaining
+	}
+
+	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4.
+	//
+	// Retransmit timeouts:
+	//     After a retransmit timeout, record the highest sequence number
+	//     transmitted in the variable recover, and exit the fast recovery
+	//     procedure if applicable.
+	s.fr.last = s.sndNxt - 1
+
+	if s.fr.active {
+		// We were attempting fast recovery but were not successful.
+		// Leave the state. We don't need to update ssthresh because it
+		// has already been updated when entered fast-recovery.
+		s.leaveFastRecovery()
+	}
+
+	s.state = RTORecovery
+	s.cc.HandleRTOExpired()
+
+	// Mark the next segment to be sent as the first unacknowledged one and
+	// start sending again. Set the number of outstanding packets to 0 so
+	// that we'll be able to retransmit.
+	//
+	// We'll keep on transmitting (or retransmitting) as we get acks for
+	// the data we transmit.
+	s.outstanding = 0
+
+	// Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1
+	//
+	//  In order to avoid memory deadlocks, the TCP receiver is allowed to
+	//  discard data that has already been selectively acknowledged. As a
+	//  result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK
+	//  information gathered from a receiver upon a retransmission timeout
+	//  (RTO) "since the timeout might indicate that the data receiver has
+	//  reneged." Additionally, a TCP sender MUST "ignore prior SACK
+	//  information in determining which data to retransmit."
+	//
+	// NOTE: We take the stricter interpretation and just expunge all
+	// information as we lack more rigorous checks to validate if the SACK
+	// information is usable after an RTO.
+	s.ep.scoreboard.Reset()
+	s.writeNext = s.writeList.Front()
+
+	// RFC 1122 4.2.2.17: Start sending zero window probes when we still see a
+	// zero receive window after retransmission interval and we have data to
+	// send.
+	if s.zeroWindowProbing {
+		s.sendZeroWindowProbe()
+		// RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed
+		// indefinitely.  As long as the receiving TCP continues to send
+		// acknowledgments in response to the probe segments, the sending TCP
+		// MUST allow the connection to stay open.
+		return true
+	}
+
+	seg := s.writeNext
+	// RFC 1122 4.2.3.5: Close the connection when the number of
+	// retransmissions for this segment is beyond a limit.
+	if seg != nil && seg.xmitCount > s.maxRetries {
+		return false
+	}
+
+	s.sendData()
+
+	return true
+}
+
+// pCount returns how many packets seg amounts to: its payload size divided by
+// maxPayloadSize, rounded up. A segment without payload counts as one packet.
+// Due to GSO, a segment can be composed of multiple packets.
+func (s *sender) pCount(seg *segment) int {
+	if n := seg.data.Size(); n != 0 {
+		return (n-1)/s.maxPayloadSize + 1
+	}
+	return 1
+}
+
+// splitSeg splits seg so that it carries at most size bytes of payload and
+// inserts the remainder as a new segment right after it in the write list. It
+// is a no-op when seg's payload already fits within size.
+func (s *sender) splitSeg(seg *segment, size int) {
+	if seg.data.Size() <= size {
+		return
+	}
+	// Split this segment up. nSeg is cloned before the PSH adjustment
+	// below, so that adjustment only affects seg. nSeg carries the bytes
+	// past size and starts size sequence numbers later.
+	nSeg := seg.clone()
+	nSeg.data.TrimFront(size)
+	nSeg.sequenceNumber.UpdateForward(seqnum.Size(size))
+	s.writeList.InsertAfter(seg, nSeg)
+
+	// The segment being split does not carry PUSH flag because it is
+	// followed by the newly split segment.
+	// RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered
+	// segment (i.e., when there is no more queued data to be sent).
+	// Linux removes PSH flag only when the segment is being split over MSS
+	// and retains it when we are splitting the segment over lack of sender
+	// window space.
+	// ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point()
+	// ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test()
+	if seg.data.Size() > s.maxPayloadSize {
+		// Clear the bit rather than toggle it: with XOR, splitting a
+		// segment whose PSH flag is already clear (e.g. one produced
+		// by an earlier split) would set the flag again.
+		seg.flags &^= header.TCPFlagPsh
+	}
+
+	seg.data.CapLength(size)
+}
+
+// NextSeg implements the RFC6675 NextSeg() operation.
+//
+// NextSeg starts scanning the writeList starting from nextSegHint and returns
+// the hint to be passed on the next call to NextSeg. This is required to avoid
+// iterating the write list repeatedly when NextSeg is invoked in a loop during
+// recovery. The returned hint will be nil if there are no more segments that
+// can match rules defined by NextSeg operation in RFC6675.
+//
+// rescueRtx will be true only if nextSeg is a rescue retransmission as
+// described by Step 4) of the NextSeg algorithm. Note that nextSeg is nil,
+// with rescueRtx still true, when no rule produced a candidate.
+//
+// As a side effect, any scanned segment larger than the scoreboard's SMSS is
+// split in the write list.
+func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) {
+	// s3 and s4 remember the candidates for rules (3) and (4) while the
+	// scan looks for a rule (1) match.
+	var s3 *segment
+	var s4 *segment
+	// Step 1.
+	for seg := nextSegHint; seg != nil; seg = seg.Next() {
+		// Stop iteration if we hit a segment that has never been
+		// transmitted (i.e. either it has no assigned sequence number
+		// or if it does have one, it's >= the next sequence number
+		// to be sent [i.e. >= s.sndNxt]).
+		if !s.isAssignedSequenceNumber(seg) || s.sndNxt.LessThanEq(seg.sequenceNumber) {
+			hint = nil
+			break
+		}
+		segSeq := seg.sequenceNumber
+		if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) {
+			s.splitSeg(seg, int(smss))
+		}
+
+		// See RFC 6675 Section 4
+		//
+		//     1. If there exists a smallest unSACKED sequence number
+		//     'S2' that meets the following 3 criteria for determining
+		//     loss, the sequence range of one segment of up to SMSS
+		//     octets starting with S2 MUST be returned.
+		if !s.ep.scoreboard.IsSACKED(header.SACKBlock{segSeq, segSeq.Add(1)}) {
+			// NextSeg():
+			//
+			//    (1.a) S2 is greater than HighRxt
+			//    (1.b) S2 is less than highest octet covered by
+			//    any received SACK.
+			if s.fr.highRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) {
+				// NextSeg():
+				//     (1.c) IsLost(S2) returns true.
+				if s.ep.scoreboard.IsLost(segSeq) {
+					return seg, seg.Next(), false
+				}
+
+				// NextSeg():
+				//
+				// (3): If the conditions for rules (1) and (2)
+				// fail, but there exists an unSACKed sequence
+				// number S3 that meets the criteria for
+				// detecting loss given in steps 1.a and 1.b
+				// above (specifically excluding (1.c)) then one
+				// segment of up to SMSS octets starting with S3
+				// SHOULD be returned.
+				if s3 == nil {
+					s3 = seg
+					hint = seg.Next()
+				}
+			}
+			// NextSeg():
+			//
+			//     (4) If the conditions for (1), (2) and (3) fail,
+			//     but there exists outstanding unSACKED data, we
+			//     provide the opportunity for a single "rescue"
+			//     retransmission per entry into loss recovery. If
+			//     HighACK is greater than RescueRxt (or RescueRxt
+			//     is undefined), then one segment of up to SMSS
+			//     octets that MUST include the highest outstanding
+			//     unSACKed sequence number SHOULD be returned, and
+			//     RescueRxt set to RecoveryPoint. HighRxt MUST NOT
+			//     be updated.
+			if s.fr.rescueRxt.LessThan(s.sndUna - 1) {
+				// Keep the candidate with the highest sequence
+				// number seen so far.
+				if s4 != nil {
+					if s4.sequenceNumber.LessThan(segSeq) {
+						s4 = seg
+					}
+				} else {
+					s4 = seg
+				}
+			}
+		}
+	}
+
+	// If we got here then no segment matched step (1).
+	// Step (2): "If no sequence number 'S2' per rule (1)
+	// exists but there exists available unsent data and the
+	// receiver's advertised window allows, the sequence
+	// range of one segment of up to SMSS octets of
+	// previously unsent data starting with sequence number
+	// HighData+1 MUST be returned."
+	for seg := s.writeNext; seg != nil; seg = seg.Next() {
+		// Skip segments that were already transmitted.
+		if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) {
+			continue
+		}
+		// We do not split the segment here to <= smss as it has
+		// potentially not been assigned a sequence number yet.
+		return seg, nil, false
+	}
+
+	// Rule (3) takes precedence over the rescue retransmission of rule (4).
+	if s3 != nil {
+		return s3, hint, false
+	}
+
+	return s4, nil, true
+}
+
+// maybeSendSegment tries to send seg and reports whether it was passed to
+// sendSegment. A segment with no sequence number yet may first absorb the
+// segments that follow it; a data segment bigger than the usable space (the
+// lower of limit and the room left before end, the window edge) is split.
+func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) {
+	// We abuse the flags field to determine if we have already
+	// assigned a sequence number to this segment.
+	if !s.isAssignedSequenceNumber(seg) {
+		// Merge segments if allowed.
+		if seg.data.Size() != 0 {
+			available := int(s.sndNxt.Size(end))
+			if available > limit {
+				available = limit
+			}
+
+			// nextTooBig indicates that the next segment was too
+			// large to entirely fit in the current segment. It
+			// would be possible to split the next segment and merge
+			// the portion that fits, but unexpectedly splitting
+			// segments can have user visible side-effects which can
+			// break applications. For example, RFC 7766 section 8
+			// says that the length and data of a DNS response
+			// should be sent in the same TCP segment to avoid
+			// triggering bugs in poorly written DNS
+			// implementations.
+			var nextTooBig bool
+			for seg.Next() != nil && seg.Next().data.Size() != 0 {
+				if seg.data.Size()+seg.Next().data.Size() > available {
+					nextTooBig = true
+					break
+				}
+				seg.data.Append(seg.Next().data)
+
+				// Consume the segment that we just merged in.
+				s.writeList.Remove(seg.Next())
+			}
+			if !nextTooBig && seg.data.Size() < available {
+				// Segment is not full.
+				if s.outstanding > 0 && atomic.LoadUint32(&s.ep.delay) != 0 {
+					// Nagle's algorithm. From Wikipedia:
+					//   Nagle's algorithm works by
+					//   combining a number of small
+					//   outgoing messages and sending them
+					//   all at once. Specifically, as long
+					//   as there is a sent packet for which
+					//   the sender has received no
+					//   acknowledgment, the sender should
+					//   keep buffering its output until it
+					//   has a full packet's worth of
+					//   output, thus allowing output to be
+					//   sent all at once.
+					return false
+				}
+				// With TCP_CORK, hold back until minimum of the available
+				// send space and MSS.
+				// TODO(gvisor.dev/issue/2833): Drain the held segments after a
+				// timeout.
+				if seg.data.Size() < s.maxPayloadSize && atomic.LoadUint32(&s.ep.cork) != 0 {
+					return false
+				}
+			}
+		}
+
+		// Assign flags. We don't do it above so that we can merge
+		// additional data if Nagle holds the segment.
+		seg.sequenceNumber = s.sndNxt
+		seg.flags = header.TCPFlagAck | header.TCPFlagPsh
+	}
+
+	var segEnd seqnum.Value
+	if seg.data.Size() == 0 {
+		if s.writeList.Back() != seg {
+			panic("FIN segments must be the final segment in the write list.")
+		}
+		seg.flags = header.TCPFlagAck | header.TCPFlagFin
+		segEnd = seg.sequenceNumber.Add(1)
+		// Update the state to reflect that we have now
+		// queued a FIN.
+		switch s.ep.EndpointState() {
+		case StateCloseWait:
+			s.ep.setEndpointState(StateLastAck)
+		default:
+			s.ep.setEndpointState(StateFinWait1)
+		}
+	} else {
+		// We're sending a non-FIN segment.
+		if seg.flags&header.TCPFlagFin != 0 {
+			panic("Netstack queues FIN segments without data.")
+		}
+
+		if !seg.sequenceNumber.LessThan(end) {
+			return false
+		}
+
+		available := int(seg.sequenceNumber.Size(end))
+		if available == 0 {
+			return false
+		}
+
+		// If the whole segment or at least 1MSS sized segment cannot
+		// be accommodated in the receiver advertised window, skip
+		// splitting and sending of the segment. ref:
+		// net/ipv4/tcp_output.c::tcp_snd_wnd_test()
+		//
+		// Linux checks this for all segment transmits not triggered by
+		// a probe timer. On this condition, it defers the segment split
+		// and transmit to a short probe timer.
+		//
+		// ref: include/net/tcp.h::tcp_check_probe_timer()
+		// ref: net/ipv4/tcp_output.c::tcp_write_wakeup()
+		//
+		// Instead of defining a new transmit timer, we attempt to split
+		// the segment right here if there are no pending segments. If
+		// there are pending segments, segment transmits are deferred to
+		// the retransmit timer handler.
+		if s.sndUna != s.sndNxt {
+			switch {
+			case available >= seg.data.Size():
+				// OK to send, the whole segment fits in the
+				// receiver's advertised window.
+			case available >= s.maxPayloadSize:
+				// OK to send, at least 1 MSS sized segment fits
+				// in the receiver's advertised window.
+			default:
+				return false
+			}
+		}
+
+		// The segment size limit is computed as a function of sender
+		// congestion window and MSS. When sender congestion window is >
+		// 1, this limit can be larger than MSS. Ensure that the
+		// currently available send space is not greater than minimum of
+		// this limit and MSS.
+		if available > limit {
+			available = limit
+		}
+
+		// If GSO is not in use then cap available to
+		// maxPayloadSize. When GSO is in use the gVisor GSO logic or
+		// the host GSO logic will cap the segment to the correct size.
+		if s.ep.gso == nil && available > s.maxPayloadSize {
+			available = s.maxPayloadSize
+		}
+
+		if seg.data.Size() > available {
+			s.splitSeg(seg, available)
+		}
+
+		segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size()))
+	}
+
+	s.sendSegment(seg)
+
+	// Update sndNxt if we actually sent new data (as opposed to
+	// retransmitting some previously sent data).
+	if s.sndNxt.LessThan(segEnd) {
+		s.sndNxt = segEnd
+	}
+
+	return true
+}
+
+// handleSACKRecovery implements the loss recovery phase as described in RFC6675
+// section 5, step C. dataSent is true if any segment was sent or retransmitted.
+func (s *sender) handleSACKRecovery(limit int, end seqnum.Value) (dataSent bool) {
+	s.SetPipe()
+
+	if smss := int(s.ep.scoreboard.SMSS()); limit > smss {
+		// Cap segment size limit to SMSS as SACK recovery requires
+		// that all retransmissions or new segments sent during recovery
+		// be of <= SMSS.
+		limit = smss
+	}
+
+	nextSegHint := s.writeList.Front()
+	for s.outstanding < s.sndCwnd {
+		var nextSeg *segment
+		var rescueRtx bool
+		nextSeg, nextSegHint, rescueRtx = s.NextSeg(nextSegHint)
+		if nextSeg == nil {
+			return dataSent
+		}
+		if !s.isAssignedSequenceNumber(nextSeg) || s.sndNxt.LessThanEq(nextSeg.sequenceNumber) {
+			// New data being sent.
+
+			// Step C.3 described below is handled by
+			// maybeSendSegment which increments sndNxt when
+			// a segment is transmitted.
+			//
+			// Step C.3 "If any of the data octets sent in
+			// (C.1) are above HighData, HighData must be
+			// updated to reflect the transmission of
+			// previously unsent data."
+			//
+			// limit was capped to SMSS above because Step 2) requires that
+			// new data sent should be of size SMSS or less.
+			if sent := s.maybeSendSegment(nextSeg, limit, end); !sent {
+				return dataSent
+			}
+			dataSent = true
+			s.outstanding++
+			s.writeNext = nextSeg.Next()
+			continue
+		}
+
+		// Now handle the retransmission case where we matched either step 1, 3
+		// or 4 of the NextSeg algorithm.
+		// RFC 6675, Step C.4.
+		//
+		// "The estimate of the amount of data outstanding in the network
+		// must be updated by incrementing pipe by the number of octets
+		// transmitted in (C.1)."
+		s.outstanding++
+		dataSent = true
+		s.sendSegment(nextSeg)
+
+		segEnd := nextSeg.sequenceNumber.Add(nextSeg.logicalLen())
+		if rescueRtx {
+			// We do the last part of rule (4) of NextSeg here to update
+			// RescueRxt as until this point we don't know if we are going
+			// to use the rescue transmission.
+			s.fr.rescueRxt = s.fr.last
+		} else {
+			// RFC 6675, Step C.2
+			//
+			// "If any of the data octets sent in (C.1) are below
+			// HighData, HighRxt MUST be set to the highest sequence
+			// number of the retransmitted segment unless NextSeg ()
+			// rule (4) was invoked for this retransmission."
+			s.fr.highRxt = segEnd - 1
+		}
+	}
+	return dataSent
+}
+
+// sendZeroWindowProbe sends one zero window probe and re-arms the resend timer.
+func (s *sender) sendZeroWindowProbe() {
+	ack, win := s.ep.rcv.getSendParams()
+	s.unackZeroWindowProbes++
+	// Probe with an empty ACK whose sequence number (sndUna-1) is the last acknowledged byte.
+	s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.sndUna-1, ack, win)
+	// Rearm the timer to continue probing.
+	s.resendTimer.enable(s.rto)
+}
+
+// enableZeroWindowProbing marks the sender as probing and arms the resend timer.
+func (s *sender) enableZeroWindowProbing() {
+	s.zeroWindowProbing = true
+	// We piggyback the probing on the retransmit timer with the current
+	// retransmission interval, as probing may start during retransmissions.
+	if s.firstRetransmittedSegXmitTime.IsZero() {
+		s.firstRetransmittedSegXmitTime = time.Now()
+	}
+	s.resendTimer.enable(s.rto)
+}
+
+func (s *sender) disableZeroWindowProbing() { // clears all zero window probe state and disables the resend timer
+	s.zeroWindowProbing = false
+	s.unackZeroWindowProbes = 0
+	s.firstRetransmittedSegXmitTime = time.Time{}
+	s.resendTimer.disable()
+}
+
+// sendData sends new data segments. It is called when data becomes available or
+// when the send window opens up.
+func (s *sender) sendData() {
+	limit := s.maxPayloadSize
+	if s.gso {
+		limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize)
+	}
+	end := s.sndUna.Add(s.sndWnd)
+
+	// Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10.
+	// "A TCP SHOULD set cwnd to no more than RW before beginning
+	// transmission if the TCP has not sent data in the interval exceeding
+	// the retransmission timeout."
+	if !s.fr.active && s.state != RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto {
+		if s.sndCwnd > InitialCwnd {
+			s.sndCwnd = InitialCwnd
+		}
+	}
+
+	var dataSent bool
+
+	// RFC 6675 recovery algorithm step C 1-5.
+	if s.fr.active && s.ep.sackPermitted {
+		dataSent = s.handleSACKRecovery(s.maxPayloadSize, end)
+	} else {
+		for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() {
+			cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize
+			if cwndLimit < limit {
+				limit = cwndLimit
+			}
+			if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
+				// Move writeNext along so that we don't try and scan data that
+				// has already been SACKED.
+				s.writeNext = seg.Next()
+				continue
+			}
+			if sent := s.maybeSendSegment(seg, limit, end); !sent {
+				break
+			}
+			dataSent = true
+			s.outstanding += s.pCount(seg)
+			s.writeNext = seg.Next()
+		}
+	}
+
+	if dataSent {
+		// We sent data, so we should stop the keepalive timer to ensure
+		// that no keepalives are sent while there is pending data.
+		s.ep.disableKeepaliveTimer()
+	}
+
+	// If the remote has advertised a zero receive window and we have
+	// data to be sent out, start zero window probing to query the
+	// remote for its receive window size.
+	if s.writeNext != nil && s.sndWnd == 0 {
+		s.enableZeroWindowProbing()
+	}
+
+	// Enable the timer if we have pending data and it's not enabled yet.
+	if !s.resendTimer.enabled() && s.sndUna != s.sndNxt {
+		s.resendTimer.enable(s.rto)
+	}
+	// If we have no more pending data, start the keepalive timer.
+	if s.sndUna == s.sndNxt {
+		s.ep.resetKeepaliveTimer(false)
+	}
+}
+
+// enterFastRecovery switches the sender into fast recovery (SACKRecovery when
+// SACK is permitted, FastRecovery otherwise) and records the recovery state.
+func (s *sender) enterFastRecovery() {
+	s.fr.active = true
+
+	// See https://tools.ietf.org/html/rfc5681#section-3.2 Step 3: cwnd is
+	// inflated by 3 for the 3 packets that triggered the duplicate ACKs.
+	s.sndCwnd = s.sndSsthresh + 3
+	s.fr.maxCwnd = s.sndCwnd + s.outstanding
+
+	una := s.sndUna
+	s.fr.first, s.fr.highRxt, s.fr.rescueRxt = una, una, una
+	s.fr.last = s.sndNxt - 1
+	if !s.ep.sackPermitted {
+		s.state = FastRecovery
+		s.ep.stack.Stats().TCP.FastRecovery.Increment()
+		return
+	}
+	s.state = SACKRecovery
+	s.ep.stack.Stats().TCP.SACKRecovery.Increment()
+}
+
+// leaveFastRecovery clears the fast recovery state and resets cwnd to ssthresh.
+func (s *sender) leaveFastRecovery() {
+	s.fr.active = false
+	s.fr.maxCwnd = 0
+	s.dupAckCount = 0
+
+	// Deflate cwnd. It had been artificially inflated when new dups arrived.
+	s.sndCwnd = s.sndSsthresh
+	s.cc.PostRecovery()
+}
+
+// handleFastRecovery processes an ack received while fast recovery is active.
+// rtx is true only for a partial ack without SACK, i.e. a retransmit is due.
+func (s *sender) handleFastRecovery(seg *segment) (rtx bool) {
+	ack := seg.ackNumber
+	// We are in fast recovery mode. Ignore the ack if it's out of range.
+	if !ack.InRange(s.sndUna, s.sndNxt+1) {
+		return false
+	}
+
+	// Leave fast recovery if it acks all the data covered by this session.
+	if s.fr.last.LessThan(ack) {
+		s.leaveFastRecovery()
+		return false
+	}
+
+	if s.ep.sackPermitted {
+		// When SACK is enabled we let retransmission be governed by
+		// the SACK logic.
+		return false
+	}
+
+	// Don't count this as a duplicate if it is carrying data or
+	// updating the window.
+	if seg.logicalLen() != 0 || s.sndWnd != seg.window {
+		return false
+	}
+
+	// Inflate the congestion window if we're getting duplicate acks
+	// for the packet we retransmitted.
+	if ack == s.fr.first {
+		// We received a dup, inflate the congestion window by 1 packet
+		// if we're not at the max yet. Only inflate the window if
+		// regular FastRecovery is in use, RFC6675 does not require
+		// inflating cwnd on duplicate ACKs.
+		if s.sndCwnd < s.fr.maxCwnd {
+			s.sndCwnd++
+		}
+		return false
+	}
+
+	// A partial ack was received. Retransmit this packet and
+	// remember it so that we don't retransmit it again. We don't
+	// inflate the window because we're putting the same packet back
+	// onto the wire.
+	//
+	// N.B. The retransmit timer will be reset by the caller.
+	s.fr.first = ack
+	s.dupAckCount = 0
+	return true
+}
+
+// isAssignedSequenceNumber reports whether seg has non-zero flags. It relies
+// on the fact that flags are only set once a sequence number is assigned,
+// which is done right before the segment is sent. As a result any segment
+// that has a non-zero flag has a valid sequence number assigned to it.
+func (s *sender) isAssignedSequenceNumber(seg *segment) bool {
+	return seg.flags != 0
+}
+
+// SetPipe implements the SetPipe() function described in RFC6675. Netstack
+// maintains the congestion window in number of packets and not bytes, so
+// SetPipe() here counts outstanding packets rather than outstanding bytes and
+// stores the result in s.outstanding.
+func (s *sender) SetPipe() {
+	// If SACK isn't permitted or it is permitted but recovery is not active
+	// then ignore pipe calculations.
+	if !s.ep.sackPermitted || !s.fr.active {
+		return
+	}
+	pipe := 0
+	smss := seqnum.Size(s.ep.scoreboard.SMSS())
+	for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() {
+		// With GSO each segment can be much larger than SMSS. So check the segment
+		// in SMSS sized ranges.
+		segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size()))
+		for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) {
+			endSeq := startSeq.Add(smss)
+			if segEnd.LessThan(endSeq) {
+				endSeq = segEnd
+			}
+			sb := header.SACKBlock{startSeq, endSeq}
+			// SetPipe():
+			//
+			// After initializing pipe to zero, the following steps are
+			// taken for each octet 'S1' in the sequence space between
+			// HighACK and HighData that has not been SACKed:
+			if !s1.sequenceNumber.LessThan(s.sndNxt) {
+				break
+			}
+			if s.ep.scoreboard.IsSACKED(sb) {
+				continue
+			}
+
+			// SetPipe():
+			//
+			//    (a) If IsLost(S1) returns false, Pipe is incremented by 1.
+			//
+			// NOTE: here we mark the whole segment as lost. We do not try
+			// and test every byte in our write buffer as we maintain our
+			// pipe in terms of outstanding packets and not bytes.
+			if !s.ep.scoreboard.IsRangeLost(sb) {
+				pipe++
+			}
+			// SetPipe():
+			//    (b) If S1 <= HighRxt, Pipe is incremented by 1.
+			if s1.sequenceNumber.LessThanEq(s.fr.highRxt) {
+				pipe++
+			}
+		}
+	}
+	s.outstanding = pipe
+}
+
+// checkDuplicateAck is called when an ack is received. It manages the state
+// related to duplicate acks and reports (rtx) whether a retransmit is needed
+// according to the rules in RFC 6582 (NewReno).
+func (s *sender) checkDuplicateAck(seg *segment) (rtx bool) {
+	ack := seg.ackNumber
+	if s.fr.active {
+		return s.handleFastRecovery(seg)
+	}
+
+	// We're not in fast recovery yet. A segment is considered a duplicate
+	// only if it doesn't carry any data and doesn't update the send window,
+	// because if it does, it wasn't sent in response to an out-of-order
+	// segment. If SACK is enabled then we have an additional check to see
+	// if the segment carries new SACK information. If it does then it is
+	// considered a duplicate ACK as per RFC6675.
+	if ack != s.sndUna || seg.logicalLen() != 0 || s.sndWnd != seg.window || ack == s.sndNxt {
+		if !s.ep.sackPermitted || !seg.hasNewSACKInfo {
+			s.dupAckCount = 0
+			return false
+		}
+	}
+
+	s.dupAckCount++
+
+	// Do not enter fast recovery until we reach nDupAckThreshold or the
+	// first unacknowledged byte is considered lost as per SACK scoreboard.
+	if s.dupAckCount < nDupAckThreshold || (s.ep.sackPermitted && !s.ep.scoreboard.IsLost(s.sndUna)) {
+		// RFC 6675 Step 3.
+		s.fr.highRxt = s.sndUna - 1
+		// Run SetPipe() to recalculate the outstanding segments.
+		s.SetPipe()
+		s.state = Disorder
+		return false
+	}
+
+	// See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2
+	//
+	// We only do the check here, the incrementing of last to the highest
+	// sequence number transmitted till now is done when enterFastRecovery
+	// is invoked.
+	if !s.fr.last.LessThan(seg.ackNumber) {
+		s.dupAckCount = 0
+		return false
+	}
+	s.cc.HandleNDupAcks()
+	s.enterFastRecovery()
+	s.dupAckCount = 0
+	return true
+}
+
+// handleRcvdSegment is called when a segment is received; it is responsible for
+// updating the send-related state and then retransmitting or sending more data.
+func (s *sender) handleRcvdSegment(seg *segment) {
+	// Check if we can extract an RTT measurement from this ack.
+	if !seg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(seg.ackNumber) {
+		s.updateRTO(time.Now().Sub(s.rttMeasureTime))
+		s.rttMeasureSeqNum = s.sndNxt
+	}
+
+	// Update Timestamp if required. See RFC7323, section-4.3.
+	if s.ep.sendTSOk && seg.parsedOptions.TS {
+		s.ep.updateRecentTimestamp(seg.parsedOptions.TSVal, s.maxSentAck, seg.sequenceNumber)
+	}
+
+	// Insert SACKBlock information into our scoreboard.
+	if s.ep.sackPermitted {
+		for _, sb := range seg.parsedOptions.SACKBlocks {
+			// Only insert the SACK block if the following holds
+			// true:
+			//  * SACK block acks data after the ack number in the
+			//    current segment.
+			//  * SACK block represents a sequence
+			//    between sndUna and sndNxt (i.e. data that is
+			//    currently unacked and in-flight).
+			//  * SACK block that has not been SACKed already.
+			//
+			// NOTE: This check specifically excludes DSACK blocks
+			// which have start/end before sndUna and are used to
+			// indicate spurious retransmissions.
+			if seg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) {
+				s.ep.scoreboard.Insert(sb)
+				seg.hasNewSACKInfo = true
+			}
+		}
+		s.SetPipe()
+	}
+
+	// Count the duplicates and do the fast retransmit if needed.
+	rtx := s.checkDuplicateAck(seg)
+
+	// Stash away the current window size.
+	s.sndWnd = seg.window
+
+	ack := seg.ackNumber
+
+	// Disable zero window probing if remote advertises a non-zero receive
+	// window. This can be with an ACK to the zero window probe (where the
+	// acknumber refers to the already acknowledged byte) OR to any previously
+	// unacknowledged segment.
+	if s.zeroWindowProbing && seg.window > 0 &&
+		(ack == s.sndUna || (ack-1).InRange(s.sndUna, s.sndNxt)) {
+		s.disableZeroWindowProbing()
+	}
+
+	// On receiving the ACK for the zero window probe, account for it and
+	// skip trying to send any segment as we are still probing for
+	// receive window to become non-zero.
+	if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.sndUna {
+		s.unackZeroWindowProbes--
+		return
+	}
+
+	// Ignore ack if it doesn't acknowledge any new data.
+	if (ack - 1).InRange(s.sndUna, s.sndNxt) {
+		s.dupAckCount = 0
+
+		// See : https://tools.ietf.org/html/rfc1323#section-3.3.
+		// Specifically we should only update the RTO using TSEcr if the
+		// following condition holds:
+		//
+		//    A TSecr value received in a segment is used to update the
+		//    averaged RTT measurement only if the segment acknowledges
+		//    some new data, i.e., only if it advances the left edge of
+		//    the send window.
+		if s.ep.sendTSOk && seg.parsedOptions.TSEcr != 0 {
+			// TSVal/Ecr values sent by Netstack are at a millisecond
+			// granularity.
+			elapsed := time.Duration(s.ep.timestamp()-seg.parsedOptions.TSEcr) * time.Millisecond
+			s.updateRTO(elapsed)
+		}
+
+		// When an ack is received we must rearm the timer.
+		// RFC 6298 5.3
+		s.resendTimer.enable(s.rto)
+
+		// Remove all acknowledged data from the write list.
+		acked := s.sndUna.Size(ack)
+		s.sndUna = ack
+
+		ackLeft := acked
+		originalOutstanding := s.outstanding
+		for ackLeft > 0 {
+			// We use logicalLen here because we can have FIN
+			// segments (which are always at the end of list) that
+			// have no data, but do consume a sequence number.
+			seg := s.writeList.Front()
+			datalen := seg.logicalLen()
+
+			if datalen > ackLeft {
+				prevCount := s.pCount(seg)
+				seg.data.TrimFront(int(ackLeft))
+				seg.sequenceNumber.UpdateForward(ackLeft)
+				s.outstanding -= prevCount - s.pCount(seg)
+				break
+			}
+
+			if s.writeNext == seg {
+				s.writeNext = seg.Next()
+			}
+
+			s.writeList.Remove(seg)
+
+			// If SACK is enabled then only reduce outstanding if
+			// the segment was not previously SACKED as these have
+			// already been accounted for in SetPipe().
+			if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) {
+				s.outstanding -= s.pCount(seg)
+			}
+			seg.decRef()
+			ackLeft -= datalen
+		}
+
+		// Update the send buffer usage and notify potential waiters.
+		s.ep.updateSndBufferUsage(int(acked))
+
+		// Clear SACK information for all acked data.
+		s.ep.scoreboard.Delete(s.sndUna)
+
+		// If we are not in fast recovery then update the congestion
+		// window based on the number of acknowledged packets.
+		if !s.fr.active {
+			s.cc.Update(originalOutstanding - s.outstanding)
+			if s.fr.last.LessThan(s.sndUna) {
+				s.state = Open
+			}
+		}
+
+		// It is possible for s.outstanding to drop below zero if we get
+		// a retransmit timeout, reset outstanding to zero but later
+		// get an ack that covers previously sent data.
+		if s.outstanding < 0 {
+			s.outstanding = 0
+		}
+
+		s.SetPipe()
+
+		// If all outstanding data was acknowledged then disable the timer.
+		// RFC 6298 Rule 5.3
+		if s.sndUna == s.sndNxt {
+			s.outstanding = 0
+			// Reset firstRetransmittedSegXmitTime to the zero value.
+			s.firstRetransmittedSegXmitTime = time.Time{}
+			s.resendTimer.disable()
+		}
+	}
+	// Now that we've popped all acknowledged data from the retransmit
+	// queue, retransmit if needed.
+	if rtx {
+		s.resendSegment()
+	}
+
+	// Send more data now that some of the pending data has been ack'd, or
+	// that the window opened up, or the congestion window was inflated due
+	// to a duplicate ack during fast recovery. This will also re-enable
+	// the retransmit timer if needed.
+	if !s.ep.sackPermitted || s.fr.active || s.dupAckCount == 0 || seg.hasNewSACKInfo {
+		s.sendData()
+	}
+}
+
+// sendSegment transmits seg via sendSegmentFromView, updating its transmit
+// time/count and the retransmit statistics, and returns the send error, if any.
+func (s *sender) sendSegment(seg *segment) *tcpip.Error {
+	if seg.xmitCount > 0 {
+		s.ep.stack.Stats().TCP.Retransmits.Increment()
+		s.ep.stats.SendErrors.Retransmits.Increment()
+		if s.sndCwnd < s.sndSsthresh {
+			s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment()
+		}
+	}
+	seg.xmitTime = time.Now()
+	seg.xmitCount++
+	err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber)
+	if err == nil || seg.data.Size() == 0 {
+		return err
+	}
+
+	// Only reached when sending a data segment returned an error
+	// (NOTE(review): confirm err != nil is intended). When retransmitting in
+	// SACK recovery enable the timer unconditionally (RFC6675 Section 6.0);
+	// otherwise enable it only if not already enabled (RFC6298 Section 5.1).
+	switch {
+	case s.fr.active && seg.xmitCount > 1 && s.ep.sackPermitted:
+		s.resendTimer.enable(s.rto)
+	case !s.resendTimer.enabled():
+		s.resendTimer.enable(s.rto)
+	}
+	return err
+}
+
+// sendSegmentFromView transmits a segment built from the given payload, flags
+// and sequence number, recording the send time and the ACK number it carries.
+func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags byte, seq seqnum.Value) *tcpip.Error {
+	now := time.Now()
+	s.lastSendTime = now
+	if s.rttMeasureSeqNum == seq {
+		// The segment at rttMeasureSeqNum is the one timed for the RTT estimate.
+		s.rttMeasureTime = now
+	}
+
+	// Remember the max sent ack.
+	ack, wnd := s.ep.rcv.getSendParams()
+	s.maxSentAck = ack
+	return s.ep.sendRaw(data, flags, seq, ack, wnd)
+}
diff --git a/pkg/tcpip/transport/tcp/snd_state.go b/pkg/tcpip/transport/tcp/snd_state.go
new file mode 100644
index 000000000..8b20c3455
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/snd_state.go
@@ -0,0 +1,60 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+)
+
+// +stateify savable
+type unixTime struct {
+	second int64 // first argument handed to time.Unix by the sender's load* methods
+	nano   int64 // second argument handed to time.Unix by the sender's load* methods
+}
+
+// saveLastSendTime is invoked by stateify; it saves lastSendTime as whole seconds plus the sub-second nanoseconds.
+func (s *sender) saveLastSendTime() unixTime {
+	return unixTime{s.lastSendTime.Unix(), int64(s.lastSendTime.Nanosecond())} // not UnixNano(): time.Unix(sec, nsec) adds nsec to sec on load
+}
+
+// loadLastSendTime is invoked by stateify; it restores lastSendTime from the saved unixTime.
+func (s *sender) loadLastSendTime(unix unixTime) {
+	s.lastSendTime = time.Unix(unix.second, unix.nano) // time.Unix sums its two arguments
+}
+
+// saveRttMeasureTime is invoked by stateify; it saves rttMeasureTime as whole seconds plus the sub-second nanoseconds.
+func (s *sender) saveRttMeasureTime() unixTime {
+	return unixTime{s.rttMeasureTime.Unix(), int64(s.rttMeasureTime.Nanosecond())} // not UnixNano(): time.Unix(sec, nsec) adds nsec to sec on load
+}
+
+// loadRttMeasureTime is invoked by stateify; it restores rttMeasureTime from the saved unixTime.
+func (s *sender) loadRttMeasureTime(unix unixTime) {
+	s.rttMeasureTime = time.Unix(unix.second, unix.nano) // time.Unix sums its two arguments
+}
+
+// afterLoad is invoked by stateify; it initializes resendTimer with the sender's resendWaker.
+func (s *sender) afterLoad() {
+	s.resendTimer.init(&s.resendWaker)
+}
+
+// saveFirstRetransmittedSegXmitTime is invoked by stateify; it saves whole seconds plus the sub-second nanoseconds.
+func (s *sender) saveFirstRetransmittedSegXmitTime() unixTime {
+	return unixTime{s.firstRetransmittedSegXmitTime.Unix(), int64(s.firstRetransmittedSegXmitTime.Nanosecond())} // not UnixNano(): time.Unix adds nsec to sec on load
+}
+
+// loadFirstRetransmittedSegXmitTime is invoked by stateify; it restores the field from the saved unixTime.
+func (s *sender) loadFirstRetransmittedSegXmitTime(unix unixTime) {
+	s.firstRetransmittedSegXmitTime = time.Unix(unix.second, unix.nano) // time.Unix sums its two arguments
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
new file mode 100644
index 000000000..b9993ce1a
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/tcp_noracedetector_test.go
@@ -0,0 +1,550 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// These tests are flaky when run under the go race detector due to some
+// iterations taking long enough that the retransmit timer can kick in causing
+// the congestion window measurements to fail due to extra packets etc.
+//
+// +build !race
+
+package tcp_test
+
+import (
+	"fmt"
+	"math"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+// TestFastRecovery checks the packet trains and retransmit stats of a sender taken through fast recovery by duplicate ACKs.
+func TestFastRecovery(t *testing.T) {
+	maxPayload := 32
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	const iterations = 3
+	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+	for i := range data {
+		data[i] = byte(i)
+	}
+
+	// Write all the data in one shot. Packets will only be written at the
+	// MTU size though.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Do slow start for a few iterations.
+	expected := tcp.InitialCwnd
+	bytesRead := 0
+	for i := 0; i < iterations; i++ {
+		expected = tcp.InitialCwnd << uint(i)
+		if i > 0 {
+			// Acknowledge all the data received so far if not on
+			// first iteration.
+			c.SendAck(790, bytesRead)
+		}
+
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+	}
+
+	// Send 3 duplicate acks. This should force an immediate retransmit of
+	// the pending packet and put the sender into fast recovery.
+	rtxOffset := bytesRead - maxPayload*expected
+	for i := 0; i < 3; i++ {
+		c.SendAck(790, rtxOffset)
+	}
+
+	// Receive the retransmitted packet.
+	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+	// Wait before checking metrics.
+	metricPollFn := func() error {
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %d, want = %d", got, want)
+		}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmits.Value = %d, want = %d", got, want)
+		}
+		if got, want := c.Stack().Stats().TCP.FastRecovery.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRecovery.Value = %d, want = %d", got, want)
+		}
+		return nil
+	}
+
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
+	}
+
+	// Now send 7 more duplicate acks. Each of these should cause a window
+	// inflation by 1 and cause the sender to send an extra packet.
+	for i := 0; i < 7; i++ {
+		c.SendAck(790, rtxOffset)
+	}
+
+	recoverPoint := bytesRead
+
+	// Ensure no new packets arrive.
+	c.CheckNoPacketTimeout("More packets received than expected during recovery after dupacks for this cwnd.",
+		50*time.Millisecond)
+
+	// Acknowledge half of the pending data.
+	rtxOffset = bytesRead - expected*maxPayload/2
+	c.SendAck(790, rtxOffset)
+
+	// Receive the retransmit due to partial ack.
+	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+	// Wait before checking metrics.
+	metricPollFn = func() error {
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(2); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %d, want = %d", got, want)
+		}
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(2); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmits.Value = %d, want = %d", got, want)
+		}
+		return nil
+	}
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
+	}
+
+	// Receive the 10 extra packets that should have been released due to
+	// the congestion window inflation in recovery.
+	for i := 0; i < 10; i++ {
+		c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+		bytesRead += maxPayload
+	}
+
+	// A partial ACK during recovery should reduce congestion window by the
+	// number acked. Since we had "expected" packets outstanding before sending
+	// partial ack and we acked expected/2 , the cwnd and outstanding should
+	// be expected/2 + 10 (7 dupAcks + 3 for the original 3 dupacks that triggered
+	// fast recovery). Which means the sender should not send any more packets
+	// till we ack this one.
+	c.CheckNoPacketTimeout("More packets received than expected during recovery after partial ack for this cwnd.",
+		50*time.Millisecond)
+
+	// Acknowledge all pending data to recover point.
+	c.SendAck(790, recoverPoint)
+
+	// At this point, the cwnd should reset to expected/2 and there are 10
+	// packets outstanding.
+	//
+	// NOTE: Technically netstack is incorrect in that we adjust the cwnd on
+	// the same segment that takes us out of recovery. But because of that
+	// the actual cwnd at exit of recovery will be expected/2 + 1 as we
+	// acked a cwnd worth of packets which will increase the cwnd further by
+	// 1 in congestion avoidance.
+	//
+	// Now in the first iteration since there are 10 packets outstanding.
+	// We would expect to get expected/2 +1 - 10 packets. But subsequent
+	// iterations will send us expected/2 + 1 + 1 (per iteration).
+	expected = expected/2 + 1 - 10
+	for i := 0; i < iterations; i++ {
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout(fmt.Sprintf("More packets received(after deflation) than expected %d for this cwnd.", expected), 50*time.Millisecond)
+
+		// Acknowledge all the data received so far.
+		c.SendAck(790, bytesRead)
+
+		// In congestion avoidance, the packets trains increase by 1 in
+		// each iteration.
+		if i == 0 {
+			// After the first iteration we expect to get the full
+			// congestion window worth of packets in every
+			// iteration.
+			expected += 10
+		}
+		expected++
+	}
+}
+
+func TestExponentialIncreaseDuringSlowStart(t *testing.T) {
+	maxPayload := 32
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	const iterations = 3
+	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+	for i := range data {
+		data[i] = byte(i)
+	}
+
+	// Write all the data in one shot. Packets will only be written at the
+	// MTU size though.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	expected := tcp.InitialCwnd
+	bytesRead := 0
+	for i := 0; i < iterations; i++ {
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+
+		// Acknowledge all the data received so far.
+		c.SendAck(790, bytesRead)
+
+		// Slow start: expect twice as many packets on the next iteration.
+		expected *= 2
+	}
+}
+
+func TestCongestionAvoidance(t *testing.T) {
+	maxPayload := 32
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	const iterations = 3
+	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+	for i := range data {
+		data[i] = byte(i)
+	}
+
+	// Write all the data in one shot. Packets will only be written at the
+	// MTU size though.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Do slow start for a few iterations.
+	expected := tcp.InitialCwnd
+	bytesRead := 0
+	for i := 0; i < iterations; i++ {
+		expected = tcp.InitialCwnd << uint(i)
+		if i > 0 {
+			// Acknowledge all the data received so far if not on
+			// first iteration.
+			c.SendAck(790, bytesRead)
+		}
+
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd (slow start phase).", 50*time.Millisecond)
+	}
+
+	// Don't acknowledge the first packet of the last packet train. Let's
+	// wait for them to time out, which will trigger a restart of slow
+	// start, and initialization of ssthresh to cwnd/2.
+	rtxOffset := bytesRead - maxPayload*expected
+	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+	// Acknowledge all the data received so far.
+	c.SendAck(790, bytesRead)
+
+	// This part is tricky: when the timeout happened, we had "expected"
+	// packets pending, cwnd reset to 1, and ssthresh set to expected/2.
+	// By acknowledging "expected" packets, the slow-start part will
+	// increase cwnd to expected/2 (which "consumes" expected/2-1 of the
+	// acknowledgements), then the congestion avoidance part will consume
+	// an extra expected/2 acks to take cwnd to expected/2 + 1. One ack
+	// remains in the "ack count" (which will cause cwnd to be incremented
+	// once it reaches cwnd acks).
+	//
+	// So we're straight into congestion avoidance with cwnd set to
+	// expected/2 + 1.
+	//
+	// Check that packets trains of cwnd packets are sent, and that cwnd is
+	// incremented by 1 after we acknowledge each packet.
+	expected = expected/2 + 1
+	for i := 0; i < iterations; i++ {
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd (congestion avoidance phase).", 50*time.Millisecond)
+
+		// Acknowledge all the data received so far.
+		c.SendAck(790, bytesRead)
+
+		// In congestion avoidance, the packet trains increase by 1 in
+		// each iteration.
+		expected++
+	}
+}
+
+// cubicCwnd computes an approximate CUBIC congestion window from the
+// starting window origCwnd, wMax, the time of the last congestion event
+// and the smoothed RTT.
+func cubicCwnd(origCwnd int, wMax int, congEventTime time.Time, sRTT time.Duration) int {
+	window := float64(origCwnd)
+	// Callers wait 50ms between iterations, so the sRTT computed by cubic
+	// should be close to 50ms.
+	sinceEvent := (time.Since(congEventTime) + sRTT).Seconds()
+	inflection := math.Cbrt(float64(wMax) * 0.3 / 0.7)
+	target := 0.4*math.Pow(sinceEvent-inflection, 3) + float64(wMax)
+	return int(window + (target-window)/window)
+}
+
+func TestCubicCongestionAvoidance(t *testing.T) {
+	maxPayload := 32
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+	defer c.Cleanup()
+
+	enableCUBIC(t, c)
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	const iterations = 3
+	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+
+	for i := range data {
+		data[i] = byte(i)
+	}
+
+	// Write all the data in one shot. Packets will only be written at the
+	// MTU size though.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Do slow start for a few iterations.
+	expected := tcp.InitialCwnd
+	bytesRead := 0
+	for i := 0; i < iterations; i++ {
+		expected = tcp.InitialCwnd << uint(i)
+		if i > 0 {
+			// Acknowledge all the data received so far if not on
+			// first iteration.
+			c.SendAck(790, bytesRead)
+		}
+
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd (during slow-start phase).", 50*time.Millisecond)
+	}
+
+	// Don't acknowledge the first packet of the last packet train. Let's
+	// wait for them to time out, which will trigger a restart of slow
+	// start, and initialization of ssthresh to cwnd * 0.7.
+	rtxOffset := bytesRead - maxPayload*expected
+	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+	// Acknowledge all pending data.
+	c.SendAck(790, bytesRead)
+
+	// Store away the time we sent the ACK and assuming a 200ms RTO
+	// we estimate that the sender will have an RTO 200ms from now
+	// and go back into slow start.
+	packetDropTime := time.Now().Add(200 * time.Millisecond)
+
+	// This part is tricky: when the timeout happened, we had "expected"
+	// packets pending, cwnd reset to 1, and ssthresh set to expected * 0.7.
+	// By acknowledging "expected" packets, the slow-start part will
+	// increase cwnd to ssthresh (expected * 0.7), essentially putting the
+	// connection straight into congestion avoidance.
+	wMax := expected
+	// Lower expected as per cubic spec after a congestion event.
+	expected = int(float64(expected) * 0.7)
+	cwnd := expected
+	for i := 0; i < iterations; i++ {
+		// Cubic grows window independent of ACKs. Cubic Window growth
+		// is a function of time elapsed since last congestion event.
+		// As a result the congestion window does not grow
+		// deterministically in response to ACKs.
+		//
+		// Roughly estimate the sender's cwnd from packetDropTime.
+		// NOTE(review): := shadows the outer cwnd each iteration - confirm.
+		cwnd := cubicCwnd(cwnd, wMax, packetDropTime, 50*time.Millisecond)
+
+		packetsExpected := cwnd
+		for j := 0; j < packetsExpected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+		t.Logf("expected packets received, next trying to receive any extra packets that may come")
+
+		// If our estimate was correct there should be no more pending packets.
+		// We attempt to read a packet a few times with a short sleep in between
+		// to ensure that we don't see the sender send any unexpected packets.
+		unexpectedPackets := 0
+		for {
+			gotPacket := c.ReceiveNonBlockingAndCheckPacket(data, bytesRead, maxPayload)
+			if !gotPacket {
+				break
+			}
+			bytesRead += maxPayload
+			unexpectedPackets++
+			time.Sleep(1 * time.Millisecond)
+		}
+		if unexpectedPackets != 0 {
+			t.Fatalf("received %d unexpected packets for iteration %d", unexpectedPackets, i)
+		}
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd(congestion avoidance)", 5*time.Millisecond)
+
+		// Acknowledge all the data received so far.
+		c.SendAck(790, bytesRead)
+	}
+}
+
+func TestRetransmit(t *testing.T) {
+	maxPayload := 32
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	const iterations = 3
+	data := buffer.NewView(maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+	for i := range data {
+		data[i] = byte(i)
+	}
+
+	// Write all the data in two shots. Packets will only be written at the
+	// MTU size though.
+	half := data[:len(data)/2]
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+	half = data[len(data)/2:]
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(half), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Do slow start for a few iterations.
+	expected := tcp.InitialCwnd
+	bytesRead := 0
+	for i := 0; i < iterations; i++ {
+		expected = tcp.InitialCwnd << uint(i)
+		if i > 0 {
+			// Acknowledge all the data received so far if not on
+			// first iteration.
+			c.SendAck(790, bytesRead)
+		}
+
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacket(data, bytesRead, maxPayload)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// Keep this wait short, otherwise the retransmit timer may fire.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+	}
+
+	// Wait for the first packet of the last train to be retransmitted.
+	rtxOffset := bytesRead - maxPayload*expected
+	c.ReceiveAndCheckPacket(data, rtxOffset, maxPayload)
+
+	metricPollFn := func() error {
+		if got, want := c.Stack().Stats().TCP.Timeouts.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Timeouts.Value = %d, want = %d", got, want)
+		}
+
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmits.Value = %d, want = %d", got, want)
+		}
+
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Timeouts.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP SendErrors.Timeouts.Value = %d, want = %d", got, want)
+		}
+
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP stats SendErrors.Retransmits.Value = %d, want = %d", got, want)
+		}
+
+		if got, want := c.Stack().Stats().TCP.SlowStartRetransmits.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.SlowStartRetransmits.Value = %d, want = %d", got, want)
+		}
+
+		return nil
+	}
+
+	// Poll, since the counters may not be updated yet when the packet arrives.
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
+	}
+
+	// Acknowledge half of the pending data.
+	rtxOffset = bytesRead - expected*maxPayload/2
+	c.SendAck(790, rtxOffset)
+
+	// Receive the remaining data, making sure that acknowledged data is not
+	// retransmitted.
+	for offset := rtxOffset; offset < len(data); offset += maxPayload {
+		c.ReceiveAndCheckPacket(data, offset, maxPayload)
+		c.SendAck(790, offset+maxPayload)
+	}
+
+	c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_sack_test.go b/pkg/tcpip/transport/tcp/tcp_sack_test.go
new file mode 100644
index 000000000..99521f0c1
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/tcp_sack_test.go
@@ -0,0 +1,589 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_test
+
+import (
+	"fmt"
+	"log"
+	"reflect"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+)
+
+// createConnectedWithSACKPermittedOption calls c.CreateConnectedWithOptions
+// with SACKPermitted set to c.SACKEnabled(), so the option is requested only
+// when the context reports SACK as enabled, and returns its RawEndpoint.
+func createConnectedWithSACKPermittedOption(c *context.Context) *context.RawEndpoint {
+	return c.CreateConnectedWithOptions(header.TCPSynOptions{SACKPermitted: c.SACKEnabled()})
+}
+
+// createConnectedWithSACKAndTS calls c.CreateConnectedWithOptions with TS
+// always set and SACKPermitted set to c.SACKEnabled().
+func createConnectedWithSACKAndTS(c *context.Context) *context.RawEndpoint {
+	return c.CreateConnectedWithOptions(header.TCPSynOptions{SACKPermitted: c.SACKEnabled(), TS: true})
+}
+
+func setStackSACKPermitted(t *testing.T, c *context.Context, enable bool) {
+	t.Helper() // Report failures at the caller's line; sets tcp.SACKEnabled(enable) on c's stack.
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(enable)); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(%t)) = %s", enable, err)
+	}
+}
+
+// TestSackPermittedConnect connects with SACKPermitted requested whenever the
+// stack enables SACK and checks that only then do ACKs carry SACK blocks.
+func TestSackPermittedConnect(t *testing.T) {
+	for _, sackEnabled := range []bool{false, true} {
+		t.Run(fmt.Sprintf("stack.sackEnabled: %v", sackEnabled), func(t *testing.T) {
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			setStackSACKPermitted(t, c, sackEnabled)
+			rep := createConnectedWithSACKPermittedOption(c)
+			data := []byte{1, 2, 3}
+
+			rep.SendPacket(data, nil)
+			savedSeqNum := rep.NextSeqNum
+			rep.VerifyACKNoSACK()
+
+			// Make an out of order packet and send it.
+			rep.NextSeqNum += 3
+			sackBlocks := []header.SACKBlock{
+				{rep.NextSeqNum, rep.NextSeqNum.Add(seqnum.Size(len(data)))},
+			}
+			rep.SendPacket(data, nil)
+
+			// Restore the saved sequence number so that the
+			// VerifyXXX calls use the right sequence number for
+			// checking ACK numbers.
+			rep.NextSeqNum = savedSeqNum
+			if sackEnabled {
+				rep.VerifyACKHasSACK(sackBlocks)
+			} else {
+				rep.VerifyACKNoSACK()
+			}
+
+			// Send the missing segment.
+			rep.SendPacket(data, nil)
+			// The ACK should contain the cumulative ACK for all 9
+			// bytes sent and no SACK blocks.
+			rep.NextSeqNum += 3
+			// Check that no SACK block is returned in the ACK.
+			rep.VerifyACKNoSACK()
+		})
+	}
+}
+
+// TestSackDisabledConnect connects without the SACKPermitted option and
+// verifies that no SACKs are sent for out of order segments.
+func TestSackDisabledConnect(t *testing.T) {
+	for _, sackEnabled := range []bool{false, true} {
+		t.Run(fmt.Sprintf("sackEnabled: %v", sackEnabled), func(t *testing.T) {
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			setStackSACKPermitted(t, c, sackEnabled)
+
+			rep := c.CreateConnectedWithOptions(header.TCPSynOptions{})
+
+			data := []byte{1, 2, 3}
+
+			rep.SendPacket(data, nil)
+			savedSeqNum := rep.NextSeqNum
+			rep.VerifyACKNoSACK()
+
+			// Make an out of order packet and send it.
+			rep.NextSeqNum += 3
+			rep.SendPacket(data, nil)
+
+			// Restore the saved sequence number: the ACK should carry
+			// the older sequence number and no SACK blocks.
+			rep.NextSeqNum = savedSeqNum
+			rep.VerifyACKNoSACK()
+
+			// Send the missing segment.
+			rep.SendPacket(data, nil)
+			// The ACK should contain the cumulative ACK for all 9
+			// bytes sent and no SACK blocks.
+			rep.NextSeqNum += 3
+			// Check that no SACK block is returned in the ACK.
+			rep.VerifyACKNoSACK()
+		})
+	}
+}
+
+// TestSackPermittedAccept accepts and establishes a connection with the
+// SACKPermitted option enabled if the connection request specifies the
+// SACKPermitted option. In case of SYN cookies SACK should be disabled as we
+// don't encode the SACK information in the cookie.
+func TestSackPermittedAccept(t *testing.T) {
+	type testCase struct {
+		cookieEnabled bool
+		sackPermitted bool
+		wndScale      int
+		wndSize       uint16
+	}
+
+	testCases := []testCase{
+		// Fields: cookieEnabled, sackPermitted, wndScale, wndSize (wndSize is not read below).
+		{true, false, -1, 0xffff}, // When cookie is used window scaling is disabled.
+		{false, true, 5, 0x8000},  // 0x8000 * 2^5 = 1<<20 = 1MB window (the default).
+	}
+
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("test: %#v", tc), func(t *testing.T) {
+			for _, sackEnabled := range []bool{false, true} {
+				t.Run(fmt.Sprintf("test stack.sackEnabled: %v", sackEnabled), func(t *testing.T) {
+					c := context.New(t, defaultMTU)
+					defer c.Cleanup()
+
+					if tc.cookieEnabled {
+						// Set the SynRcvd threshold to
+						// zero to force a syn cookie
+						// based accept to happen.
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						}
+					}
+					setStackSACKPermitted(t, c, sackEnabled)
+
+					rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, SACKPermitted: tc.sackPermitted})
+					// First verify that in-order data is
+					// ACKed without any SACK blocks.
+					data := []byte{1, 2, 3}
+					rep.SendPacket(data, nil)
+					rep.VerifyACKNoSACK()
+
+					savedSeqNum := rep.NextSeqNum
+
+					// Make an out of order packet and send
+					// it.
+					rep.NextSeqNum += 3
+					sackBlocks := []header.SACKBlock{
+						{rep.NextSeqNum, rep.NextSeqNum.Add(seqnum.Size(len(data)))},
+					}
+					rep.SendPacket(data, nil)
+
+					// The ACK should contain the older
+					// sequence number.
+					rep.NextSeqNum = savedSeqNum
+					if sackEnabled && tc.sackPermitted {
+						rep.VerifyACKHasSACK(sackBlocks)
+					} else {
+						rep.VerifyACKNoSACK()
+					}
+
+					// Send the missing segment.
+					rep.SendPacket(data, nil)
+					// The ACK should contain the cumulative
+					// ACK for all 9 bytes sent and no SACK
+					// blocks.
+					rep.NextSeqNum += 3
+					// Check that no SACK block is returned
+					// in the ACK.
+					rep.VerifyACKNoSACK()
+				})
+			}
+		})
+	}
+}
+
+// TestSackDisabledAccept accepts and establishes a connection with
+// the SACKPermitted option disabled and verifies that no SACKs are
+// sent for out of order packets.
+func TestSackDisabledAccept(t *testing.T) {
+	type testCase struct {
+		cookieEnabled bool
+		wndScale      int
+		wndSize       uint16
+	}
+
+	testCases := []testCase{
+		// Fields: cookieEnabled, wndScale, wndSize (wndSize is not read below).
+		{true, -1, 0xffff}, // When cookie is used window scaling is disabled.
+		{false, 5, 0x8000}, // 0x8000 * 2^5 = 1<<20 = 1MB window (the default).
+	}
+
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("test: %#v", tc), func(t *testing.T) {
+			for _, sackEnabled := range []bool{false, true} {
+				t.Run(fmt.Sprintf("test: sackEnabled: %v", sackEnabled), func(t *testing.T) {
+					c := context.New(t, defaultMTU)
+					defer c.Cleanup()
+
+					if tc.cookieEnabled {
+						// Set the SynRcvd threshold to
+						// zero to force a syn cookie
+						// based accept to happen.
+						if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+							t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+						}
+					}
+
+					setStackSACKPermitted(t, c, sackEnabled)
+
+					rep := c.AcceptWithOptions(tc.wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+					// First verify that in-order data is
+					// ACKed without any SACK blocks.
+					data := []byte{1, 2, 3}
+					rep.SendPacket(data, nil)
+					rep.VerifyACKNoSACK()
+					savedSeqNum := rep.NextSeqNum
+
+					// Make an out of order packet and send
+					// it.
+					rep.NextSeqNum += 3
+					rep.SendPacket(data, nil)
+
+					// The ACK should contain the older
+					// sequence number and no SACK blocks.
+					rep.NextSeqNum = savedSeqNum
+					rep.VerifyACKNoSACK()
+
+					// Send the missing segment.
+					rep.SendPacket(data, nil)
+					// The ACK should contain the cumulative
+					// ACK for all 9 bytes sent and no SACK
+					// blocks.
+					rep.NextSeqNum += 3
+					// Check that no SACK block is returned
+					// in the ACK.
+					rep.VerifyACKNoSACK()
+				})
+			}
+		})
+	}
+}
+
+func TestUpdateSACKBlocks(t *testing.T) {
+	testCases := []struct {
+		segStart   seqnum.Value
+		segEnd     seqnum.Value
+		rcvNxt     seqnum.Value
+		sackBlocks []header.SACKBlock
+		updated    []header.SACKBlock
+	}{
+		// Trivial cases where current SACK block list is empty and we
+		// have an out of order delivery.
+		{10, 11, 2, []header.SACKBlock{}, []header.SACKBlock{{10, 11}}},
+		{10, 12, 2, []header.SACKBlock{}, []header.SACKBlock{{10, 12}}},
+		{10, 20, 2, []header.SACKBlock{}, []header.SACKBlock{{10, 20}}},
+
+		// Cases where current SACK block list is not empty and we have
+		// an out of order delivery. Tests that the updated SACK block
+		// list has the first block as the one that contains the new
+		// SACK block representing the segment that was just delivered.
+		{10, 11, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{10, 11}, {12, 20}}},
+		{24, 30, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{24, 30}, {12, 20}}},
+		{24, 30, 9, []header.SACKBlock{{12, 20}, {32, 40}}, []header.SACKBlock{{24, 30}, {12, 20}, {32, 40}}},
+
+		// Ensure that we only retain header.MaxSACKBlocks and drop the
+		// last block in the list if adding a new block exceeds
+		// header.MaxSACKBlocks.
+		{24, 30, 9,
+			[]header.SACKBlock{{12, 20}, {32, 40}, {42, 50}, {52, 60}, {62, 70}, {72, 80}},
+			[]header.SACKBlock{{24, 30}, {12, 20}, {32, 40}, {42, 50}, {52, 60}, {62, 70}}},
+
+		// Cases where segment extends an existing SACK block.
+		{10, 12, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{10, 20}}},
+		{10, 22, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{10, 22}}},
+		{10, 22, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{10, 22}}}, // Same as the previous case.
+		{15, 22, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{12, 22}}},
+		{15, 25, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{12, 25}}},
+		{11, 25, 9, []header.SACKBlock{{12, 20}}, []header.SACKBlock{{11, 25}}},
+		{10, 12, 9, []header.SACKBlock{{12, 20}, {32, 40}}, []header.SACKBlock{{10, 20}, {32, 40}}},
+		{10, 22, 9, []header.SACKBlock{{12, 20}, {32, 40}}, []header.SACKBlock{{10, 22}, {32, 40}}},
+		{10, 22, 9, []header.SACKBlock{{12, 20}, {32, 40}}, []header.SACKBlock{{10, 22}, {32, 40}}}, // Same as the previous case.
+		{15, 22, 9, []header.SACKBlock{{12, 20}, {32, 40}}, []header.SACKBlock{{12, 22}, {32, 40}}},
+		{15, 25, 9, []header.SACKBlock{{12, 20}, {32, 40}}, []header.SACKBlock{{12, 25}, {32, 40}}},
+		{11, 25, 9, []header.SACKBlock{{12, 20}, {32, 40}}, []header.SACKBlock{{11, 25}, {32, 40}}},
+
+		// Cases where segment contains rcvNxt.
+		{10, 20, 15, []header.SACKBlock{{20, 30}, {40, 50}}, []header.SACKBlock{{40, 50}}},
+	}
+
+	for _, tc := range testCases {
+		var sack tcp.SACKInfo
+		copy(sack.Blocks[:], tc.sackBlocks)
+		sack.NumBlocks = len(tc.sackBlocks)
+		tcp.UpdateSACKBlocks(&sack, tc.segStart, tc.segEnd, tc.rcvNxt)
+		if got, want := sack.Blocks[:sack.NumBlocks], tc.updated; !reflect.DeepEqual(got, want) {
+			t.Errorf("UpdateSACKBlocks(%v, %v, %v, %v), got: %v, want: %v", tc.sackBlocks, tc.segStart, tc.segEnd, tc.rcvNxt, got, want)
+		}
+
+	}
+}
+
+// TestTrimSackBlockList is a table-driven test of tcp.TrimSACKBlockList. Per
+// the table below, blocks that end at or before rcvNxt are expected to be
+// removed and a block that straddles rcvNxt is expected to have its start
+// advanced to rcvNxt.
+func TestTrimSackBlockList(t *testing.T) {
+	testCases := []struct {
+		rcvNxt     seqnum.Value
+		sackBlocks []header.SACKBlock
+		trimmed    []header.SACKBlock
+	}{
+		// Simple cases where we trim whole entries (or, for the first
+		// case, nothing at all because rcvNxt precedes every block).
+		{2, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}},
+		{21, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{{22, 30}, {32, 40}}},
+		{31, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{{32, 40}}},
+		{40, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{}},
+		// Cases where we need to update a block.
+		{12, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{{12, 20}, {22, 30}, {32, 40}}},
+		{23, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{{23, 30}, {32, 40}}},
+		{33, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{{33, 40}}},
+		{41, []header.SACKBlock{{10, 20}, {22, 30}, {32, 40}}, []header.SACKBlock{}},
+	}
+	for _, tc := range testCases {
+		var sack tcp.SACKInfo
+		copy(sack.Blocks[:], tc.sackBlocks)
+		sack.NumBlocks = len(tc.sackBlocks)
+		tcp.TrimSACKBlockList(&sack, tc.rcvNxt)
+		if got, want := sack.Blocks[:sack.NumBlocks], tc.trimmed; !reflect.DeepEqual(got, want) {
+			// Name the function exactly as it is exported so the
+			// failure is greppable.
+			t.Errorf("TrimSACKBlockList(%v, %v), got: %v, want: %v", tc.sackBlocks, tc.rcvNxt, got, want)
+		}
+	}
+}
+
+// TestSACKRecovery drives a SACK- and timestamp-enabled connection through a
+// few rounds of slow start, forces loss recovery with duplicate ACKs carrying
+// SACK blocks, and checks the retransmissions, the TCP stat counters and the
+// packet trains emitted once recovery completes.
+func TestSACKRecovery(t *testing.T) {
+	const maxPayload = 10
+	// See: tcp.makeOptions for why tsOptionSize is set to 12 here.
+	const tsOptionSize = 12
+	// Enabling SACK means the payload size is reduced to account
+	// for the extra space required for the TCP options.
+	//
+	// We increase the MTU by 40 bytes to account for SACK and Timestamp
+	// options.
+	const maxTCPOptionSize = 40
+
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxTCPOptionSize+maxPayload))
+	defer c.Cleanup()
+
+	c.Stack().AddTCPProbe(func(s stack.TCPEndpointState) {
+		// We use log.Printf instead of t.Logf here because this probe
+		// can fire even when the test function has finished. This is
+		// because closing the endpoint in cleanup() does not mean the
+		// actual worker loop terminates immediately as it still has to
+		// do a full TCP shutdown. But this test can finish running
+		// before the shutdown is done. Using t.Logf in such a case
+		// causes the test to panic due to logging after test finished.
+		log.Printf("state: %+v\n", s)
+	})
+	setStackSACKPermitted(t, c, true)
+	createConnectedWithSACKAndTS(c)
+
+	const iterations = 3
+	data := buffer.NewView(2 * maxPayload * (tcp.InitialCwnd << (iterations + 1)))
+	for i := range data {
+		data[i] = byte(i)
+	}
+
+	// Write all the data in one shot. Packets will only be written at the
+	// MTU size though.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Do slow start for a few iterations.
+	expected := tcp.InitialCwnd
+	bytesRead := 0
+	for i := 0; i < iterations; i++ {
+		expected = tcp.InitialCwnd << uint(i)
+		if i > 0 {
+			// Acknowledge all the data received so far if not on
+			// first iteration.
+			c.SendAck(790, bytesRead)
+		}
+
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize)
+			bytesRead += maxPayload
+		}
+
+		// Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger a timeout.
+		c.CheckNoPacketTimeout("More packets received than expected for this cwnd.", 50*time.Millisecond)
+	}
+
+	// Send 3 duplicate acks. This should force an immediate retransmit of
+	// the pending packet and put the sender into fast recovery.
+	rtxOffset := bytesRead - maxPayload*expected
+	start := c.IRS.Add(seqnum.Size(rtxOffset) + 30 + 1)
+	end := start.Add(10)
+	for i := 0; i < 3; i++ {
+		c.SendAckWithSACK(790, rtxOffset, []header.SACKBlock{{start, end}})
+		end = end.Add(10)
+	}
+
+	// Receive the retransmitted packet.
+	c.ReceiveAndCheckPacketWithOptions(data, rtxOffset, maxPayload, tsOptionSize)
+
+	metricPollFn := func() error {
+		tcpStats := c.Stack().Stats().TCP
+		stats := []struct {
+			stat *tcpip.StatCounter
+			name string
+			want uint64
+		}{
+			{tcpStats.FastRetransmit, "stats.TCP.FastRetransmit", 1},
+			{tcpStats.Retransmits, "stats.TCP.Retransmits", 1},
+			{tcpStats.SACKRecovery, "stats.TCP.SACKRecovery", 1},
+			{tcpStats.FastRecovery, "stats.TCP.FastRecovery", 0},
+		}
+		for _, s := range stats {
+			if got, want := s.stat.Value(), s.want; got != want {
+				return fmt.Errorf("got %s.Value() = %d, want = %d", s.name, got, want)
+			}
+		}
+		return nil
+	}
+
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
+	}
+
+	// Now send 7 more duplicate ACKs. In SACK TCP dupAcks do not cause
+	// window inflation and sending of packets is completely handled by the
+	// SACK Recovery algorithm. We should see no packets being released, as
+	// the cwnd at this point after entering recovery should be half of the
+	// outstanding number of packets in flight.
+	for i := 0; i < 7; i++ {
+		c.SendAckWithSACK(790, rtxOffset, []header.SACKBlock{{start, end}})
+		end = end.Add(10)
+	}
+
+	// Remember how much data had been sent when recovery started; it is
+	// acknowledged in full further below to end recovery. (Named
+	// recoverPoint so as not to shadow the builtin recover.)
+	recoverPoint := bytesRead
+
+	// Ensure no new packets arrive.
+	c.CheckNoPacketTimeout("More packets received than expected during recovery after dupacks for this cwnd.",
+		50*time.Millisecond)
+
+	// Acknowledge half of the pending data. This along with the 10 sacked
+	// segments above should reduce the outstanding below the current
+	// congestion window allowing the sender to transmit data.
+	rtxOffset = bytesRead - expected*maxPayload/2
+
+	// Now send a partial ACK w/ a SACK block that indicates that the next 3
+	// segments are lost and we have received 6 segments after the lost
+	// segments. This should cause the sender to immediately transmit all 3
+	// segments in response to this ACK unlike in FastRecovery where only 1
+	// segment is retransmitted per ACK.
+	start = c.IRS.Add(seqnum.Size(rtxOffset) + 30 + 1)
+	end = start.Add(60)
+	c.SendAckWithSACK(790, rtxOffset, []header.SACKBlock{{start, end}})
+
+	// At this point, we acked expected/2 packets and we SACKED 6 packets and
+	// 3 segments were considered lost due to the SACK block we sent.
+	//
+	// So total packets outstanding can be calculated as follows after 7
+	// iterations of slow start -> 10/20/40/80/160/320/640. So expected
+	// should be 640 at start, then we went to recover at which point the
+	// cwnd should be set to 320 + 3 (for the 3 dupAcks which have left the
+	// network).
+	// Outstanding at this point after acking half the window
+	// (320 packets) will be:
+	//    outstanding = 640-320-6(due to SACK block)-3 = 311
+	//
+	// The last 3 is due to the fact that the first 3 packets after
+	// rtxOffset will be considered lost due to the SACK blocks sent.
+	//
+	// NOTE(review): the figures above (640, 320, 323, 311) are worked for
+	// 7 slow-start iterations, whereas this test sets iterations to 3 --
+	// the numbers look stale and should be refreshed; confirm against
+	// tcp.InitialCwnd.
+	//
+	// Receive the retransmit due to partial ack.
+
+	c.ReceiveAndCheckPacketWithOptions(data, rtxOffset, maxPayload, tsOptionSize)
+	// Receive the 2 extra packets that should have been retransmitted as
+	// those should be considered lost and immediately retransmitted based
+	// on the SACK information in the previous ACK sent above.
+	for i := 0; i < 2; i++ {
+		c.ReceiveAndCheckPacketWithOptions(data, rtxOffset+maxPayload*(i+1), maxPayload, tsOptionSize)
+	}
+
+	// Now we should get 9 more new unsent packets as the cwnd is 323 and
+	// outstanding is 311.
+	for i := 0; i < 9; i++ {
+		c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize)
+		bytesRead += maxPayload
+	}
+
+	metricPollFn = func() error {
+		// In SACK recovery only the first segment is fast retransmitted when
+		// entering recovery.
+		if got, want := c.Stack().Stats().TCP.FastRetransmit.Value(), uint64(1); got != want {
+			return fmt.Errorf("got stats.TCP.FastRetransmit.Value = %d, want = %d", got, want)
+		}
+
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.FastRetransmit.Value(), uint64(1); got != want {
+			return fmt.Errorf("got EP stats SendErrors.FastRetransmit = %d, want = %d", got, want)
+		}
+
+		if got, want := c.Stack().Stats().TCP.Retransmits.Value(), uint64(4); got != want {
+			return fmt.Errorf("got stats.TCP.Retransmits.Value = %d, want = %d", got, want)
+		}
+
+		if got, want := c.EP.Stats().(*tcp.Stats).SendErrors.Retransmits.Value(), uint64(4); got != want {
+			return fmt.Errorf("got EP stats Stats.SendErrors.Retransmits = %d, want = %d", got, want)
+		}
+		return nil
+	}
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
+	}
+
+	c.CheckNoPacketTimeout("More packets received than expected during recovery after partial ack for this cwnd.", 50*time.Millisecond)
+
+	// Acknowledge all pending data to recover point.
+	c.SendAck(790, recoverPoint)
+
+	// At this point, the cwnd should reset to expected/2 and there are 9
+	// packets outstanding.
+	//
+	// Now in the first iteration since there are 9 packets outstanding.
+	// We would expect to get expected/2  - 9 packets. But subsequent
+	// iterations will send us expected/2  + 1 (per iteration).
+	expected = expected/2 - 9
+	for i := 0; i < iterations; i++ {
+		// Read all packets expected on this iteration. Don't
+		// acknowledge any of them just yet, so that we can measure the
+		// congestion window.
+		for j := 0; j < expected; j++ {
+			c.ReceiveAndCheckPacketWithOptions(data, bytesRead, maxPayload, tsOptionSize)
+			bytesRead += maxPayload
+		}
+		// Check we don't receive any more packets on this iteration.
+		// The timeout can't be too high or we'll trigger a timeout.
+		c.CheckNoPacketTimeout(fmt.Sprintf("More packets received(after deflation) than expected %d for this cwnd and iteration: %d.", expected, i), 50*time.Millisecond)
+
+		// Acknowledge all the data received so far.
+		c.SendAck(790, bytesRead)
+
+		// In congestion avoidance, the packet trains increase by 1 in
+		// each iteration.
+		if i == 0 {
+			// After the first iteration we expect to get the full
+			// congestion window worth of packets in every
+			// iteration.
+			expected += 9
+		}
+		expected++
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_test.go b/pkg/tcpip/transport/tcp/tcp_test.go
new file mode 100644
index 000000000..e67ec42b1
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/tcp_test.go
@@ -0,0 +1,7258 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_test
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// defaultMTU is the MTU, in bytes, used throughout the tests, except
+	// where another value is explicitly used. It is chosen to match the MTU
+	// of loopback interfaces on Linux systems.
+	defaultMTU = 65535
+
+	// defaultIPv4MSS is the MSS sent by the network stack in SYN/SYN-ACK for an
+	// IPv4 endpoint when the MTU is set to defaultMTU in the test, i.e.
+	// defaultMTU less the minimum IPv4 and TCP header sizes.
+	defaultIPv4MSS = defaultMTU - header.IPv4MinimumSize - header.TCPMinimumSize
+)
+
+// TestGiveUpConnect starts an active connect, closes the endpoint before the
+// handshake completes and checks that the endpoint reports ErrAborted and that
+// the failed-connection and current-established counters reflect the aborted
+// attempt.
+func TestGiveUpConnect(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	var wq waiter.Queue
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	// Register for notification, then start connection attempt.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&waitEntry, waiter.EventOut)
+	defer wq.EventUnregister(&waitEntry)
+
+	if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got ep.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted)
+	}
+
+	// Close the connection, wait for completion.
+	ep.Close()
+
+	// Wait for ep to become writable. Bound the wait so that a missing
+	// notification fails the test with a message instead of hanging it.
+	select {
+	case <-notifyCh:
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for the endpoint to become writable")
+	}
+	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != tcpip.ErrAborted {
+		t.Fatalf("got ep.GetSockOpt(tcpip.ErrorOption{}) = %s, want = %s", err, tcpip.ErrAborted)
+	}
+
+	// Call Connect again to retrieve the handshake failure status
+	// and stats updates.
+	if err := ep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAborted {
+		t.Fatalf("got ep.Connect(...) = %s, want = %s", err, tcpip.ErrAborted)
+	}
+
+	if got := c.Stack().Stats().TCP.FailedConnectionAttempts.Value(); got != 1 {
+		t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %d, want = 1", got)
+	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)
+	}
+}
+
+// TestConnectIncrementActiveConnection checks that establishing a connection
+// via c.CreateConnected increments the stack-wide ActiveConnectionOpenings
+// counter by exactly one.
+func TestConnectIncrementActiveConnection(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	stats := c.Stack().Stats()
+	want := stats.TCP.ActiveConnectionOpenings.Value() + 1
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	if got := stats.TCP.ActiveConnectionOpenings.Value(); got != want {
+		t.Errorf("got stats.TCP.ActiveConnectionOpenings.Value() = %d, want = %d", got, want)
+	}
+}
+
+// TestConnectDoesNotIncrementFailedConnectionAttempts checks that a successful
+// connect leaves the FailedConnectionAttempts counter at its prior value, both
+// on the stack and on the connected endpoint.
+func TestConnectDoesNotIncrementFailedConnectionAttempts(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Snapshot the stack-wide counter before connecting.
+	stackStats := c.Stack().Stats()
+	baseline := stackStats.TCP.FailedConnectionAttempts.Value()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	stackGot := stackStats.TCP.FailedConnectionAttempts.Value()
+	if stackGot != baseline {
+		t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %d, want = %d", stackGot, baseline)
+	}
+
+	// The per-endpoint counter is compared against the same baseline.
+	epStats := c.EP.Stats().(*tcp.Stats)
+	if epGot := epStats.FailedConnectionAttempts.Value(); epGot != baseline {
+		t.Errorf("got EP stats.FailedConnectionAttempts = %d, want = %d", epGot, baseline)
+	}
+}
+
+// TestActiveFailedConnectionAttemptIncrement checks that a Connect which fails
+// with ErrNoRoute bumps FailedConnectionAttempts by one, both stack-wide and
+// on the endpoint.
+func TestActiveFailedConnectionAttemptIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	stackStats := c.Stack().Stats()
+	newEP, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	c.EP = newEP
+	want := stackStats.TCP.FailedConnectionAttempts.Value() + 1
+
+	// The test expects connecting through NIC 2 to fail for lack of a route.
+	dst := tcpip.FullAddress{NIC: 2, Addr: context.TestAddr, Port: context.TestPort}
+	if err := c.EP.Connect(dst); err != tcpip.ErrNoRoute {
+		t.Errorf("got c.EP.Connect(...) = %s, want = %s", err, tcpip.ErrNoRoute)
+	}
+
+	stackGot := stackStats.TCP.FailedConnectionAttempts.Value()
+	if stackGot != want {
+		t.Errorf("got stats.TCP.FailedConnectionAttempts.Value() = %d, want = %d", stackGot, want)
+	}
+
+	epStats := c.EP.Stats().(*tcp.Stats)
+	if epGot := epStats.FailedConnectionAttempts.Value(); epGot != want {
+		t.Errorf("got EP stats FailedConnectionAttempts = %d, want = %d", epGot, want)
+	}
+}
+
+func TestTCPSegmentsSentIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	stats := c.Stack().Stats()
+	// SYN and ACK
+	want := stats.TCP.SegmentsSent.Value() + 2
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	if got := stats.TCP.SegmentsSent.Value(); got != want {
+		t.Errorf("got stats.TCP.SegmentsSent.Value() = %d, want = %d", got, want)
+	}
+	if got := c.EP.Stats().(*tcp.Stats).SegmentsSent.Value(); got != want {
+		t.Errorf("got EP stats SegmentsSent.Value() = %d, want = %d", got, want)
+	}
+}
+
+// TestTCPResetsSentIncrement checks that the stack-wide ResetsSent counter is
+// incremented by one when the ACK completing a passive handshake carries an
+// unexpected AckNum.
+func TestTCPResetsSentIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	stats := c.Stack().Stats()
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	// Baseline the counter that is actually asserted on below (ResetsSent),
+	// so the expectation does not depend on SegmentsSent happening to hold
+	// the same value.
+	want := stats.TCP.ResetsSent.Value() + 1
+
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		// If the AckNum is not the increment of the last sequence number, a RST
+		// segment is sent back in response.
+		AckNum: c.IRS + 2,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Consume the packet the stack sends in response to the bad ACK.
+	c.GetPacket()
+
+	// The counter may be updated asynchronously, so poll for it.
+	metricPollFn := func() error {
+		if got := stats.TCP.ResetsSent.Value(); got != want {
+			return fmt.Errorf("got stats.TCP.ResetsSent.Value() = %d, want = %d", got, want)
+		}
+		return nil
+	}
+	if err := testutil.Poll(metricPollFn, 1*time.Second); err != nil {
+		t.Error(err)
+	}
+}
+
+// TestTCPResetSentForACKWhenNotUsingSynCookies checks that the stack generates
+// a RST if an ACK is received on the listening socket for which there is no
+// active handshake in progress and we are not using SYN cookies.
+func TestTCPResetSentForACKWhenNotUsingSynCookies(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create a listening endpoint bound to context.StackPort.
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	} else if err != nil {
+		// Any other error leaves c.EP unusable; fail here rather than
+		// dereferencing it at c.EP.Close() below.
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	// Lower stackwide TIME_WAIT timeout so that the reservations
+	// are released instantly on Close.
+	tcpTW := tcpip.TCPTimeWaitTimeoutOption(1 * time.Millisecond)
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpTW); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(%d, %#v) = %s", tcp.ProtocolNumber, tcpTW, err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	c.GetPacket()
+
+	// Since an active close was done we need to wait for a little more than
+	// the TIME_WAIT timeout configured above for the port reservations to
+	// be released and the socket to move to a CLOSED state.
+	time.Sleep(20 * time.Millisecond)
+
+	// Now resend the same ACK, this ACK should generate a RST as there
+	// should be no endpoint in SYN-RCVD state and we are not using
+	// syn-cookies yet. The reason we send the same ACK is we need a valid
+	// cookie(IRS) generated by the netstack without which the ACK will be
+	// rejected.
+	c.SendPacket(nil, ackHeaders)
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+}
+
+// TestTCPResetsReceivedIncrement checks that a RST delivered to an established
+// connection raises the stack-wide ResetsReceived counter by one.
+func TestTCPResetsReceivedIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	tcpStats := c.Stack().Stats().TCP
+	want := tcpStats.ResetsReceived.Value() + 1
+
+	iss := seqnum.Value(789)
+	rcvWnd := seqnum.Size(30000)
+	c.CreateConnected(iss, rcvWnd, -1 /* epRcvBuf */)
+
+	// Inject a RST addressed to the connected endpoint.
+	rst := context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		SeqNum:  iss.Add(1),
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  rcvWnd,
+		Flags:   header.TCPFlagRst,
+	}
+	c.SendPacket(nil, &rst)
+
+	got := tcpStats.ResetsReceived.Value()
+	if got != want {
+		t.Errorf("got stats.TCP.ResetsReceived.Value() = %d, want = %d", got, want)
+	}
+}
+
+// TestTCPResetsDoNotGenerateResets checks that a received RST is counted in
+// ResetsReceived and that the stack sends no packet in response to it.
+func TestTCPResetsDoNotGenerateResets(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	tcpStats := c.Stack().Stats().TCP
+	want := tcpStats.ResetsReceived.Value() + 1
+
+	iss := seqnum.Value(789)
+	rcvWnd := seqnum.Size(30000)
+	c.CreateConnected(iss, rcvWnd, -1 /* epRcvBuf */)
+
+	// Inject a RST addressed to the connected endpoint.
+	rst := context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		SeqNum:  iss.Add(1),
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  rcvWnd,
+		Flags:   header.TCPFlagRst,
+	}
+	c.SendPacket(nil, &rst)
+
+	got := tcpStats.ResetsReceived.Value()
+	if got != want {
+		t.Errorf("got stats.TCP.ResetsReceived.Value() = %d, want = %d", got, want)
+	}
+
+	// Nothing may be emitted in reply to the RST within the timeout.
+	c.CheckNoPacketTimeout("got an unexpected packet", 100*time.Millisecond)
+}
+
+func TestActiveHandshake(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+}
+
+// TestNonBlockingClose checks that Close on a connected endpoint returns
+// within three seconds, i.e. that it does not block on the rest of the
+// connection teardown.
+func TestNonBlockingClose(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	// Take the endpoint out of the context so the test closes it itself.
+	ep := c.EP
+	c.EP = nil
+
+	// Close the endpoint and measure how long it takes.
+	t0 := time.Now()
+	ep.Close()
+	if diff := time.Since(t0); diff > 3*time.Second {
+		t.Fatalf("Took too long to close: %s", diff)
+	}
+}
+
+// TestConnectResetAfterClose closes an endpoint, acknowledges its FIN without
+// sending one back, waits past the linger timeout and then checks that a
+// further ACK from the peer is answered with a RST.
+func TestConnectResetAfterClose(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPLingerTimeout to 3 seconds so that sockets are marked closed
+	// after 3 seconds in FIN_WAIT2 state.
+	tcpLingerTimeout := 3 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPLingerTimeoutOption(tcpLingerTimeout)); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp, tcpip.TCPLingerTimeoutOption(%s)) failed: %s", tcpLingerTimeout, err)
+	}
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	ep := c.EP
+	c.EP = nil
+
+	// Close the endpoint, make sure we get a FIN segment, then acknowledge
+	// to complete closure of sender, but don't send our own FIN.
+	ep.Close()
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Wait for the ep to give up waiting for a FIN.
+	time.Sleep(tcpLingerTimeout + 1*time.Second)
+
+	// Now send an ACK and it should trigger a RST as the endpoint should
+	// not exist anymore.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Skip over any FIN retransmissions until the first other packet, which
+	// must be the RST.
+	for {
+		b := c.GetPacket()
+		tcpHdr := header.TCP(header.IPv4(b).Payload())
+		if tcpHdr.Flags() == header.TCPFlagAck|header.TCPFlagFin {
+			// This is a retransmit of the FIN, ignore it.
+			continue
+		}
+
+		checker.IPv4(t, b,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				// RST is always generated with sndNxt which if the FIN
+				// has been sent will be 1 higher than the sequence number
+				// of the FIN itself.
+				checker.SeqNum(uint32(c.IRS)+2),
+				checker.AckNum(0),
+				checker.TCPFlags(header.TCPFlagRst),
+			),
+		)
+		break
+	}
+}
+
+// TestCurrentConnectedIncrement tests increment of the current
+// established and connected counters, and that both return to zero once the
+// connection has been closed and the TIME-WAIT period has elapsed.
+func TestCurrentConnectedIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 1 second so that sockets are marked closed
+	// after 1 second in TIME_WAIT state.
+	tcpTimeWaitTimeout := 1 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		// %s renders the duration readably; %d would print raw
+		// nanoseconds.
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeoutOption(%s)) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	ep := c.EP
+	c.EP = nil
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 1 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 1", got)
+	}
+	gotConnected := c.Stack().Stats().TCP.CurrentConnected.Value()
+	if gotConnected != 1 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 1", gotConnected)
+	}
+
+	ep.Close()
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// After our FIN is acknowledged the connection no longer counts as
+	// established, but it is still expected to count as connected.
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != gotConnected {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = %d", got, gotConnected)
+	}
+
+	// Ack and send FIN as well.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Check that the stack acks the FIN.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+2),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Wait for a little more than the TIME-WAIT duration for the socket to
+	// transition to CLOSED state. The sleep is derived from the configured
+	// timeout so the two cannot drift apart.
+	time.Sleep(tcpTimeWaitTimeout + 200*time.Millisecond)
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got)
+	}
+}
+
+// TestClosingWithEnqueuedSegments tests handling of still enqueued segments
+// when the endpoint transitions to StateClose. The in-flight segments would be
+// re-enqueued to a any listening endpoint.
+func TestClosingWithEnqueuedSegments(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	ep := c.EP
+	c.EP = nil
+
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateEstablished; got != want {
+		t.Errorf("unexpected endpoint state: want %d, got %d", want, got)
+	}
+
+	// Send a FIN for ESTABLISHED --> CLOSED-WAIT
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagFin | header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Get the ACK for the FIN we sent.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Give the stack a few ms to transition the endpoint out of ESTABLISHED
+	// state.
+	time.Sleep(10 * time.Millisecond)
+
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateCloseWait; got != want {
+		t.Errorf("unexpected endpoint state: want %d, got %d", want, got)
+	}
+
+	// Close the application endpoint for CLOSE_WAIT --> LAST_ACK
+	ep.Close()
+
+	// Get the FIN
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateLastAck; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	// Pause the endpoint`s protocolMainLoop.
+	ep.(interface{ StopWork() }).StopWork()
+
+	// Enqueue last ACK followed by an ACK matching the endpoint
+	//
+	// Send Last ACK for LAST_ACK --> CLOSED
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  791,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Send a packet with ACK set, this would generate RST when
+	// not using SYN cookies as in this test.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  792,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Unpause endpoint`s protocolMainLoop.
+	ep.(interface{ ResumeWork() }).ResumeWork()
+
+	// Wait for the protocolMainLoop to resume and update state.
+	time.Sleep(10 * time.Millisecond)
+
+	// Expect the endpoint to be closed.
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %d, want = 1", got)
+	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)
+	}
+
+	// Check that the endpoint was moved to CLOSED and that netstack sent a
+	// reset in response to the packet that we sent after the last ACK.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+2),
+			checker.AckNum(0),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+}
+
+func TestSimpleReceive(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	// Receive path smoke test: inject one in-order segment, read it, expect an ACK.
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	data := []byte{1, 2, 3}
+	c.SendPacket(data, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Wait (up to 1s) for the EventIn notification registered above.
+	select {
+	case <-ch:
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// Read the payload; it must equal what was injected.
+	v, _, err := c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %s", err)
+	}
+
+	if !bytes.Equal(data, v) {
+		t.Fatalf("got data = %v, want = %v", v, data)
+	}
+
+	// Check that the stack ACKs every injected byte (790 + len(data)).
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+// TestUserSuppliedMSSOnConnectV4 tests that the user supplied MSS is used when
+// creating a new active IPv4 TCP socket. It should be present in the sent TCP
+// SYN segment, capped at the largest MSS that fits in the MTU (maxMSS).
+func TestUserSuppliedMSSOnConnectV4(t *testing.T) {
+	const mtu = 5000
+	const maxMSS = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
+	tests := []struct {
+		name   string
+		setMSS int
+		expMSS uint16
+	}{
+		{
+			"EqualToMaxMSS",
+			maxMSS,
+			maxMSS,
+		},
+		{
+			"LessThanMTU",
+			maxMSS - 1,
+			maxMSS - 1,
+		},
+		{
+			"GreaterThanMTU",
+			maxMSS + 1,
+			maxMSS,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			c := context.New(t, mtu)
+			defer c.Cleanup()
+
+			c.Create(-1 /* epRcvBuf */)
+
+			// Set the MSS socket option.
+			if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, test.setMSS); err != nil {
+				t.Fatalf("SetSockOptInt(MaxSegOption, %d) failed: %s", test.setMSS, err)
+			}
+
+			// Derive the window scale expected in the SYN from the receive buffer size.
+			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
+			if err != nil {
+				t.Fatalf("GetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
+			}
+			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
+
+			// Start connection attempt to IPv4 address.
+			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+				t.Fatalf("unexpected return value from Connect: %s", err)
+			}
+
+			// Receive SYN packet carrying the expected (possibly capped) MSS.
+			checker.IPv4(t, c.GetPacket(), checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlags(header.TCPFlagSyn),
+				checker.TCPSynOptions(header.TCPSynOptions{MSS: test.expMSS, WS: ws})))
+		})
+	}
+}
+
+// TestUserSuppliedMSSOnConnectV6 tests that the user supplied MSS is used when
+// creating a new active IPv6 TCP socket. It should be present in the sent TCP
+// SYN segment, capped at the largest MSS that fits in the MTU (maxMSS).
+func TestUserSuppliedMSSOnConnectV6(t *testing.T) {
+	const mtu = 5000
+	const maxMSS = mtu - header.IPv6MinimumSize - header.TCPMinimumSize
+	tests := []struct {
+		name   string
+		setMSS uint16
+		expMSS uint16
+	}{
+		{
+			"EqualToMaxMSS",
+			maxMSS,
+			maxMSS,
+		},
+		{
+			"LessThanMTU",
+			maxMSS - 1,
+			maxMSS - 1,
+		},
+		{
+			"GreaterThanMTU",
+			maxMSS + 1,
+			maxMSS,
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			c := context.New(t, mtu)
+			defer c.Cleanup()
+
+			c.CreateV6Endpoint(true /* v6Only */)
+
+			// Set the MSS socket option.
+			if err := c.EP.SetSockOptInt(tcpip.MaxSegOption, int(test.setMSS)); err != nil {
+				t.Fatalf("SetSockOptInt(MaxSegOption, %d) failed: %s", test.setMSS, err)
+			}
+
+			// Derive the window scale expected in the SYN from the receive buffer size.
+			rcvBufSize, err := c.EP.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
+			if err != nil {
+				t.Fatalf("GetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
+			}
+			ws := tcp.FindWndScale(seqnum.Size(rcvBufSize))
+
+			// Start connection attempt to IPv6 address.
+			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestV6Addr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+				t.Fatalf("unexpected return value from Connect: %s", err)
+			}
+
+			// Receive SYN packet carrying the expected (possibly capped) MSS.
+			checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlags(header.TCPFlagSyn),
+				checker.TCPSynOptions(header.TCPSynOptions{MSS: test.expMSS, WS: ws})))
+		})
+	}
+}
+
+func TestSendRstOnListenerRxSynAckV4(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+	// Inject an unsolicited SYN-ACK at the listening IPv4 endpoint.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+	// The reply must be a RST whose sequence number is the injected AckNum (200).
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst),
+		checker.SeqNum(200)))
+}
+
+func TestSendRstOnListenerRxSynAckV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(true /* v6Only */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+	// Inject an unsolicited SYN-ACK at the listening IPv6 endpoint.
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+	// The reply must be a RST whose sequence number is the injected AckNum (200).
+	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst),
+		checker.SeqNum(200)))
+}
+
+// TestTCPAckBeforeAcceptV4 tests that once the 3-way handshake is complete,
+// peers can send data and expect a response within a reasonable amount of time
+// without calling Accept on the listening endpoint first.
+//
+// This test uses IPv4.
+func TestTCPAckBeforeAcceptV4(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	// Send 4 bytes of data; Accept is never called in this test.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive the ACK covering all 4 data bytes (ack number irs+5).
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+// TestTCPAckBeforeAcceptV6 tests that once the 3-way handshake is complete,
+// peers can send data and expect a response within a reasonable amount of time
+// without calling Accept on the listening endpoint first.
+//
+// This test uses IPv6.
+func TestTCPAckBeforeAcceptV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(true /* v6Only */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	irs, iss := executeV6Handshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	// Send 4 bytes of data; Accept is never called in this test.
+	c.SendV6Packet([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive the ACK covering all 4 data bytes (ack number irs+5).
+	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+func TestSendRstOnListenerRxAckV4(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+	// Inject a FIN-ACK at the IPv4 listener without any preceding handshake.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagFin | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+	// The reply must be a RST whose sequence number is the injected AckNum (200).
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst),
+		checker.SeqNum(200)))
+}
+
+func TestSendRstOnListenerRxAckV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(true /* v6Only */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+	// Inject a FIN-ACK at the IPv6 listener without any preceding handshake.
+	c.SendV6Packet(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagFin | header.TCPFlagAck,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+	// The reply must be a RST whose sequence number is the injected AckNum (200).
+	checker.IPv6(t, c.GetV6Packet(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst),
+		checker.SeqNum(200)))
+}
+
+// TestListenShutdown tests that a listening endpoint whose read side has been
+// shut down answers an incoming SYN with a RST-ACK.
+func TestListenShutdown(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	if err := c.EP.Shutdown(tcpip.ShutdownRead); err != nil {
+		t.Fatal("Shutdown failed:", err)
+	}
+	// Attempt a new connection after the shutdown by injecting a SYN.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  100,
+		AckNum:  200,
+	})
+
+	// Expect the listening endpoint to reset the connection attempt (RST|ACK).
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+		))
+}
+
+// TestListenCloseWhileConnect tests that closing a listening endpoint drains
+// its accept queue, resetting every connection still waiting to be accepted.
+// The wait for the queued connection is bounded so a failure cannot hang.
+func TestListenCloseWhileConnect(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.Create(-1 /* epRcvBuf */)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(1 /* backlog */); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventIn)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */)
+
+	// Wait for the handshake's new endpoint to reach the accept queue.
+	select {
+	case <-notifyCh:
+	case <-time.After(5 * time.Second):
+		t.Fatal("timed out waiting for the connection to be delivered to the accept queue")
+	}
+
+	// Closing the listener must reset the connection that was never accepted.
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst)))
+}
+
+func TestTOSV4(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	c.EP = ep
+	// Set the IPv4 TOS option before connecting, then read it back.
+	const tos = 0xC0
+	if err := c.EP.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
+		t.Errorf("SetSockOptInt(IPv4TOSOption, %d) failed: %s", tos, err)
+	}
+	// A failed Get leaves nothing to compare, so it is fatal (as in TestTrafficClassV6).
+	v, err := c.EP.GetSockOptInt(tcpip.IPv4TOSOption)
+	if err != nil {
+		t.Fatalf("GetSockOptInt(IPv4TOSOption) failed: %s", err)
+	}
+
+	if v != tos {
+		t.Errorf("got GetSockOptInt(IPv4TOSOption) = %d, want = %d", v, tos)
+	}
+	// Connect with the TOS checker applied by testV4Connect during the handshake.
+	testV4Connect(t, c, checker.TOS(tos, 0))
+
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that the data segment is sent and carries the configured TOS.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790), // Acknum is initial sequence number + 1
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+		checker.TOS(tos, 0),
+	)
+
+	if p := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(data, p) {
+		t.Errorf("got data = %x, want = %x", p, data)
+	}
+}
+
+func TestTrafficClassV6(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateV6Endpoint(false /* v6Only */)
+	// Set the IPv6 traffic class before connecting, then read it back.
+	const tos = 0xC0
+	if err := c.EP.SetSockOptInt(tcpip.IPv6TrafficClassOption, tos); err != nil {
+		t.Errorf("SetSockOptInt(IPv6TrafficClassOption, %d) failed: %s", tos, err)
+	}
+
+	v, err := c.EP.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+	if err != nil {
+		t.Fatalf("GetSockOptInt(IPv6TrafficClassOption) failed: %s", err)
+	}
+
+	if v != tos {
+		t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = %d, want = %d", v, tos)
+	}
+
+	// Test the connection request, with the TOS checker applied by testV6Connect.
+	testV6Connect(t, c, checker.TOS(tos, 0))
+
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that the data segment is sent and carries the configured class.
+	b := c.GetV6Packet()
+	checker.IPv6(t, b,
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+		checker.TOS(tos, 0),
+	)
+
+	if p := b[header.IPv6MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(data, p) {
+		t.Errorf("got data = %x, want = %x", p, data)
+	}
+}
+
+func TestConnectBindToDevice(t *testing.T) {
+	for _, test := range []struct {
+		name   string
+		device tcpip.NICID
+		want   tcp.EndpointState
+	}{
+		{"RightDevice", 1, tcp.StateEstablished},
+		{"WrongDevice", 2, tcp.StateSynSent},
+		{"AnyDevice", 0, tcp.StateEstablished},
+	} {
+		t.Run(test.name, func(t *testing.T) {
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.Create(-1 /* epRcvBuf */)
+			if err := c.EP.SetSockOpt(tcpip.BindToDeviceOption(test.device)); err != nil {
+				t.Fatalf("SetSockOpt(BindToDeviceOption(%d)) failed: %s", test.device, err)
+			}
+			// Start connection attempt.
+			waitEntry, _ := waiter.NewChannelEntry(nil)
+			c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+			defer c.WQ.EventUnregister(&waitEntry)
+
+			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+				t.Fatalf("unexpected return value from Connect: %s", err)
+			}
+
+			// Receive SYN packet.
+			b := c.GetPacket()
+			checker.IPv4(t, b,
+				checker.TCP(
+					checker.DstPort(context.TestPort),
+					checker.TCPFlags(header.TCPFlagSyn),
+				),
+			)
+			if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+				t.Fatalf("unexpected endpoint state: want %s, got %s", want, got)
+			}
+			tcpHdr := header.TCP(header.IPv4(b).Payload())
+			c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+			// Answer with a SYN-ACK; per the table it is only accepted for device 1 or 0.
+			iss := seqnum.Value(789)
+			rcvWnd := seqnum.Size(30000)
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: tcpHdr.DestinationPort(),
+				DstPort: tcpHdr.SourcePort(),
+				Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+				SeqNum:  iss,
+				AckNum:  c.IRS.Add(1),
+				RcvWnd:  rcvWnd,
+			})
+			// Consume the stack's reply to the SYN-ACK before checking the state.
+			c.GetPacket()
+			if got, want := tcp.EndpointState(c.EP.State()), test.want; got != want {
+				t.Fatalf("unexpected endpoint state: want %s, got %s", want, got)
+			}
+		})
+	}
+}
+
+func TestRstOnSynSent(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create an endpoint, don't handshake because we want to interfere with the
+	// handshake process.
+	c.Create(-1 /* epRcvBuf */)
+
+	// Start connection attempt.
+	waitEntry, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	addr := tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}
+	if err := c.EP.Connect(addr); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got Connect(%+v) = %s, want %s", addr, err, tcpip.ErrConnectStarted)
+	}
+
+	// Receive SYN packet.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+		),
+	)
+
+	// Ensure that we've reached the SynSent state.
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	// Send a packet with a proper ACK (IRS+1) and the RST flag set, which
+	// should move the socket into the error state.
+	iss := seqnum.Value(789)
+	rcvWnd := seqnum.Size(30000)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: tcpHdr.DestinationPort(),
+		DstPort: tcpHdr.SourcePort(),
+		Flags:   header.TCPFlagRst | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  rcvWnd,
+		TCPOpts: nil,
+	})
+
+	// Wait (up to 3s) for the EventOut waiter registered above to be notified.
+	select {
+	case <-ch:
+	case <-time.After(3 * time.Second):
+		t.Fatal("timed out waiting for packet to arrive")
+	}
+	// The refused connection must surface as ErrConnectionRefused on Read.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionRefused {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionRefused)
+	}
+
+	// Due to the RST the endpoint should be in an error state.
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+		t.Fatalf("got State() = %s, want %s", got, want)
+	}
+}
+
+func TestOutOfOrderReceive(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	// Deliver two halves of a payload in reverse order and expect them reassembled.
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send second half of data first, with seqnum 3 ahead of expected.
+	data := []byte{1, 2, 3, 4, 5, 6}
+	c.SendPacket(data[3:], &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  793,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Check that we get an ACK specifying which seqnum is expected (still 790).
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Wait 200ms and check that no data has been made readable.
+	time.Sleep(200 * time.Millisecond)
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send the first 3 bytes now, filling the gap.
+	c.SendPacket(data[:3], &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Receive data, accumulating reads until all len(data) bytes have arrived.
+	read := make([]byte, 0, 6)
+	for len(read) < len(data) {
+		v, _, err := c.EP.Read(nil)
+		if err != nil {
+			if err == tcpip.ErrWouldBlock {
+				// Wait (up to 5s) for receive to be notified, then retry.
+				select {
+				case <-ch:
+				case <-time.After(5 * time.Second):
+					t.Fatalf("Timed out waiting for data to arrive")
+				}
+				continue
+			}
+			t.Fatalf("Read failed: %s", err)
+		}
+
+		read = append(read, v...)
+	}
+
+	// Check that we received the data in proper order.
+	if !bytes.Equal(data, read) {
+		t.Fatalf("got data = %v, want = %v", read, data)
+	}
+
+	// Check that the whole data is acknowledged.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestOutOfOrderFlood(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create a new connection with an endpoint receive buffer size of 10.
+	c.CreateConnected(789, 30000, 10 /* epRcvBuf */)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send 100 out-of-order packets (seqnum 796); each is answered with ACK 790.
+	data := []byte{1, 2, 3, 4, 5, 6}
+	for i := 0; i < 100; i++ {
+		c.SendPacket(data[3:], &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  796,
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+
+		checker.IPv4(t, c.GetPacket(),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(790),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	// Send packet with seqnum 793. It must be discarded because the
+	// out-of-order buffer was filled by the previous packets.
+	c.SendPacket(data[3:], &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  793,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+	// The ACK number must still be 790: nothing in order has arrived yet.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Now send the expected packet, seqnum 790.
+	c.SendPacket(data[:3], &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Check that only packet 790 is acknowledged (ACK 793, not 796 or beyond).
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(793),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestRstOnCloseWithUnreadData(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	// Closing an endpoint that still holds unread data must emit a RST, not a FIN.
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	data := []byte{1, 2, 3}
+	c.SendPacket(data, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Wait (up to 3s) for receive to be notified; the data is never read.
+	select {
+	case <-ch:
+	case <-time.After(3 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// Check that ACK is received, this happens regardless of the read.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Now that we know we have unread data, let's just close the connection
+	// and verify that netstack sends an RST rather than a FIN.
+	c.EP.Close()
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			// We shouldn't consume a sequence number on RST.
+			checker.SeqNum(uint32(c.IRS)+1),
+		))
+	// The RST puts the endpoint into an error state.
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	// This final ACK should be ignored because an ACK on a reset doesn't mean
+	// anything. Nothing is asserted after it is injected.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  seqnum.Value(790 + len(data)),
+		AckNum:  c.IRS.Add(seqnum.Size(2)),
+		RcvWnd:  30000,
+	})
+}
+
+func TestRstOnCloseWithUnreadDataFinConvertRst(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	// With unread data: shut down write (FIN goes out), then read (RST goes out).
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	data := []byte{1, 2, 3}
+	c.SendPacket(data, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Wait (up to 3s) for receive to be notified; the data is never read.
+	select {
+	case <-ch:
+	case <-time.After(3 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// Check that ACK is received, this happens regardless of the read.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Cause a FIN to be generated. NOTE(review): Shutdown's error is unchecked.
+	c.EP.Shutdown(tcpip.ShutdownWrite)
+
+	// Make sure we get the FIN, but deliberately do not ACK it.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+			checker.SeqNum(uint32(c.IRS)+1),
+		))
+
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	// Cause a RST to be generated by closing the read end now since we have
+	// unread data. NOTE(review): Shutdown's error is unchecked here as well.
+	c.EP.Shutdown(tcpip.ShutdownRead)
+
+	// Make sure we get the RST.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagRst),
+			// RST is always generated with sndNxt which if the FIN
+			// has been sent will be 1 higher than the sequence
+			// number of the FIN itself.
+			checker.SeqNum(uint32(c.IRS)+2),
+		))
+	// The RST puts the endpoint into an error state.
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateError; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	// The ACK to the FIN should now be rejected since the connection has been
+	// closed by a RST. Nothing is asserted after it is injected.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  seqnum.Value(790 + len(data)),
+		AckNum:  c.IRS.Add(seqnum.Size(2)),
+		RcvWnd:  30000,
+	})
+}
+
+func TestShutdownRead(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	// After ShutdownRead, a Read must fail with ErrClosedForReceive and be counted.
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	if err := c.EP.Shutdown(tcpip.ShutdownRead); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrClosedForReceive)
+	}
+	var want uint64 = 1 // the single rejected Read above
+	if got := c.EP.Stats().(*tcp.Stats).ReadErrors.ReadClosed.Value(); got != want {
+		t.Fatalf("got EP stats Stats.ReadErrors.ReadClosed got %d want %d", got, want)
+	}
+}
+
+func TestFullWindowReceive(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	// Fill a 10-unit receive buffer, expect a zero window, then read to reopen it.
+	c.CreateConnected(789, 30000, 10 /* epRcvBuf */)
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+	// Nothing has been injected yet; report got/want so a nil error is not called a failure.
+	_, _, err := c.EP.Read(nil)
+	if err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Fill up the window.
+	data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+	c.SendPacket(data, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Wait (up to 5s) for receive to be notified.
+	select {
+	case <-ch:
+	case <-time.After(5 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// Check that data is acknowledged, and window goes to zero.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.Window(0),
+		),
+	)
+
+	// Receive data and check it.
+	v, _, err := c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %s", err)
+	}
+
+	if !bytes.Equal(data, v) {
+		t.Fatalf("got data = %v, want = %v", v, data)
+	}
+	// The zero-window episode above is expected to be counted exactly once.
+	var want uint64 = 1
+	if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.ZeroRcvWindowState.Value(); got != want {
+		t.Fatalf("got EP stats ReceiveErrors.ZeroRcvWindowState got %d want %d", got, want)
+	}
+
+	// Check that we get an ACK for the newly non-zero window.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.Window(10),
+		),
+	)
+}
+
+func TestNoWindowShrinking(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Start off with a window size of 10, then shrink it to 5.
+	c.CreateConnected(789, 30000, 10 /* epRcvBuf */)
+
+	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 5); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 5) failed: %s", err)
+	}
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send 3 bytes, check that the peer acknowledges them.
+	data := []byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
+	c.SendPacket(data[:3], &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Wait (up to 5s) for receive to be notified.
+	select {
+	case <-ch:
+	case <-time.After(5 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// Check that data is acknowledged, and that window doesn't go to zero
+	// just yet because it was previously set to 10. It must go to 7 now.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(793),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.Window(7),
+		),
+	)
+
+	// Send 7 more bytes, check that the window fills up.
+	c.SendPacket(data[3:], &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  793,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+	// Wait (up to 5s) for the second notification.
+	select {
+	case <-ch:
+	case <-time.After(5 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+	// All 10 bytes are now acknowledged and the advertised window is zero.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.Window(0),
+		),
+	)
+
+	// Receive data and check it; any Read error (including ErrWouldBlock) is fatal.
+	read := make([]byte, 0, 10)
+	for len(read) < len(data) {
+		v, _, err := c.EP.Read(nil)
+		if err != nil {
+			t.Fatalf("Read failed: %s", err)
+		}
+
+		read = append(read, v...)
+	}
+
+	if !bytes.Equal(data, read) {
+		t.Fatalf("got data = %v, want = %v", read, data)
+	}
+
+	// Check that we get an ACK for the newly non-zero window, which is the
+	// new size (5) set via ReceiveBufferSizeOption above.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.Window(5),
+		),
+	)
+}
+
+func TestSimpleSend(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	payload := []byte{1, 2, 3}
+	v := buffer.NewView(len(payload))
+	copy(v, payload)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// The written bytes should show up in a single outgoing segment.
+	pkt := c.GetPacket()
+	checker.IPv4(t, pkt,
+		checker.PayloadLen(len(payload)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	if got := pkt[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(payload, got) {
+		t.Fatalf("got data = %v, want = %v", got, payload)
+	}
+
+	// ACK everything that was sent.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1 + seqnum.Size(len(payload))),
+		RcvWnd:  30000,
+	})
+}
+
+func TestZeroWindowSend(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789 /* iss */, 0 /* rcvWnd */, -1 /* epRcvBuf */)
+
+	payload := []byte{1, 2, 3}
+	v := buffer.NewView(len(payload))
+	copy(v, payload)
+
+	_, _, werr := c.EP.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{})
+	if werr != nil {
+		t.Fatalf("Write failed: %s", werr)
+	}
+
+	// With a closed peer window only a payload-less probe is expected.
+	pkt := c.GetPacket()
+	checker.IPv4(t, pkt,
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Advertise a non-zero window so the queued data can be transmitted.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// The payload should now arrive.
+	pkt = c.GetPacket()
+	checker.IPv4(t, pkt,
+		checker.PayloadLen(len(payload)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	if got := pkt[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(payload, got) {
+		t.Fatalf("got data = %v, want = %v", got, payload)
+	}
+
+	// ACK everything that was sent.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1 + seqnum.Size(len(payload))),
+		RcvWnd:  30000,
+	})
+}
+
+func TestScaledWindowConnect(t *testing.T) {
+	// Verifies that an actively opened (Connect()) connection advertises
+	// a scaled window when the peer sent the window scale option.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Use a receive buffer larger than the biggest unscaled window.
+	c.CreateConnectedWithRawOptions(789, 30000, 65535*3, []byte{
+		header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
+	})
+
+	payload := []byte{1, 2, 3}
+	v := buffer.NewView(len(payload))
+	copy(v, payload)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// The outgoing segment must carry the payload and advertise 0xbfff,
+	// i.e. a scaled window value.
+	pkt := c.GetPacket()
+	checker.IPv4(t, pkt,
+		checker.PayloadLen(len(payload)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.Window(0xbfff),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+}
+
+func TestNonScaledWindowConnect(t *testing.T) {
+	// Verifies that an actively opened (Connect()) connection does not
+	// scale its window when the peer omitted the window scale option.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Use a receive buffer larger than the biggest unscaled window.
+	c.CreateConnected(789, 30000, 65535*3)
+
+	payload := []byte{1, 2, 3}
+	v := buffer.NewView(len(payload))
+	copy(v, payload)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// The outgoing segment must carry the payload and advertise 0xffff,
+	// i.e. an unscaled window value.
+	pkt := c.GetPacket()
+	checker.IPv4(t, pkt,
+		checker.PayloadLen(len(payload)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.Window(0xffff),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+}
+
+func TestScaledWindowAccept(t *testing.T) {
+	// This test ensures that window scaling is used when the peer
+	// does advertise it and connection is established with Accept().
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create the listening endpoint.
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	defer ep.Close()
+
+	// Set the window size greater than the maximum non-scaled window.
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed: %s", err)
+	}
+
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Do 3-way handshake.
+	c.PassiveConnectWithOptions(100, 2, header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established, then retry once.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, including one from the first attempt.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that data is received, and that advertised window is 0xbfff,
+	// that is, that it is scaled.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.Window(0xbfff),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+}
+
+func TestNonScaledWindowAccept(t *testing.T) {
+	// This test ensures that window scaling is not used when the peer
+	// doesn't advertise it and connection is established with Accept().
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create the listening endpoint.
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	defer ep.Close()
+
+	// Set the window size greater than the maximum non-scaled window.
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 65535*3); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 65535*3) failed: %s", err)
+	}
+
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Do 3-way handshake w/ window scaling disabled. The SYN-ACK to the SYN
+	// should not carry the window scaling option.
+	c.PassiveConnect(100, -1, header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established, then retry once.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, including one from the first attempt.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that data is received, and that advertised window is 0xffff,
+	// that is, that it's not scaled.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.Window(0xffff),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+}
+
+func TestZeroScaledWindowReceive(t *testing.T) {
+	// Verifies that a non-zero window advertisement is sent once the
+	// scaled window goes from 0 back to non-zero, even though the real
+	// (unscaled) window never actually reached zero.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Pick a window size for which a window scale of 4 will be used.
+	const wnd = 65535 * 10
+	const ws = uint32(4)
+	c.CreateConnectedWithRawOptions(789, 30000, wnd, []byte{
+		header.TCPOptionWS, 3, 0, header.TCPOptionNOP,
+	})
+
+	// Feed the endpoint 50000-byte segments while more than that remains.
+	left := wnd
+	delivered := 0
+	chunk := make([]byte, 50000)
+	for left > len(chunk) {
+		c.SendPacket(chunk, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + delivered),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		delivered += len(chunk)
+		left -= len(chunk)
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+delivered)),
+				checker.Window(uint16(left>>ws)),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	// Leave 15 bytes free: a non-zero window that scales down to zero.
+	if left >= 16 {
+		chunk = chunk[:left-15]
+		c.SendPacket(chunk, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + delivered),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		delivered += len(chunk)
+		left -= len(chunk)
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+delivered)),
+				checker.Window(0),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	// Consume at least defaultMTU bytes; that should provoke an ACK.
+	drained := 0
+	for drained < defaultMTU {
+		buf, _, err := c.EP.Read(nil)
+		if err != nil {
+			t.Fatalf("Read failed: %s", err)
+		}
+		drained += len(buf)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+delivered)),
+			checker.Window(uint16(drained>>ws)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestSegmentMerging(t *testing.T) {
+	tests := []struct {
+		name   string
+		stop   func(tcpip.Endpoint)
+		resume func(tcpip.Endpoint)
+	}{
+		{
+			"stop work",
+			func(ep tcpip.Endpoint) {
+				ep.(interface{ StopWork() }).StopWork()
+			},
+			func(ep tcpip.Endpoint) {
+				ep.(interface{ ResumeWork() }).ResumeWork()
+			},
+		},
+		{
+			"cork",
+			func(ep tcpip.Endpoint) {
+				ep.SetSockOptBool(tcpip.CorkOption, true)
+			},
+			func(ep tcpip.Endpoint) {
+				ep.SetSockOptBool(tcpip.CorkOption, false)
+			},
+		},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+			// Fill the congestion window with tcp.InitialCwnd one-byte
+			// segments and leave them unACKed so nothing else can go out.
+			// NOTE: the table's stop/resume hooks are not invoked here.
+			for i := 0; i < tcp.InitialCwnd; i++ {
+				view := buffer.NewViewFromBytes([]byte{0})
+				if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+					t.Fatalf("Write #%d failed: %s", i+1, err)
+				}
+			}
+
+			// Now send the segments that should get merged as the congestion
+			// window is full and we won't be able to send any more packets.
+			var allData []byte
+			for i, data := range [][]byte{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
+				allData = append(allData, data...)
+				view := buffer.NewViewFromBytes(data)
+				if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+					t.Fatalf("Write #%d failed: %s", i+1, err)
+				}
+			}
+
+			// Check that we get tcp.InitialCwnd packets.
+			for i := 0; i < tcp.InitialCwnd; i++ {
+				b := c.GetPacket()
+				checker.IPv4(t, b,
+					checker.PayloadLen(header.TCPMinimumSize+1),
+					checker.TCP(
+						checker.DstPort(context.TestPort),
+						checker.SeqNum(uint32(c.IRS)+uint32(i)+1),
+						checker.AckNum(790),
+						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+					),
+				)
+			}
+
+			// Acknowledge the initial one-byte segments.
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: c.Port,
+				Flags:   header.TCPFlagAck,
+				SeqNum:  790,
+				AckNum:  c.IRS.Add(1 + seqnum.Size(tcp.InitialCwnd)), // One byte per initial segment.
+				RcvWnd:  30000,
+			})
+
+			// Check that the remaining writes arrive merged into one segment.
+			b := c.GetPacket()
+			checker.IPv4(t, b,
+				checker.PayloadLen(len(allData)+header.TCPMinimumSize),
+				checker.TCP(
+					checker.DstPort(context.TestPort),
+					checker.SeqNum(uint32(c.IRS)+1+uint32(tcp.InitialCwnd)),
+					checker.AckNum(790),
+					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+				),
+			)
+
+			if got := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(got, allData) {
+				t.Fatalf("got data = %v, want = %v", got, allData)
+			}
+
+			// Acknowledge the data.
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: c.Port,
+				Flags:   header.TCPFlagAck,
+				SeqNum:  790,
+				AckNum:  c.IRS.Add(1 + seqnum.Size(tcp.InitialCwnd+len(allData))),
+				RcvWnd:  30000,
+			})
+		})
+	}
+}
+
+func TestDelay(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	if err := c.EP.SetSockOptBool(tcpip.DelayOption, true); err != nil {
+		t.Fatalf("SetSockOptBool(DelayOption, true) failed: %s", err)
+	}
+	var allData []byte
+	for i, data := range [][]byte{{0}, {1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}, {11}} {
+		allData = append(allData, data...)
+		view := buffer.NewViewFromBytes(data)
+		if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+			t.Fatalf("Write #%d failed: %s", i+1, err)
+		}
+	}
+
+	seq := c.IRS.Add(1)
+	for _, want := range [][]byte{allData[:1], allData[1:]} {
+		// Expect the first byte alone, then everything else in one segment.
+		b := c.GetPacket()
+		checker.IPv4(t, b,
+			checker.PayloadLen(len(want)+header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(seq)),
+				checker.AckNum(790),
+				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+			),
+		)
+
+		if got := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(got, want) {
+			t.Fatalf("got data = %v, want = %v", got, want)
+		}
+
+		seq = seq.Add(seqnum.Size(len(want)))
+		// Acknowledge the data.
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  790,
+			AckNum:  seq,
+			RcvWnd:  30000,
+		})
+	}
+}
+
+func TestUndelay(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	if err := c.EP.SetSockOptBool(tcpip.DelayOption, true); err != nil {
+		t.Fatalf("SetSockOptBool(DelayOption, true) failed: %s", err)
+	}
+	allData := [][]byte{{0}, {1, 2, 3}}
+	for i, data := range allData {
+		view := buffer.NewViewFromBytes(data)
+		if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+			t.Fatalf("Write #%d failed: %s", i+1, err)
+		}
+	}
+
+	seq := c.IRS.Add(1)
+
+	// Check that the first write is transmitted.
+	first := c.GetPacket()
+	checker.IPv4(t, first,
+		checker.PayloadLen(len(allData[0])+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(seq)),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	if got, want := first[header.IPv4MinimumSize+header.TCPMinimumSize:], allData[0]; !bytes.Equal(got, want) {
+		t.Fatalf("got first packet's data = %v, want = %v", got, want)
+	}
+
+	seq = seq.Add(seqnum.Size(len(allData[0])))
+
+	// Check that we don't get the second packet yet.
+	c.CheckNoPacketTimeout("delayed second packet transmitted", 100*time.Millisecond)
+	if err := c.EP.SetSockOptBool(tcpip.DelayOption, false); err != nil {
+		t.Fatalf("SetSockOptBool(DelayOption, false) failed: %s", err)
+	}
+	// With the delay option cleared, the second write must go out.
+	second := c.GetPacket()
+	checker.IPv4(t, second,
+		checker.PayloadLen(len(allData[1])+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(seq)),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	if got, want := second[header.IPv4MinimumSize+header.TCPMinimumSize:], allData[1]; !bytes.Equal(got, want) {
+		t.Fatalf("got second packet's data = %v, want = %v", got, want)
+	}
+
+	seq = seq.Add(seqnum.Size(len(allData[1])))
+
+	// Acknowledge the data.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seq,
+		RcvWnd:  30000,
+	})
+}
+
+func TestMSSNotDelayed(t *testing.T) {
+	tests := []struct {
+		name string
+		fn   func(tcpip.Endpoint) *tcpip.Error
+	}{
+		{"no-op", func(tcpip.Endpoint) *tcpip.Error { return nil }},
+		{"delay", func(ep tcpip.Endpoint) *tcpip.Error { return ep.SetSockOptBool(tcpip.DelayOption, true) }},
+		{"cork", func(ep tcpip.Endpoint) *tcpip.Error { return ep.SetSockOptBool(tcpip.CorkOption, true) }},
+	}
+
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			const maxPayload = 100
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.CreateConnectedWithRawOptions(789, 30000, -1 /* epRcvBuf */, []byte{
+				header.TCPOptionMSS, 4, byte(maxPayload / 256), byte(maxPayload % 256),
+			})
+			if err := test.fn(c.EP); err != nil {
+				t.Fatalf("applying %q setup failed: %s", test.name, err)
+			}
+			allData := [][]byte{{0}, make([]byte, maxPayload), make([]byte, maxPayload)}
+			for i, data := range allData {
+				view := buffer.NewViewFromBytes(data)
+				if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+					t.Fatalf("Write #%d failed: %s", i+1, err)
+				}
+			}
+
+			seq := c.IRS.Add(1)
+
+			for i, data := range allData {
+				// Each write must arrive as its own segment, in order.
+				packet := c.GetPacket()
+				checker.IPv4(t, packet,
+					checker.PayloadLen(len(data)+header.TCPMinimumSize),
+					checker.TCP(
+						checker.DstPort(context.TestPort),
+						checker.SeqNum(uint32(seq)),
+						checker.AckNum(790),
+						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+					),
+				)
+
+				if got, want := packet[header.IPv4MinimumSize+header.TCPMinimumSize:], data; !bytes.Equal(got, want) {
+					t.Fatalf("got packet #%d's data = %v, want = %v", i+1, got, want)
+				}
+
+				seq = seq.Add(seqnum.Size(len(data)))
+			}
+
+			// Acknowledge the data.
+			c.SendPacket(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: c.Port,
+				Flags:   header.TCPFlagAck,
+				SeqNum:  790,
+				AckNum:  seq,
+				RcvWnd:  30000,
+			})
+		})
+	}
+}
+
+func testBrokenUpWrite(t *testing.T, c *context.Context, maxPayload int) {
+	payloadMultiplier := 10
+	total := payloadMultiplier * maxPayload
+	data := make([]byte, total)
+	for idx := range data {
+		data[idx] = byte(idx)
+	}
+
+	v := buffer.NewView(len(data))
+	copy(v, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(v), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Collect segments until the whole write has been seen.
+	seen := 0
+	segments := 0
+	for seen != total {
+		pkt := c.GetPacket()
+		segments++
+		tcpHdr := header.TCP(header.IPv4(pkt).Payload())
+		segData := tcpHdr.Payload()
+		checker.IPv4(t, pkt,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1+uint32(seen)),
+				checker.AckNum(790),
+				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+			),
+		)
+
+		want := data[seen : seen+len(segData)]
+		if !bytes.Equal(want, segData) {
+			t.Fatalf("got data = %v, want = %v", segData, want)
+		}
+		seen += len(segData)
+		var opts []byte
+		if c.TimeStampEnabled {
+			// With timestamps enabled, reply with TSVal set to the received
+			// TSEcr plus one and TSEcr set to the received TSVal.
+			parsed := tcpHdr.ParsedOptions()
+			tsOpt := [12]byte{header.TCPOptionNOP, header.TCPOptionNOP}
+			header.EncodeTSOption(parsed.TSEcr+1, parsed.TSVal, tsOpt[2:])
+			opts = tsOpt[:]
+		}
+		// ACK everything received so far.
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  790,
+			AckNum:  c.IRS.Add(1 + seqnum.Size(seen)),
+			RcvWnd:  30000,
+			TCPOpts: opts,
+		})
+	}
+	if segments == 1 {
+		t.Fatalf("expected write to be broken up into multiple packets, but got 1 packet")
+	}
+}
+
+func TestSendGreaterThanMTU(t *testing.T) {
+	const segPayload = 100
+	c := context.New(t, uint32(header.IPv4MinimumSize+header.TCPMinimumSize+segPayload))
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	testBrokenUpWrite(t, c, segPayload)
+}
+
+func TestSetTTL(t *testing.T) {
+	for _, ttl := range []uint8{1, 2, 50, 64, 128, 254, 255} {
+		t.Run(fmt.Sprintf("TTL:%d", ttl), func(t *testing.T) {
+			c := context.New(t, 65535)
+			defer c.Cleanup()
+
+			var err *tcpip.Error
+			c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+			if err != nil {
+				t.Fatalf("NewEndpoint failed: %s", err)
+			}
+
+			if err := c.EP.SetSockOptInt(tcpip.TTLOption, int(ttl)); err != nil {
+				t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", ttl, err)
+			}
+
+			if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+				t.Fatalf("unexpected return value from Connect: %s", err)
+			}
+
+			// The outgoing SYN must carry the configured TTL.
+			syn := c.GetPacket()
+
+			checker.IPv4(t, syn, checker.TTL(ttl))
+		})
+	}
+}
+
+func TestActiveSendMSSLessThanMTU(t *testing.T) {
+	const peerMSS = 100
+	c := context.New(t, 65535)
+	defer c.Cleanup()
+
+	c.CreateConnectedWithRawOptions(789, 30000, -1 /* epRcvBuf */, []byte{
+		header.TCPOptionMSS, 4, byte(peerMSS / 256), byte(peerMSS % 256),
+	})
+	testBrokenUpWrite(t, c, peerMSS)
+}
+
+func TestPassiveSendMSSLessThanMTU(t *testing.T) {
+	const maxPayload = 100
+	const mtu = 1200
+	c := context.New(t, mtu)
+	defer c.Cleanup()
+
+	// Create the listening endpoint.
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	defer ep.Close()
+
+	// Set the buffer size to a deterministic size so that we can check the
+	// window scaling option.
+	const rcvBufferSize = 0x20000
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed: %s", rcvBufferSize, err)
+	}
+
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Do 3-way handshake.
+	c.PassiveConnect(maxPayload, -1, header.TCPSynOptions{MSS: mtu - header.IPv4MinimumSize - header.TCPMinimumSize})
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established, then retry once.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, including one from the first attempt.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	// Check that data gets properly segmented.
+	testBrokenUpWrite(t, c, maxPayload)
+}
+
+func TestSynCookiePassiveSendMSSLessThanMTU(t *testing.T) {
+	const maxPayload = 536
+	const mtu = 2000
+	c := context.New(t, mtu)
+	defer c.Cleanup()
+
+	// Set the SynRcvd threshold to zero to force a syn cookie based accept
+	// to happen.
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+	}
+
+	// Create the listening endpoint.
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	defer ep.Close()
+
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Do 3-way handshake.
+	c.PassiveConnect(maxPayload, -1, header.TCPSynOptions{MSS: mtu - header.IPv4MinimumSize - header.TCPMinimumSize})
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established, then retry once.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Fail on any Accept error, including one from the first attempt.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	// Check that data gets properly segmented.
+	testBrokenUpWrite(t, c, maxPayload)
+}
+
+func TestForwarderSendMSSLessThanMTU(t *testing.T) {
+	const maxPayload = 100
+	const mtu = 1200
+	c := context.New(t, mtu)
+	defer c.Cleanup()
+
+	stk := c.Stack()
+	errCh := make(chan *tcpip.Error, 1)
+	fwd := tcp.NewForwarder(stk, 65536, 10, func(req *tcp.ForwarderRequest) {
+		var err *tcpip.Error
+		c.EP, err = req.CreateEndpoint(&c.WQ)
+		errCh <- err
+	})
+	stk.SetTransportProtocolHandler(tcp.ProtocolNumber, fwd.HandlePacket)
+
+	// Perform the 3-way handshake from the peer's side.
+	c.PassiveConnect(maxPayload, -1, header.TCPSynOptions{MSS: mtu - header.IPv4MinimumSize - header.TCPMinimumSize})
+
+	// Wait for the forwarder callback to report the endpoint creation result.
+	select {
+	case err := <-errCh:
+		if err != nil {
+			t.Fatalf("Error creating endpoint: %s", err)
+		}
+	case <-time.After(2 * time.Second):
+		t.Fatalf("Timed out waiting for connection")
+	}
+
+	// Writes must be split according to the peer's MSS.
+	testBrokenUpWrite(t, c, maxPayload)
+}
+
+func TestSynOptionsOnActiveConnect(t *testing.T) {
+	const mtu = 1400
+	c := context.New(t, mtu)
+	defer c.Cleanup()
+
+	// Create TCP endpoint.
+	var err *tcpip.Error
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	// Set the buffer size to a deterministic size so that we can check the
+	// window scaling option.
+	const rcvBufferSize = 0x20000
+	const wndScale = 2
+	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBufferSize); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed: %s", rcvBufferSize, err)
+	}
+
+	// Start connection attempt.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventOut)
+	defer c.WQ.EventUnregister(&we)
+
+	if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got c.EP.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted)
+	}
+
+	// Receive SYN packet and check its MSS and window scale options.
+	b := c.GetPacket()
+	mss := uint16(mtu - header.IPv4MinimumSize - header.TCPMinimumSize)
+	checker.IPv4(t, b,
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+			checker.TCPSynOptions(header.TCPSynOptions{MSS: mss, WS: wndScale}),
+		),
+	)
+
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	// Wait for retransmit; it must repeat the same port, seq and options.
+	time.Sleep(1 * time.Second)
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+			checker.SrcPort(tcpHdr.SourcePort()),
+			checker.SeqNum(tcpHdr.SequenceNumber()),
+			checker.TCPSynOptions(header.TCPSynOptions{MSS: mss, WS: wndScale}),
+		),
+	)
+
+	// Send SYN-ACK.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: tcpHdr.DestinationPort(),
+		DstPort: tcpHdr.SourcePort(),
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Receive ACK packet.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(iss)+1),
+		),
+	)
+
+	// Wait for connection to be established.
+	select {
+	case <-ch:
+		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+			t.Fatalf("GetSockOpt failed: %s", err)
+		}
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for connection")
+	}
+}
+
+func TestCloseListener(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create listener.
+	var wq waiter.Queue
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	if err := ep.Bind(tcpip.FullAddress{}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Close the listener and fail if that takes more than 3 seconds.
+	t0 := time.Now()
+	ep.Close()
+	if diff := time.Since(t0); diff > 3*time.Second {
+		t.Fatalf("Took too long to close: %s", diff)
+	}
+}
+
+func TestReceiveOnResetConnection(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Send RST segment.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagRst,
+		SeqNum:  790,
+		RcvWnd:  30000,
+	})
+
+	// Try to read.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+loop:
+	for {
+		switch _, _, err := c.EP.Read(nil); err {
+		case tcpip.ErrWouldBlock:
+			select {
+			case <-ch:
+			case <-time.After(1 * time.Second):
+				t.Fatalf("Timed out waiting for reset to arrive")
+			}
+		case tcpip.ErrConnectionReset:
+			break loop
+		default:
+			t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
+		}
+	}
+	// Expect StateError, and later Reads to keep returning ErrConnectionReset.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrConnectionReset {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrConnectionReset)
+	}
+	if got := tcp.EndpointState(c.EP.State()); got != tcp.StateError {
+		t.Fatalf("got c.EP.State() = %v, want = %v", got, tcp.StateError)
+	}
+
+	if got := c.Stack().Stats().TCP.EstablishedResets.Value(); got != 1 {
+		t.Errorf("got stats.TCP.EstablishedResets.Value() = %d, want = 1", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got)
+	}
+}
+
+func TestSendOnResetConnection(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Send RST segment from the test peer; this test checks that a later Write reports the reset.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagRst,
+		SeqNum:  790,
+		RcvWnd:  30000,
+	})
+
+	// Wait a fixed second for the RST to be received; nothing here signals its arrival.
+	time.Sleep(1 * time.Second)
+
+	// Try to write; the Write must fail with ErrConnectionReset.
+	view := buffer.NewView(10)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != tcpip.ErrConnectionReset {
+		t.Fatalf("got c.EP.Write(...) = %s, want = %s", err, tcpip.ErrConnectionReset)
+	}
+}
+
+// TestMaxRetransmitsTimeout tests that the connection is timed out after a
+// segment has been retransmitted MaxRetries times (set to 2 for this test).
+func TestMaxRetransmitsTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const numRetries = 2
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRetriesOption(numRetries)); err != nil {
+		t.Fatalf("could not set protocol option MaxRetries: %s", err)
+	}
+
+	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
+
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventHUp)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	_, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{})
+	if err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Expect first transmit and MaxRetries retransmits, none of which is acknowledged.
+	for i := 0; i < numRetries+1; i++ {
+		checker.IPv4(t, c.GetPacket(),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh),
+			),
+		)
+	}
+	// Wait for the EventHUp notification that follows the timeout after MaxRetries retransmits.
+	initRTO := 1 * time.Second
+	select {
+	case <-notifyCh:
+	case <-time.After((2 << numRetries) * initRTO):
+		t.Fatalf("connection still alive after maximum retransmits")
+	}
+
+	// Send an ACK and expect a RST as the connection would have been closed.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %d, want = 1", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got)
+	}
+}
+
+// TestMaxRTO tests that the retransmit interval is capped to MaxRTO (set to one second here).
+func TestMaxRTO(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	rto := 1 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMaxRTOOption(rto)); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp, tcpip.TCPMaxRTOOption(%s)) failed: %s", rto, err)
+	}
+
+	c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
+
+	_, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(1)), tcpip.WriteOptions{})
+	if err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+	const numRetransmits = 2
+	for i := 0; i < numRetransmits; i++ {
+		start := time.Now() // Each retransmit is timed from the moment the previous packet was consumed.
+		checker.IPv4(t, c.GetPacket(),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+			),
+		)
+		if elapsed := time.Since(start).Round(time.Second); elapsed != rto {
+			t.Errorf("Retransmit interval not capped to MaxRTO: got %s, want %s", elapsed, rto)
+		}
+	}
+}
+
+// TestRetransmitIPv4IDUniqueness tests that the IPv4 Identification field is
+// unique on retransmits, for a 1-byte and a 512-byte payload.
+func TestRetransmitIPv4IDUniqueness(t *testing.T) {
+	for _, tc := range []struct {
+		name string
+		size int
+	}{
+		{"1Byte", 1},
+		{"512Bytes", 512},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.CreateConnected(789 /* iss */, 30000 /* rcvWnd */, -1 /* epRcvBuf */)
+
+			// Disabling PMTU discovery causes all packets sent from this socket to
+			// have DF=0. This needs to be done because the IPv4 ID uniqueness
+			// applies only to non-atomic IPv4 datagrams as defined in RFC 6864
+			// Section 4, and datagrams with DF=0 are non-atomic.
+			if err := c.EP.SetSockOptInt(tcpip.MTUDiscoverOption, tcpip.PMTUDiscoveryDont); err != nil {
+				t.Fatalf("disabling PMTU discovery via sockopt to force DF=0 failed: %s", err)
+			}
+
+			if _, _, err := c.EP.Write(tcpip.SlicePayload(buffer.NewView(tc.size)), tcpip.WriteOptions{}); err != nil {
+				t.Fatalf("Write failed: %s", err)
+			}
+			pkt := c.GetPacket()
+			checker.IPv4(t, pkt,
+				checker.FragmentFlags(0),
+				checker.TCP(
+					checker.DstPort(context.TestPort),
+					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+				),
+			)
+			idSet := map[uint16]struct{}{header.IPv4(pkt).ID(): {}}
+			// Expect three retransmitted packets, and that all packets received have
+			// unique IPv4 ID values (the original transmission included).
+			for i := 0; i <= 2; i++ {
+				pkt := c.GetPacket()
+				checker.IPv4(t, pkt,
+					checker.FragmentFlags(0),
+					checker.TCP(
+						checker.DstPort(context.TestPort),
+						checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+					),
+				)
+				id := header.IPv4(pkt).ID()
+				if _, exists := idSet[id]; exists {
+					t.Fatalf("duplicate IPv4 ID=%d found in retransmitted packet", id)
+				}
+				idSet[id] = struct{}{}
+			}
+		})
+	}
+}
+
+func TestFinImmediately(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Shutdown the write side immediately, before writing any data, and check that we get a FIN.
+	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+
+	// Ack the stack's FIN (AckNum is IRS+2) and send a FIN from the test side as well.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Check that the stack acks the FIN: the expected AckNum advances from 790 to 791.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+2),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestFinRetransmit(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Shutdown the write side immediately, before writing any data, and check that we get a FIN.
+	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+
+	// Don't acknowledge yet. We should get a retransmit of the FIN with the same SeqNum and AckNum.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+
+	// Ack the stack's FIN (AckNum is IRS+2) and send a FIN from the test side as well.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Check that the stack acks the FIN: the expected AckNum advances from 790 to 791.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+2),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestFinWithNoPendingData(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Write something out, and have it acknowledged, so that no data is pending at Shutdown.
+	view := buffer.NewView(10)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	next := uint32(c.IRS) + 1 // Sequence number expected on the stack's next segment.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+	next += uint32(len(view))
+
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	// Shutdown the write side, check that we get a FIN carrying no payload.
+	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+	next++ // The FIN takes up one sequence number.
+
+	// Ack the stack's FIN and send a FIN from the test side as well.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	// Check that the stack acks the FIN: the expected AckNum advances from 790 to 791.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestFinWithPendingDataCwndFull(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Write enough segments (tcp.InitialCwnd of them) to fill the congestion
+	// window before ACK'ing any of them.
+	view := buffer.NewView(10)
+	for i := tcp.InitialCwnd; i > 0; i-- {
+		if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+			t.Fatalf("Write failed: %s", err)
+		}
+	}
+
+	next := uint32(c.IRS) + 1 // Sequence number expected on the stack's next new segment.
+	for i := tcp.InitialCwnd; i > 0; i-- {
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(len(view)+header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(next),
+				checker.AckNum(790),
+				checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+			),
+		)
+		next += uint32(len(view))
+	}
+
+	// Shutdown the connection, check that the FIN segment isn't sent
+	// because the congestion window doesn't allow it. Wait until a
+	// retransmit of the first data segment (SeqNum IRS+1) is received instead.
+	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Send the ACK, covering all the data written so far, that will allow the FIN to be sent as well.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+	next++ // The FIN takes up one sequence number.
+
+	// Send a FIN that acknowledges everything. Get an ACK back.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestFinWithPendingData(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Write something out, and acknowledge it to get cwnd to 2.
+	view := buffer.NewView(10)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	next := uint32(c.IRS) + 1 // Sequence number expected on the stack's next segment.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+	next += uint32(len(view))
+
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	// Write new data, but don't acknowledge it, so that it is still pending at Shutdown.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+	next += uint32(len(view))
+
+	// Shutdown the connection, check that we do get a FIN even though data is unacknowledged.
+	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+	next++ // The FIN takes up one sequence number.
+
+	// Send a FIN that acknowledges everything. Get an ACK back.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestFinWithPartialAck(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Write something out, and acknowledge it to get cwnd to 2. Also send
+	// FIN from the test side in the same segment as that acknowledgement.
+	view := buffer.NewView(10)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	next := uint32(c.IRS) + 1 // Sequence number expected on the stack's next segment.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+	next += uint32(len(view))
+
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	// Check that we get an ACK for the fin: the expected AckNum advances from 790 to 791.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(791),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Write new data, but don't acknowledge it.
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(791),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+	next += uint32(len(view))
+
+	// Shutdown the connection, check that we do get a FIN even though data is unacknowledged.
+	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(791),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+	next++ // The FIN takes up one sequence number.
+
+	// Send an ACK for the data (AckNum is next-1), but not for the FIN yet.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  791,
+		AckNum:  seqnum.Value(next - 1),
+		RcvWnd:  30000,
+	})
+
+	// Check that we don't get a retransmit of the FIN within 100 milliseconds.
+	c.CheckNoPacketTimeout("FIN retransmitted when data was ack'd", 100*time.Millisecond)
+
+	// Ack the FIN.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  791,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+}
+
+func TestUpdateListenBacklog(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set up a bound endpoint that listens with an initial backlog of 10.
+	var queue waiter.Queue
+	listener, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &queue)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	if bindErr := listener.Bind(tcpip.FullAddress{}); bindErr != nil {
+		t.Fatalf("Bind failed: %s", bindErr)
+	}
+
+	if listenErr := listener.Listen(10); listenErr != nil {
+		t.Fatalf("Listen failed: %s", listenErr)
+	}
+
+	// A second Listen() on the same endpoint, with a backlog of 20, must not return an error.
+	if relistenErr := listener.Listen(20); relistenErr != nil {
+		t.Fatalf("Listen failed to update backlog: %s", relistenErr)
+	}
+
+	listener.Close()
+}
+
+func scaledSendWindow(t *testing.T, scale uint8) {
+	// This test ensures that the endpoint is using the right scaling by
+	// sending a buffer that is larger than the window size, and ensuring
+	// that the endpoint doesn't send more than allowed.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	maxPayload := defaultMTU - header.IPv4MinimumSize - header.TCPMinimumSize
+	c.CreateConnectedWithRawOptions(789, 0, -1 /* epRcvBuf */, []byte{
+		header.TCPOptionMSS, 4, byte(maxPayload / 256), byte(maxPayload % 256),
+		header.TCPOptionWS, 3, scale, header.TCPOptionNOP,
+	})
+
+	// Open up the window with a scaled value: the header carries 1, and 1<<scale bytes are expected below.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  1,
+	})
+
+	// Send some data (65535 bytes). Check that it's capped by the window size.
+	view := buffer.NewView(65535)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that only data that fits in the scaled window (1<<scale bytes) is sent.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen((1<<scale)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Reset the connection to free resources.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagRst,
+		SeqNum:  790,
+	})
+}
+
+func TestScaledSendWindow(t *testing.T) {
+	for ws := uint8(0); ws < 15; ws++ { // Exercise every window scale value from 0 through 14.
+		scaledSendWindow(t, ws)
+	}
+}
+
+func TestReceivedValidSegmentCountIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	stats := c.Stack().Stats()
+	want := stats.TCP.ValidSegmentsReceived.Value() + 1 // One more than the count taken after connection setup.
+
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  seqnum.Value(790),
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	if got := stats.TCP.ValidSegmentsReceived.Value(); got != want {
+		t.Errorf("got stats.TCP.ValidSegmentsReceived.Value() = %d, want = %d", got, want)
+	}
+	if got := c.EP.Stats().(*tcp.Stats).SegmentsReceived.Value(); got != want {
+		t.Errorf("got EP stats Stats.SegmentsReceived = %d, want = %d", got, want)
+	}
+	// Ensure there were no errors during handshake. If these stats have
+	// incremented, then the connection should not have been established.
+	if got := c.EP.Stats().(*tcp.Stats).SendErrors.NoRoute.Value(); got != 0 {
+		t.Errorf("got EP stats Stats.SendErrors.NoRoute = %d, want = %d", got, 0)
+	}
+	if got := c.EP.Stats().(*tcp.Stats).SendErrors.NoLinkAddr.Value(); got != 0 {
+		t.Errorf("got EP stats Stats.SendErrors.NoLinkAddr = %d, want = %d", got, 0)
+	}
+}
+
+func TestReceivedInvalidSegmentCountIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	stats := c.Stack().Stats()
+	want := stats.TCP.InvalidSegmentsReceived.Value() + 1 // One more than the count taken after connection setup.
+	vv := c.BuildSegment(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  seqnum.Value(790),
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:] // Skip a minimum-size IPv4 header to reach the TCP header.
+	tcpbuf[header.TCPDataOffset] = ((header.TCPMinimumSize - 1) / 4) << 4 // Corrupt the data offset; presumably shorter than a valid TCP header.
+
+	c.SendSegment(vv)
+
+	if got := stats.TCP.InvalidSegmentsReceived.Value(); got != want {
+		t.Errorf("got stats.TCP.InvalidSegmentsReceived.Value() = %d, want = %d", got, want)
+	}
+	if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.MalformedPacketsReceived.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", got, want)
+	}
+}
+
+func TestReceivedIncorrectChecksumIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+	stats := c.Stack().Stats()
+	want := stats.TCP.ChecksumErrors.Value() + 1 // One more than the count taken after connection setup.
+	vv := c.BuildSegment([]byte{0x1, 0x2, 0x3}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  seqnum.Value(790),
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+	tcpbuf := vv.ToView()[header.IPv4MinimumSize:] // Skip a minimum-size IPv4 header to reach the TCP header.
+	// Overwrite the first payload byte (0x1 becomes 0x4) after the segment was
+	// built, which should cause checksum verification to fail.
+	tcpbuf[(tcpbuf[header.TCPDataOffset]>>4)*4] = 0x4
+
+	c.SendSegment(vv)
+
+	if got := stats.TCP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.TCP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP stats Stats.ReceiveErrors.ChecksumErrors = %d, want = %d", got, want)
+	}
+}
+
+func TestReceivedSegmentQueuing(t *testing.T) {
+	// This test sends 200 segments containing a few bytes each to an
+	// endpoint and checks that they're all received and acknowledged by
+	// the endpoint, that is, that none of the segments are dropped by
+	// internal queues.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Send 200 back-to-back segments of 3 bytes each, with consecutive sequence numbers.
+	data := []byte{1, 2, 3}
+	for i := 0; i < 200; i++ {
+		c.SendPacket(data, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + i*len(data)),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+	}
+
+	// Receive ACKs until one acknowledges all segments; no ACK may go beyond that point.
+	last := seqnum.Value(790 + 200*len(data))
+	for {
+		b := c.GetPacket()
+		checker.IPv4(t, b,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+		tcpHdr := header.TCP(header.IPv4(b).Payload())
+		ack := seqnum.Value(tcpHdr.AckNumber())
+		if ack == last {
+			break
+		}
+
+		if last.LessThan(ack) {
+			t.Fatalf("Acknowledge (%v) beyond the expected (%v)", ack, last)
+		}
+	}
+}
+
+func TestReadAfterClosedState(t *testing.T) {
+	// This test ensures that calling Read() or Peek() after the endpoint
+	// has transitioned to closedState still works if there is pending
+	// data. To transition to stateClosed without calling Close(), we must
+	// shutdown the send path and the peer must send its own FIN.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 1 second so that sockets are marked closed
+	// after 1 second in TIME_WAIT state.
+	tcpTimeWaitTimeout := 1 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeoutOption(%s)) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Shutdown immediately for write, check that we get a FIN.
+	if err := c.EP.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagFin),
+		),
+	)
+
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateFinWait1; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	// Send some data and acknowledge the FIN; the same segment carries the peer's own FIN.
+	data := []byte{1, 2, 3}
+	c.SendPacket(data, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(2),
+		RcvWnd:  30000,
+	})
+
+	// Check that ACK is received, covering both the data and the peer's FIN.
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+2),
+			checker.AckNum(uint32(791+len(data))),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+
+	// Give the stack the chance to transition to closed state from
+	// TIME_WAIT.
+	time.Sleep(tcpTimeWaitTimeout * 2)
+
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateClose; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	// Wait for receive to be notified.
+	select {
+	case <-ch:
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// Check that peek works and leaves the data in place for the Read below.
+	peekBuf := make([]byte, 10)
+	n, _, err := c.EP.Peek([][]byte{peekBuf})
+	if err != nil {
+		t.Fatalf("Peek failed: %s", err)
+	}
+
+	peekBuf = peekBuf[:n]
+	if !bytes.Equal(data, peekBuf) {
+		t.Fatalf("got data = %v, want = %v", peekBuf, data)
+	}
+
+	// Receive data.
+	v, _, err := c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Read failed: %s", err)
+	}
+
+	if !bytes.Equal(data, v) {
+		t.Fatalf("got data = %v, want = %v", v, data)
+	}
+
+	// Now that we drained the queue, check that functions fail with the
+	// right error code.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrClosedForReceive {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrClosedForReceive)
+	}
+
+	if _, _, err := c.EP.Peek([][]byte{peekBuf}); err != tcpip.ErrClosedForReceive {
+		t.Fatalf("got c.EP.Peek(...) = %s, want = %s", err, tcpip.ErrClosedForReceive)
+	}
+}
+
+func TestReusePort(t *testing.T) {
+	// This test ensures that ports are immediately available for reuse
+	// after Close on the endpoints using them returns.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// First case, just an endpoint that was bound. Each case rebinds context.StackPort right after Close.
+	var err *tcpip.Error
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
+		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
+	}
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	c.EP.Close()
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
+		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
+	}
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	c.EP.Close()
+
+	// Second case, an endpoint that was bound and is connecting (Connect returned ErrConnectStarted).
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
+		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
+	}
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := c.EP.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got c.EP.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted)
+	}
+	c.EP.Close()
+
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
+		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
+	}
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	c.EP.Close()
+
+	// Third case, an endpoint that was bound and is listening.
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
+		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
+	}
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+	c.EP.Close()
+
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.ReuseAddressOption, true); err != nil {
+		t.Fatalf("SetSockOptBool ReuseAddressOption failed: %s", err)
+	}
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+}
+
+// checkRecvBufferSize reads the receive buffer size of ep through
+// GetSockOptInt and fails the test immediately if the read fails or the value
+// differs from v.
+func checkRecvBufferSize(t *testing.T, ep tcpip.Endpoint, v int) {
+	t.Helper()
+
+	got, err := ep.GetSockOptInt(tcpip.ReceiveBufferSizeOption)
+	switch {
+	case err != nil:
+		t.Fatalf("GetSockOpt failed: %s", err)
+	case int(got) != v:
+		t.Fatalf("got receive buffer size = %d, want = %d", got, v)
+	}
+}
+
+// checkSendBufferSize reads the send buffer size of ep through GetSockOptInt
+// and fails the test immediately if the read fails or the value differs from
+// v.
+func checkSendBufferSize(t *testing.T, ep tcpip.Endpoint, v int) {
+	t.Helper()
+
+	got, err := ep.GetSockOptInt(tcpip.SendBufferSizeOption)
+	switch {
+	case err != nil:
+		t.Fatalf("GetSockOpt failed: %s", err)
+	case int(got) != v:
+		t.Fatalf("got send buffer size = %d, want = %d", got, v)
+	}
+}
+
+// TestDefaultBufferSizes checks that a new TCP endpoint reports the default
+// send and receive buffer sizes, and that endpoints created after those
+// defaults are changed with SetTransportProtocolOption report the new values.
+func TestDefaultBufferSizes(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+	})
+
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	// ep is replaced further down; the closure closes whichever endpoint is
+	// current when the test returns.
+	defer func() {
+		if ep != nil {
+			ep.Close()
+		}
+	}()
+
+	// replaceEP closes the current endpoint and swaps in a freshly created
+	// one, so the new endpoint is built after the most recent option change.
+	replaceEP := func() {
+		ep.Close()
+		ep, err = s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+		if err != nil {
+			t.Fatalf("NewEndpoint failed; %s", err)
+		}
+	}
+
+	// Before any change both sizes must equal the package defaults.
+	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize)
+	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize)
+
+	// Double the default send buffer size; the receive default is untouched.
+	sendOpt := tcp.SendBufferSizeOption{
+		Min:     1,
+		Default: tcp.DefaultSendBufferSize * 2,
+		Max:     tcp.DefaultSendBufferSize * 20,
+	}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, sendOpt); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+	replaceEP()
+
+	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*2)
+	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize)
+
+	// Triple the default receive buffer size; the send default set above must
+	// still apply.
+	recvOpt := tcp.ReceiveBufferSizeOption{
+		Min:     1,
+		Default: tcp.DefaultReceiveBufferSize * 3,
+		Max:     tcp.DefaultReceiveBufferSize * 30,
+	}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, recvOpt); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %v", err)
+	}
+	replaceEP()
+
+	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*2)
+	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*3)
+}
+
+// TestMinMaxBufferSizes checks that per-endpoint buffer sizes requested
+// outside the protocol-wide [Min, Max] range are clamped to that range, for
+// both the send and the receive buffer.
+func TestMinMaxBufferSizes(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+	})
+
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	defer ep.Close()
+
+	// Install the limits: receive in [200, 20x default], send in
+	// [300, 30x default].
+	recvLimits := tcp.ReceiveBufferSizeOption{
+		Min:     200,
+		Default: tcp.DefaultReceiveBufferSize * 2,
+		Max:     tcp.DefaultReceiveBufferSize * 20,
+	}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, recvLimits); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+	sendLimits := tcp.SendBufferSizeOption{
+		Min:     300,
+		Default: tcp.DefaultSendBufferSize * 3,
+		Max:     tcp.DefaultSendBufferSize * 30,
+	}
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, sendLimits); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+
+	// One below each minimum: the endpoint must report the minimum.
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 199); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, 199) failed: %s", err)
+	}
+	checkRecvBufferSize(t, ep, 200)
+
+	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 299); err != nil {
+		t.Fatalf("SetSockOptInt(SendBufferSizeOption, 299) failed: %s", err)
+	}
+	checkSendBufferSize(t, ep, 300)
+
+	// One above each maximum: the endpoint must report the maximum.
+	if err := ep.SetSockOptInt(tcpip.ReceiveBufferSizeOption, 1+tcp.DefaultReceiveBufferSize*20); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption) failed: %s", err)
+	}
+	checkRecvBufferSize(t, ep, tcp.DefaultReceiveBufferSize*20)
+
+	if err := ep.SetSockOptInt(tcpip.SendBufferSizeOption, 1+tcp.DefaultSendBufferSize*30); err != nil {
+		t.Fatalf("SetSockOptInt(SendBufferSizeOption) failed: %s", err)
+	}
+	checkSendBufferSize(t, ep, tcp.DefaultSendBufferSize*30)
+}
+
+// TestBindToDeviceOption exercises setting and reading BindToDeviceOption on a
+// TCP endpoint: the default value, binding to a NIC that does not exist,
+// binding to an existing NIC, and unbinding by setting NIC 0.
+//
+// The sub-tests share one endpoint and run in order, so each one observes the
+// state left behind by the previous one.
+func TestBindToDeviceOption(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+	})
+
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	defer ep.Close()
+
+	if err := s.CreateNIC(321, loopback.New()); err != nil {
+		t.Errorf("CreateNIC failed: %s", err)
+	}
+
+	// nicIDPtr returns a pointer to a copy of its argument; taking the
+	// address of a NICID literal directly is a compiler error.
+	nicIDPtr := func(id tcpip.NICID) *tcpip.NICID {
+		return &id
+	}
+
+	cases := []struct {
+		name string
+		// setBindToDevice is the NIC to bind to, or nil to skip the set step.
+		setBindToDevice *tcpip.NICID
+		// setBindToDeviceError is the error expected from the set step.
+		setBindToDeviceError *tcpip.Error
+		// getBindToDevice is the value expected from the subsequent get.
+		getBindToDevice tcpip.BindToDeviceOption
+	}{
+		{name: "GetDefaultValue", getBindToDevice: 0},
+		{name: "BindToNonExistent", setBindToDevice: nicIDPtr(999), setBindToDeviceError: tcpip.ErrUnknownDevice, getBindToDevice: 0},
+		{name: "BindToExistent", setBindToDevice: nicIDPtr(321), getBindToDevice: 321},
+		{name: "UnbindToDevice", setBindToDevice: nicIDPtr(0), getBindToDevice: 0},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if tc.setBindToDevice != nil {
+				opt := tcpip.BindToDeviceOption(*tc.setBindToDevice)
+				gotErr := ep.SetSockOpt(opt)
+				if wantErr := tc.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("SetSockOpt(%#v) got %v, want %v", opt, gotErr, wantErr)
+				}
+			}
+
+			// Start from a value no test case expects; presumably this is so
+			// that a GetSockOpt which leaves its argument untouched is caught.
+			got := tcpip.BindToDeviceOption(88888)
+			if err := ep.GetSockOpt(&got); err != nil {
+				t.Errorf("GetSockOpt got %s, want %v", err, nil)
+			}
+			if want := tc.getBindToDevice; got != want {
+				t.Errorf("bindToDevice got %d, want %d", got, want)
+			}
+		})
+	}
+}
+
+func makeStack() (*stack.Stack, *tcpip.Error) {
+	s := stack.New(stack.Options{
+		NetworkProtocols: []stack.NetworkProtocol{
+			ipv4.NewProtocol(),
+			ipv6.NewProtocol(),
+		},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+	})
+
+	id := loopback.New()
+	if testing.Verbose() {
+		id = sniffer.New(id)
+	}
+
+	if err := s.CreateNIC(1, id); err != nil {
+		return nil, err
+	}
+
+	for _, ct := range []struct {
+		number  tcpip.NetworkProtocolNumber
+		address tcpip.Address
+	}{
+		{ipv4.ProtocolNumber, context.StackAddr},
+		{ipv6.ProtocolNumber, context.StackV6Addr},
+	} {
+		if err := s.AddAddress(1, ct.number, ct.address); err != nil {
+			return nil, err
+		}
+	}
+
+	s.SetRouteTable([]tcpip.Route{
+		{
+			Destination: header.IPv4EmptySubnet,
+			NIC:         1,
+		},
+		{
+			Destination: header.IPv6EmptySubnet,
+			NIC:         1,
+		},
+	})
+
+	return s, nil
+}
+
+// TestSelfConnect ensures that intentional self-connects work: an endpoint
+// that binds to a local port and then connects to that same address and port
+// ends up connected to itself and can send and receive data through the one
+// endpoint.
+func TestSelfConnect(t *testing.T) {
+	s, err := makeStack()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var wq waiter.Queue
+	ep, err := s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	defer ep.Close()
+
+	local := tcpip.FullAddress{Port: context.StackPort}
+	if err := ep.Bind(local); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	// Subscribe to writability before connecting so the completion
+	// notification cannot be missed.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&waitEntry, waiter.EventOut)
+	defer wq.EventUnregister(&waitEntry)
+
+	self := tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}
+	if err := ep.Connect(self); err != tcpip.ErrConnectStarted {
+		t.Fatalf("got ep.Connect(...) = %s, want = %s", err, tcpip.ErrConnectStarted)
+	}
+
+	// Block until the endpoint signals, then confirm the connect succeeded.
+	<-notifyCh
+	if err := ep.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+		t.Fatalf("Connect failed: %s", err)
+	}
+
+	// Send three bytes to ourselves.
+	data := []byte{1, 2, 3}
+	payload := buffer.NewView(len(data))
+	copy(payload, data)
+	if _, _, err := ep.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Switch the waiter entry over to readability and read the bytes back,
+	// waiting for one notification if nothing has arrived yet.
+	wq.EventUnregister(&waitEntry)
+	wq.EventRegister(&waitEntry, waiter.EventIn)
+	rd, _, err := ep.Read(nil)
+	if err == tcpip.ErrWouldBlock {
+		<-notifyCh
+		rd, _, err = ep.Read(nil)
+	}
+	if err != nil {
+		t.Fatalf("Read failed: %s", err)
+	}
+
+	if !bytes.Equal(data, rd) {
+		t.Fatalf("got data = %v, want = %v", rd, data)
+	}
+}
+
+// TestConnectAvoidsBoundPorts ensures that Endpoint.Connect doesn't select
+// already-bound ports.
+//
+// For every combination of an "exhausted" network/address type and a
+// "candidate" network/address type, the test binds every ephemeral port on the
+// exhausted side and then connects from the candidate side. Connect is
+// expected to return ErrNoPortAvailable when the candidate's address family
+// was reserved by those binds, and ErrConnectStarted otherwise.
+func TestConnectAvoidsBoundPorts(t *testing.T) {
+	// addressTypes lists the address types that can be used with the given
+	// network kind.
+	addressTypes := func(t *testing.T, network string) []string {
+		switch network {
+		case "ipv4":
+			return []string{"v4"}
+		case "ipv6":
+			return []string{"v6"}
+		case "dual":
+			return []string{"v6", "mapped"}
+		default:
+			t.Fatalf("unknown network: '%s'", network)
+		}
+
+		panic("unreachable")
+	}
+
+	// address returns the address of the given type: the wildcard when isAny
+	// is true, the stack's own address otherwise.
+	address := func(t *testing.T, addressType string, isAny bool) tcpip.Address {
+		switch addressType {
+		case "v4":
+			if isAny {
+				return ""
+			}
+			return context.StackAddr
+		case "v6":
+			if isAny {
+				return ""
+			}
+			return context.StackV6Addr
+		case "mapped":
+			if isAny {
+				return context.V4MappedWildcardAddr
+			}
+			return context.StackV4MappedAddr
+		default:
+			t.Fatalf("unknown address type: '%s'", addressType)
+		}
+
+		panic("unreachable")
+	}
+
+	networks := []string{"ipv4", "ipv6", "dual"}
+	for _, exhaustedNetwork := range networks {
+		t.Run(fmt.Sprintf("exhaustedNetwork=%s", exhaustedNetwork), func(t *testing.T) {
+			for _, exhaustedAddressType := range addressTypes(t, exhaustedNetwork) {
+				t.Run(fmt.Sprintf("exhaustedAddressType=%s", exhaustedAddressType), func(t *testing.T) {
+					for _, isAny := range []bool{false, true} {
+						t.Run(fmt.Sprintf("isAny=%t", isAny), func(t *testing.T) {
+							for _, candidateNetwork := range networks {
+								t.Run(fmt.Sprintf("candidateNetwork=%s", candidateNetwork), func(t *testing.T) {
+									for _, candidateAddressType := range addressTypes(t, candidateNetwork) {
+										t.Run(fmt.Sprintf("candidateAddressType=%s", candidateAddressType), func(t *testing.T) {
+											s, err := makeStack()
+											if err != nil {
+												t.Fatal(err)
+											}
+
+											var wq waiter.Queue
+											// Every endpoint created by makeEP is
+											// recorded here and closed when the
+											// sub-test ends.
+											var eps []tcpip.Endpoint
+											defer func() {
+												for _, ep := range eps {
+													ep.Close()
+												}
+											}()
+											// makeEP creates a TCP endpoint for the
+											// given network kind. "ipv6" and "dual"
+											// both use an IPv6 endpoint and differ
+											// only in the V6OnlyOption setting.
+											makeEP := func(network string) tcpip.Endpoint {
+												var networkProtocolNumber tcpip.NetworkProtocolNumber
+												switch network {
+												case "ipv4":
+													networkProtocolNumber = ipv4.ProtocolNumber
+												case "ipv6", "dual":
+													networkProtocolNumber = ipv6.ProtocolNumber
+												default:
+													t.Fatalf("unknown network: '%s'", network)
+												}
+												ep, err := s.NewEndpoint(tcp.ProtocolNumber, networkProtocolNumber, &wq)
+												if err != nil {
+													t.Fatalf("NewEndpoint failed: %s", err)
+												}
+												eps = append(eps, ep)
+												switch network {
+												case "ipv4":
+												case "ipv6":
+													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+														t.Fatalf("SetSockOptBool(V6OnlyOption(true)) failed: %s", err)
+													}
+												case "dual":
+													if err := ep.SetSockOptBool(tcpip.V6OnlyOption, false); err != nil {
+														t.Fatalf("SetSockOptBool(V6OnlyOption(false)) failed: %s", err)
+													}
+												default:
+													t.Fatalf("unknown network: '%s'", network)
+												}
+												return ep
+											}
+
+											// Work out which address families the
+											// exhausting binds reserve.
+											var v4reserved, v6reserved bool
+											switch exhaustedAddressType {
+											case "v4", "mapped":
+												v4reserved = true
+											case "v6":
+												v6reserved = true
+												// Dual stack sockets bound to v6 any reserve on v4 as
+												// well.
+												if isAny {
+													switch exhaustedNetwork {
+													case "ipv6":
+													case "dual":
+														v4reserved = true
+													default:
+														t.Fatalf("unknown network: '%s'", exhaustedNetwork)
+													}
+												}
+											default:
+												t.Fatalf("unknown address type: '%s'", exhaustedAddressType)
+											}
+											// The candidate collides when its address
+											// family is one of the reserved ones.
+											var collides bool
+											switch candidateAddressType {
+											case "v4", "mapped":
+												collides = v4reserved
+											case "v6":
+												collides = v6reserved
+											default:
+												t.Fatalf("unknown address type: '%s'", candidateAddressType)
+											}
+
+											// Bind one endpoint to every ephemeral
+											// port. The Bind result must be checked
+											// here: a silently failed bind would leave
+											// a port free and invalidate the
+											// expectation below.
+											for i := ports.FirstEphemeral; i <= math.MaxUint16; i++ {
+												if err := makeEP(exhaustedNetwork).Bind(tcpip.FullAddress{Addr: address(t, exhaustedAddressType, isAny), Port: uint16(i)}); err != nil {
+													t.Fatalf("Bind(%d) failed: %s", i, err)
+												}
+											}
+											want := tcpip.ErrConnectStarted
+											if collides {
+												want = tcpip.ErrNoPortAvailable
+											}
+											if err := makeEP(candidateNetwork).Connect(tcpip.FullAddress{Addr: address(t, candidateAddressType, false), Port: 31337}); err != want {
+												t.Fatalf("got ep.Connect(..) = %s, want = %s", err, want)
+											}
+										})
+									}
+								})
+							}
+						})
+					}
+				})
+			}
+		})
+	}
+}
+
+// TestPathMTUDiscovery verifies that the stack retransmits packets after it
+// receives an ICMP packet indicating that the path MTU has been exceeded, and
+// that none of the retransmitted segments is larger than the new limit.
+func TestPathMTUDiscovery(t *testing.T) {
+	c := context.New(t, 1500)
+	defer c.Cleanup()
+
+	// Connect with an MSS option of 1460 (1500 minus the minimum IPv4 and TCP
+	// header sizes), encoded big-endian in the raw option bytes.
+	const maxPayload = 1500 - header.TCPMinimumSize - header.IPv4MinimumSize
+	mssOption := []byte{
+		header.TCPOptionMSS, 4, byte(maxPayload / 256), byte(maxPayload % 256),
+	}
+	c.CreateConnectedWithRawOptions(789, 30000, -1 /* epRcvBuf */, mssOption)
+
+	// Queue 3200 bytes filled with a repeating byte pattern.
+	const writeSize = 3200
+	payload := buffer.NewView(writeSize)
+	for i := range payload {
+		payload[i] = byte(i)
+	}
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(payload), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// expectSegments reads len(sizes) packets, checking that each carries the
+	// listed payload size at consecutive sequence numbers starting at seqNum.
+	// It returns the packet at index which, or nil if which matches no index.
+	expectSegments := func(c *context.Context, sizes []int, which int, seqNum uint32) []byte {
+		var kept []byte
+		for idx, sz := range sizes {
+			pkt := c.GetPacket()
+			if idx == which {
+				kept = pkt
+			}
+			checker.IPv4(t, pkt,
+				checker.PayloadLen(sz+header.TCPMinimumSize),
+				checker.TCP(
+					checker.DstPort(context.TestPort),
+					checker.SeqNum(seqNum),
+					checker.AckNum(790),
+					checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+				),
+			)
+			seqNum += uint32(sz)
+		}
+		return kept
+	}
+
+	// The initial transmission arrives as two full segments plus the rest.
+	initial := []int{maxPayload, maxPayload, writeSize - 2*maxPayload}
+	first := expectSegments(c, initial, 0, uint32(c.IRS)+1)
+
+	// Report "fragmentation needed" for the first segment, advertising a
+	// smaller MTU of 1200.
+	const newMTU = 1200
+	const newMaxPayload = newMTU - header.IPv4MinimumSize - header.TCPMinimumSize
+	mtu := []byte{0, 0, newMTU / 256, newMTU % 256}
+	c.SendICMPPacket(header.ICMPv4DstUnreachable, header.ICMPv4FragmentationNeeded, mtu, first, newMTU)
+
+	// Each original full-sized segment is expected to come back split in two,
+	// with no piece exceeding the new maximum payload.
+	resent := []int{
+		newMaxPayload, maxPayload - newMaxPayload,
+		newMaxPayload, maxPayload - newMaxPayload,
+		writeSize - 2*maxPayload,
+	}
+	expectSegments(c, resent, -1, uint32(c.IRS)+1)
+}
+
+// TestTCPEndpointProbe verifies that a probe function registered with
+// AddTCPProbe is invoked when the connected endpoint receives a segment, and
+// that the endpoint ID passed to it matches the connection under test.
+func TestTCPEndpointProbe(t *testing.T) {
+	c := context.New(t, 1500)
+	defer c.Cleanup()
+
+	invoked := make(chan struct{})
+	c.Stack().AddTCPProbe(func(state stack.TCPEndpointState) {
+		// Validate that the endpoint ID is what we expect.
+		//
+		// We don't do an extensive validation of every field but a
+		// basic sanity test.
+		//
+		// Mismatches are reported with Errorf rather than Fatalf: this
+		// callback signals the test through an unbuffered channel, so it
+		// runs on a goroutine other than the test's, and FailNow (used by
+		// Fatalf) may only be called from the goroutine running the test.
+		if got, want := state.ID.LocalAddress, tcpip.Address(context.StackAddr); got != want {
+			t.Errorf("got LocalAddress: %q, want: %q", got, want)
+		}
+		if got, want := state.ID.LocalPort, c.Port; got != want {
+			t.Errorf("got LocalPort: %d, want: %d", got, want)
+		}
+		if got, want := state.ID.RemoteAddress, tcpip.Address(context.TestAddr); got != want {
+			t.Errorf("got RemoteAddress: %q, want: %q", got, want)
+		}
+		if got, want := state.ID.RemotePort, uint16(context.TestPort); got != want {
+			t.Errorf("got RemotePort: %d, want: %d", got, want)
+		}
+
+		invoked <- struct{}{}
+	})
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Deliver a small data segment to trigger the probe.
+	data := []byte{1, 2, 3}
+	c.SendPacket(data, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Give the probe a bounded amount of time to report in.
+	select {
+	case <-invoked:
+	case <-time.After(100 * time.Millisecond):
+		t.Fatalf("TCP Probe function was not called")
+	}
+}
+
+// TestStackSetCongestionControl checks the stack-wide congestion control
+// option: setting a known algorithm succeeds and is reflected by a later
+// read, while setting an unknown one fails with ErrNoSuchFile and leaves the
+// previous value in place.
+func TestStackSetCongestionControl(t *testing.T) {
+	cases := []struct {
+		cc  tcpip.CongestionControlOption
+		err *tcpip.Error
+	}{
+		{cc: "reno"},
+		{cc: "cubic"},
+		{cc: "blahblah", err: tcpip.ErrNoSuchFile},
+	}
+
+	for _, tc := range cases {
+		t.Run(fmt.Sprintf("SetTransportProtocolOption(.., %v)", tc.cc), func(t *testing.T) {
+			c := context.New(t, 1500)
+			defer c.Cleanup()
+
+			s := c.Stack()
+
+			// Remember the value in effect before the set attempt.
+			var before tcpip.CongestionControlOption
+			if err := s.TransportProtocolOption(tcp.ProtocolNumber, &before); err != nil {
+				t.Fatalf("s.TransportProtocolOption(%v, %v) = %s", tcp.ProtocolNumber, &before, err)
+			}
+
+			if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tc.cc); err != tc.err {
+				t.Fatalf("s.SetTransportProtocolOption(%v, %v) = %v, want %v", tcp.ProtocolNumber, tc.cc, err, tc.err)
+			}
+
+			var after tcpip.CongestionControlOption
+			if err := s.TransportProtocolOption(tcp.ProtocolNumber, &after); err != nil {
+				t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &after, err)
+			}
+
+			// A successful set must be visible; a failed one must leave the
+			// earlier value untouched.
+			want := before
+			if tc.err == nil {
+				want = tc.cc
+			}
+			if after != want {
+				t.Fatalf("got congestion control: %v, want: %v", after, want)
+			}
+		})
+	}
+}
+
+// TestStackAvailableCongestionControl checks that the stack reports
+// "reno cubic" as its available congestion control algorithms.
+func TestStackAvailableCongestionControl(t *testing.T) {
+	c := context.New(t, 1500)
+	defer c.Cleanup()
+
+	// Query permitted congestion control algorithms.
+	var available tcpip.AvailableCongestionControlOption
+	if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &available); err != nil {
+		t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &available, err)
+	}
+
+	want := tcpip.AvailableCongestionControlOption("reno cubic")
+	if available != want {
+		t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", available, want)
+	}
+}
+
+// TestStackSetAvailableCongestionControl checks that the list of available
+// congestion control algorithms cannot be changed: an attempt to set it must
+// return an error, and a later read must still report "reno cubic".
+func TestStackSetAvailableCongestionControl(t *testing.T) {
+	c := context.New(t, 1500)
+	defer c.Cleanup()
+
+	s := c.Stack()
+
+	// Setting AvailableCongestionControlOption should fail.
+	aCC := tcpip.AvailableCongestionControlOption("xyz")
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, &aCC); err == nil {
+		t.Fatalf("s.SetTransportProtocolOption(%v, %v) = nil, want non-nil", tcp.ProtocolNumber, &aCC)
+	}
+
+	// Verify that we still get the expected list of congestion control options.
+	var cc tcpip.AvailableCongestionControlOption
+	if err := s.TransportProtocolOption(tcp.ProtocolNumber, &cc); err != nil {
+		t.Fatalf("s.TransportProtocolOption(%v, %v) = %v", tcp.ProtocolNumber, &cc, err)
+	}
+	if got, want := cc, tcpip.AvailableCongestionControlOption("reno cubic"); got != want {
+		t.Fatalf("got tcpip.AvailableCongestionControlOption: %v, want: %v", got, want)
+	}
+}
+
+// TestEndpointSetCongestionControl checks the per-endpoint congestion control
+// option on both an unconnected and a connected endpoint: setting a known
+// algorithm succeeds and is reflected by a later read, while setting an
+// unknown one fails with ErrNoSuchFile and leaves the previous value in place.
+func TestEndpointSetCongestionControl(t *testing.T) {
+	testCases := []struct {
+		cc  tcpip.CongestionControlOption
+		err *tcpip.Error
+	}{
+		{"reno", nil},
+		{"cubic", nil},
+		{"blahblah", tcpip.ErrNoSuchFile},
+	}
+
+	for _, connected := range []bool{false, true} {
+		for _, tc := range testCases {
+			t.Run(fmt.Sprintf("SetSockOpt(.., %v) w/ connected = %v", tc.cc, connected), func(t *testing.T) {
+				c := context.New(t, 1500)
+				defer c.Cleanup()
+
+				// Create TCP endpoint.
+				var err *tcpip.Error
+				c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+				if err != nil {
+					t.Fatalf("NewEndpoint failed: %s", err)
+				}
+
+				// Remember the value in effect before the set attempt.
+				var oldCC tcpip.CongestionControlOption
+				if err := c.EP.GetSockOpt(&oldCC); err != nil {
+					t.Fatalf("c.EP.GetSockOpt(%v) = %s", &oldCC, err)
+				}
+
+				if connected {
+					c.Connect(789 /* iss */, 32768 /* rcvWnd */, nil)
+				}
+
+				if err := c.EP.SetSockOpt(tc.cc); err != tc.err {
+					t.Fatalf("c.EP.SetSockOpt(%v) = %s, want %s", tc.cc, err, tc.err)
+				}
+
+				var cc tcpip.CongestionControlOption
+				if err := c.EP.GetSockOpt(&cc); err != nil {
+					t.Fatalf("c.EP.GetSockOpt(%v) = %s", &cc, err)
+				}
+
+				got, want := cc, oldCC
+				// If SetSockOpt is expected to succeed then the
+				// returned value for congestion control should match
+				// the one specified in the SetSockOpt above, else it
+				// should be what it was before the call to SetSockOpt.
+				if tc.err == nil {
+					want = tc.cc
+				}
+				if got != want {
+					t.Fatalf("got congestion control: %v, want: %v", got, want)
+				}
+			})
+		}
+	}
+}
+
+// enableCUBIC sets the congestion control option of c's stack to "cubic" and
+// fails the test immediately if the option cannot be set.
+func enableCUBIC(t *testing.T, c *context.Context) {
+	t.Helper()
+	opt := tcpip.CongestionControlOption("cubic")
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, opt); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, %s) = %s", opt, err)
+	}
+}
+
+// TestKeepalive exercises TCP keepalives on a connected endpoint:
+//
+//  1. acknowledged keepalives keep the connection alive past the configured
+//     probe count;
+//  2. no keepalive is sent while written data is still unacknowledged;
+//  3. once the configured number of keepalives goes unanswered the connection
+//     is torn down: the stack answers a late ACK with a RST, reads return
+//     ErrTimeout and the connection statistics are updated.
+func TestKeepalive(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Configure keepalives. Every setter is checked: the rest of the test is
+	// meaningless if any of these options did not take effect.
+	const keepAliveInterval = 3 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(_)) failed: %s", err)
+	}
+	if err := c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(_)) failed: %s", err)
+	}
+	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5); err != nil {
+		t.Fatalf("c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 5) failed: %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
+		t.Fatalf("c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true) failed: %s", err)
+	}
+
+	// 5 unacked keepalives are sent. ACK each one, and check that the
+	// connection stays alive after 5.
+	for i := 0; i < 10; i++ {
+		b := c.GetPacket()
+		checker.IPv4(t, b,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)),
+				checker.AckNum(uint32(790)),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+
+		// Acknowledge the keepalive.
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  790,
+			AckNum:  c.IRS,
+			RcvWnd:  30000,
+		})
+	}
+
+	// Check that the connection is still alive.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send some data and wait before ACKing it. Keepalives should be disabled
+	// during this period.
+	view := buffer.NewView(3)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	next := uint32(c.IRS) + 1
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Wait for the packet to be retransmitted. Verify that no keepalives
+	// were sent.
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlags(header.TCPFlagAck|header.TCPFlagPsh),
+		),
+	)
+	c.CheckNoPacket("Keepalive packet received while unACKed data is pending")
+
+	next += uint32(len(view))
+
+	// Send ACK. Keepalives should start sending again.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	// Now receive 5 keepalives, but don't ACK them. The connection
+	// should be reset after 5.
+	for i := 0; i < 5; i++ {
+		b := c.GetPacket()
+		checker.IPv4(t, b,
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(next-1)),
+				checker.AckNum(uint32(790)),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	// Sleep for a little over the keepalive interval to make sure the
+	// timer has time to fire after the last keepalive and close the
+	// socket.
+	time.Sleep(keepAliveInterval + keepAliveInterval/2)
+
+	// The connection should be terminated after 5 unacked keepalives.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(next)),
+			checker.AckNum(uint32(0)),
+			checker.TCPFlags(header.TCPFlagRst),
+		),
+	)
+
+	if got := c.Stack().Stats().TCP.EstablishedTimedout.Value(); got != 1 {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout.Value() = %d, want = 1", got)
+	}
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrTimeout)
+	}
+
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got)
+	}
+}
+
+// executeHandshake plays the remote side of an IPv4 three-way handshake
+// against the endpoint listening on context.StackPort: it sends a SYN from
+// srcPort, validates the SYN-ACK the stack replies with, and sends the final
+// ACK.
+//
+// When synCookieInUse is true the SYN-ACK is additionally required to carry
+// no window scale option and an MSS of c.MSSWithoutOptions().
+//
+// It returns irs, the sequence number used in the SYN, and iss, the sequence
+// number the stack chose in its SYN-ACK.
+func executeHandshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
+	// SYN from the peer, always starting at sequence number 789.
+	irs = seqnum.Value(789)
+	syn := context.Headers{
+		SrcPort: srcPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, &syn)
+
+	// SYN-ACK from the stack; record its sequence number before validating.
+	synAck := c.GetPacket()
+	iss = seqnum.Value(header.TCP(header.IPv4(synAck).Payload()).SequenceNumber())
+
+	expect := []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(srcPort),
+		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+		checker.AckNum(uint32(irs) + 1),
+	}
+	if synCookieInUse {
+		// When cookies are in use window scaling is disabled.
+		cookieOpts := header.TCPSynOptions{
+			WS:  -1,
+			MSS: c.MSSWithoutOptions(),
+		}
+		expect = append(expect, checker.TCPSynOptions(cookieOpts))
+	}
+	checker.IPv4(t, synAck, checker.TCP(expect...))
+
+	// Final ACK completing the handshake.
+	ack := context.Headers{
+		SrcPort: srcPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, &ack)
+	return irs, iss
+}
+
+// executeV6Handshake plays the remote side of an IPv6 three-way handshake
+// against the endpoint listening on context.StackPort: it sends a SYN from
+// srcPort, validates the SYN-ACK the stack replies with, and sends the final
+// ACK.
+//
+// When synCookieInUse is true the SYN-ACK is additionally required to carry
+// no window scale option and an MSS of c.MSSWithoutOptionsV6().
+//
+// It returns irs, the sequence number used in the SYN, and iss, the sequence
+// number the stack chose in its SYN-ACK.
+func executeV6Handshake(t *testing.T, c *context.Context, srcPort uint16, synCookieInUse bool) (irs, iss seqnum.Value) {
+	// SYN from the peer, always starting at sequence number 789.
+	irs = seqnum.Value(789)
+	syn := context.Headers{
+		SrcPort: srcPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	}
+	c.SendV6Packet(nil, &syn)
+
+	// SYN-ACK from the stack; record its sequence number before validating.
+	synAck := c.GetV6Packet()
+	iss = seqnum.Value(header.TCP(header.IPv6(synAck).Payload()).SequenceNumber())
+
+	expect := []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(srcPort),
+		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+		checker.AckNum(uint32(irs) + 1),
+	}
+	if synCookieInUse {
+		// When cookies are in use window scaling is disabled.
+		cookieOpts := header.TCPSynOptions{
+			WS:  -1,
+			MSS: c.MSSWithoutOptionsV6(),
+		}
+		expect = append(expect, checker.TCPSynOptions(cookieOpts))
+	}
+	checker.IPv6(t, synAck, checker.TCP(expect...))
+
+	// Final ACK completing the handshake.
+	ack := context.Headers{
+		SrcPort: srcPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	}
+	c.SendV6Packet(nil, &ack)
+	return irs, iss
+}
+
+// TestListenBacklogFull tests that netstack does not complete handshakes if the
+// listen backlog for the endpoint is full.
+//
+// It fills a backlog of two with completed handshakes, checks that a further
+// SYN gets no reply, drains the backlog with Accept, and then checks that a new
+// handshake succeeds and yields a usable connected endpoint.
+func TestListenBacklogFull(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create TCP endpoint.
+	var err *tcpip.Error
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	// Bind to wildcard.
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	// Test acceptance.
+	// Start listening.
+	listenBacklog := 2
+	if err := c.EP.Listen(listenBacklog); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	for i := 0; i < listenBacklog; i++ {
+		executeHandshake(t, c, context.TestPort+uint16(i), false /*synCookieInUse */)
+	}
+
+	time.Sleep(50 * time.Millisecond)
+
+	// Now send one more SYN. The stack should not respond as the backlog
+	// is full at this point.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort + 2,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  seqnum.Value(789),
+		RcvWnd:  30000,
+	})
+	c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
+
+	// Try to accept the connections in the backlog.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	// accept returns the next connection from the listening endpoint. If none
+	// is ready yet it waits up to one second for a notification and retries
+	// once. Any error other than the initial ErrWouldBlock fails the test, so
+	// callers never see a nil endpoint.
+	accept := func() tcpip.Endpoint {
+		ep, _, err := c.EP.Accept()
+		if err == tcpip.ErrWouldBlock {
+			// Wait for connection to be established.
+			select {
+			case <-ch:
+				ep, _, err = c.EP.Accept()
+			case <-time.After(1 * time.Second):
+				t.Fatalf("Timed out waiting for accept")
+			}
+		}
+		if err != nil {
+			t.Fatalf("Accept failed: %s", err)
+		}
+		return ep
+	}
+
+	for i := 0; i < listenBacklog; i++ {
+		accept()
+	}
+
+	// Now verify that there are no more connections that can be accepted.
+	_, _, err = c.EP.Accept()
+	if err != tcpip.ErrWouldBlock {
+		select {
+		case <-ch:
+			t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP)
+		case <-time.After(1 * time.Second):
+		}
+	}
+
+	// Now a new handshake must succeed.
+	executeHandshake(t, c, context.TestPort+2, false /*synCookieInUse */)
+
+	newEP := accept()
+
+	// Now verify that the TCP socket is usable and in a connected state.
+	data := "Don't panic"
+	if _, _, err := newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	if string(tcpHdr.Payload()) != data {
+		t.Fatalf("unexpected data: got %s, want %s", string(tcpHdr.Payload()), data)
+	}
+}
+
+// TestListenNoAcceptNonUnicastV4 makes sure that TCP segments with a
+// non unicast IPv4 source or destination address are not accepted.
+func TestListenNoAcceptNonUnicastV4(t *testing.T) {
+	joinedGroup := tcpip.Address("\xe0\x00\x01\x02")
+	unjoinedGroup := tcpip.Address("\xe0\x00\x01\x03")
+
+	cases := []struct {
+		name    string
+		srcAddr tcpip.Address
+		dstAddr tcpip.Address
+	}{
+		{
+			name:    "SourceUnspecified",
+			srcAddr: header.IPv4Any,
+			dstAddr: context.StackAddr,
+		},
+		{
+			name:    "SourceBroadcast",
+			srcAddr: header.IPv4Broadcast,
+			dstAddr: context.StackAddr,
+		},
+		{
+			name:    "SourceOurMulticast",
+			srcAddr: joinedGroup,
+			dstAddr: context.StackAddr,
+		},
+		{
+			name:    "SourceOtherMulticast",
+			srcAddr: unjoinedGroup,
+			dstAddr: context.StackAddr,
+		},
+		{
+			name:    "DestUnspecified",
+			srcAddr: context.TestAddr,
+			dstAddr: header.IPv4Any,
+		},
+		{
+			name:    "DestBroadcast",
+			srcAddr: context.TestAddr,
+			dstAddr: header.IPv4Broadcast,
+		},
+		{
+			name:    "DestOurMulticast",
+			srcAddr: context.TestAddr,
+			dstAddr: joinedGroup,
+		},
+		{
+			name:    "DestOtherMulticast",
+			srcAddr: context.TestAddr,
+			dstAddr: unjoinedGroup,
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc // capture range variable
+
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.Create(-1)
+
+			if err := c.Stack().JoinGroup(header.IPv4ProtocolNumber, 1, joinedGroup); err != nil {
+				t.Fatalf("JoinGroup failed: %s", err)
+			}
+
+			if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+
+			if err := c.EP.Listen(1); err != nil {
+				t.Fatalf("Listen failed: %s", err)
+			}
+
+			clientISN := seqnum.Value(789)
+			c.SendPacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  clientISN,
+				RcvWnd:  30000,
+			}, tc.srcAddr, tc.dstAddr)
+			c.CheckNoPacket("Should not have received a response")
+
+			// A SYN from context.TestAddr to context.StackAddr must still get a SYN-ACK.
+			c.SendPacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  clientISN,
+				RcvWnd:  30000,
+			}, context.TestAddr, context.StackAddr)
+			checker.IPv4(t, c.GetPacket(),
+				checker.TCP(
+					checker.SrcPort(context.StackPort),
+					checker.DstPort(context.TestPort),
+					checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+					checker.AckNum(uint32(clientISN)+1)))
+		})
+	}
+}
+
+// TestListenNoAcceptNonUnicastV6 makes sure that TCP segments with a
+// non unicast IPv6 source or destination address are not accepted.
+func TestListenNoAcceptNonUnicastV6(t *testing.T) {
+	multicastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x01")      // ff0e::101
+	otherMulticastAddr := tcpip.Address("\xff\x0e\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02") // ff0e::102
+
+	tests := []struct {
+		name    string
+		srcAddr tcpip.Address
+		dstAddr tcpip.Address
+	}{
+		{
+			"SourceUnspecified",
+			header.IPv6Any,
+			context.StackV6Addr,
+		},
+		{
+			"SourceAllNodes",
+			header.IPv6AllNodesMulticastAddress,
+			context.StackV6Addr,
+		},
+		{
+			"SourceOurMulticast",
+			multicastAddr,
+			context.StackV6Addr,
+		},
+		{
+			"SourceOtherMulticast",
+			otherMulticastAddr,
+			context.StackV6Addr,
+		},
+		{
+			"DestUnspecified",
+			context.TestV6Addr,
+			header.IPv6Any,
+		},
+		{
+			"DestAllNodes",
+			context.TestV6Addr,
+			header.IPv6AllNodesMulticastAddress,
+		},
+		{
+			"DestOurMulticast",
+			context.TestV6Addr,
+			multicastAddr,
+		},
+		{
+			"DestOtherMulticast",
+			context.TestV6Addr,
+			otherMulticastAddr,
+		},
+	}
+
+	for _, test := range tests {
+		test := test // capture range variable
+
+		t.Run(test.name, func(t *testing.T) {
+			t.Parallel()
+
+			c := context.New(t, defaultMTU)
+			defer c.Cleanup()
+
+			c.CreateV6Endpoint(true)
+
+			if err := c.Stack().JoinGroup(header.IPv6ProtocolNumber, 1, multicastAddr); err != nil {
+				t.Fatalf("JoinGroup failed: %s", err)
+			}
+
+			if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+
+			if err := c.EP.Listen(1); err != nil {
+				t.Fatalf("Listen failed: %s", err)
+			}
+
+			irs := seqnum.Value(789)
+			c.SendV6PacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  irs,
+				RcvWnd:  30000,
+			}, test.srcAddr, test.dstAddr)
+			c.CheckNoPacket("Should not have received a response")
+
+			// A SYN from context.TestV6Addr to context.StackV6Addr must still get a SYN-ACK.
+			c.SendV6PacketWithAddrs(nil, &context.Headers{
+				SrcPort: context.TestPort,
+				DstPort: context.StackPort,
+				Flags:   header.TCPFlagSyn,
+				SeqNum:  irs,
+				RcvWnd:  30000,
+			}, context.TestV6Addr, context.StackV6Addr)
+			checker.IPv6(t, c.GetV6Packet(),
+				checker.TCP(
+					checker.SrcPort(context.StackPort),
+					checker.DstPort(context.TestPort),
+					checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+					checker.AckNum(uint32(irs)+1)))
+		})
+	}
+}
+
+// TestListenSynRcvdQueueFull verifies that, with a listen backlog of 1, a SYN
+// that arrives while another handshake is still incomplete gets no reply.
+func TestListenSynRcvdQueueFull(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create TCP endpoint.
+	var err *tcpip.Error
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	// Bind to wildcard.
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	// Start listening with a backlog of one.
+	listenBacklog := 1
+	if err := c.EP.Listen(listenBacklog); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send two SYN's the first one should get a SYN-ACK, the
+	// second one should not get any response and is dropped as
+	// the synRcvd count will be equal to backlog.
+	irs := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcp := header.TCP(header.IPv4(b).Payload())
+	iss := seqnum.Value(tcp.SequenceNumber())
+	tcpCheckers := []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+		checker.AckNum(uint32(irs) + 1),
+	}
+	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
+
+	// Now send one more SYN, from a different port. The stack should not
+	// respond: the previous handshake was not completed, so the accept
+	// backlog is empty but there is already one connection in synRcvd
+	// state.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort + 1,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  seqnum.Value(889),
+		RcvWnd:  30000,
+	})
+	c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
+
+	// Now complete the previous connection by sending the final ACK and
+	// verify that there is a connection to accept.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+
+	// Try to accept the connections in the backlog.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	newEP, _, err := c.EP.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			newEP, _, err = c.EP.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	// Any error left here, from either Accept call, means newEP is unusable.
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	// Now verify that the TCP socket is usable and in a connected state.
+	data := "Don't panic"
+	if _, _, err := newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+	pkt := c.GetPacket()
+	tcp = header.TCP(header.IPv4(pkt).Payload())
+	if string(tcp.Payload()) != data {
+		t.Fatalf("unexpected data: got %s, want %s", string(tcp.Payload()), data)
+	}
+}
+
+// TestListenBacklogFullSynCookieInUse sets TCPSynRcvdCountThresholdOption to 1,
+// fills a backlog of one and verifies that a further SYN gets no reply.
+func TestListenBacklogFullSynCookieInUse(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(1)); err != nil {
+		t.Fatalf("setting TCPSynRcvdCountThresholdOption to 1 failed: %s", err)
+	}
+
+	// Create TCP endpoint.
+	var err *tcpip.Error
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	// Bind to wildcard.
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	// Start listening with a backlog of one.
+	listenBacklog := 1
+	portOffset := uint16(0)
+	if err := c.EP.Listen(listenBacklog); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	executeHandshake(t, c, context.TestPort+portOffset, false)
+	portOffset++
+	// Wait for this to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+
+	// Send a SYN request.
+	irs := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		// pick a different src port for new SYN.
+		SrcPort: context.TestPort + portOffset,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	})
+	// The Syn should be dropped as the endpoint's backlog is full.
+	c.CheckNoPacketTimeout("unexpected packet received", 50*time.Millisecond)
+
+	// Accept the one connection that completed its handshake.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	_, _, err = c.EP.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			_, _, err = c.EP.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	// Now verify that there are no more connections that can be accepted.
+	_, _, err = c.EP.Accept()
+	if err != tcpip.ErrWouldBlock {
+		select {
+		case <-ch:
+			t.Fatalf("unexpected endpoint delivered on Accept: %+v", c.EP)
+		case <-time.After(1 * time.Second):
+		}
+	}
+}
+
+// TestSynRcvdBadSeqNumber verifies that an out-of-window ACK received in
+// SYN-RCVD state is answered with an ACK and leaves the handshake completable.
+func TestSynRcvdBadSeqNumber(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Create TCP endpoint.
+	var err *tcpip.Error
+	c.EP, err = c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	// Bind to wildcard.
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	// Start listening.
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN to get a SYN-ACK. This should put the ep into SYN-RCVD state
+	irs := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  irs,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	iss := seqnum.Value(tcpHdr.SequenceNumber())
+	tcpCheckers := []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+		checker.AckNum(uint32(irs) + 1),
+	}
+	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
+
+	// Now send a packet with an out-of-window sequence number
+	largeSeqnum := irs + seqnum.Value(tcpHdr.WindowSize()) + 1
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  largeSeqnum,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+
+	// Should receive an ACK with the expected SEQ number
+	b = c.GetPacket()
+	tcpCheckers = []checker.TransportChecker{
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.AckNum(uint32(irs) + 1),
+		checker.SeqNum(uint32(iss + 1)),
+	}
+	checker.IPv4(t, b, checker.TCP(tcpCheckers...))
+
+	// Now that the socket replied appropriately with the ACK,
+	// complete the connection to test that the large SEQ num
+	// did not change the state from SYN-RCVD: send the ACK that
+	// moves the connection to ESTABLISHED state.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+		RcvWnd:  30000,
+	})
+
+	// Register for accept notifications before the first Accept call so
+	// that a connection delivered right after it cannot go unnoticed.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	newEP, _, err := c.EP.Accept()
+	if err != nil && err != tcpip.ErrWouldBlock {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			newEP, _, err = c.EP.Accept()
+			if err != nil {
+				t.Fatalf("Accept failed: %s", err)
+			}
+
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+
+	// Now verify that the TCP socket is usable and in a connected state.
+	data := "Don't panic"
+	_, _, err = newEP.Write(tcpip.SlicePayload(buffer.NewViewFromBytes([]byte(data))), tcpip.WriteOptions{})
+	if err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	pkt := c.GetPacket()
+	tcpHdr = header.TCP(header.IPv4(pkt).Payload())
+	if string(tcpHdr.Payload()) != data {
+		t.Fatalf("unexpected data: got %s, want %s", string(tcpHdr.Payload()), data)
+	}
+}
+
+// TestPassiveConnectionAttemptIncrement verifies that a passively opened,
+// accepted connection increments stats.TCP.PassiveConnectionOpenings by one.
+func TestPassiveConnectionAttemptIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	c.EP = ep
+	if err := ep.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+	if err := c.EP.Listen(1); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateListen; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	stats := c.Stack().Stats()
+	want := stats.TCP.PassiveConnectionOpenings.Value() + 1
+	srcPort := uint16(context.TestPort)
+	executeHandshake(t, c, srcPort+1, false)
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	// Accept the connection established by the handshake above.
+	_, _, err = c.EP.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			_, _, err = c.EP.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	if got := stats.TCP.PassiveConnectionOpenings.Value(); got != want {
+		t.Errorf("got stats.TCP.PassiveConnectionOpenings.Value() = %d, want = %d", got, want)
+	}
+}
+
+// TestPassiveFailedConnectionAttemptIncrement verifies that a SYN sent once a
+// backlog of one is taken increments the ListenOverflowSynDrop counters.
+func TestPassiveFailedConnectionAttemptIncrement(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	stats := c.Stack().Stats()
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	c.EP = ep
+	if err := c.EP.Bind(tcpip.FullAddress{Addr: context.StackAddr, Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if err := c.EP.Listen(1); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	srcPort := uint16(context.TestPort)
+	// Now attempt a handshake; it will fill up the accept backlog.
+	executeHandshake(t, c, srcPort, false)
+
+	// Give time for the final ACK to be processed as otherwise the next handshake could
+	// get accepted before the previous one based on goroutine scheduling.
+	time.Sleep(50 * time.Millisecond)
+
+	want := stats.TCP.ListenOverflowSynDrop.Value() + 1
+
+	// Now send one more SYN request; this one should get dropped.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: srcPort + 2,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  seqnum.Value(789),
+		RcvWnd:  30000,
+	})
+
+	time.Sleep(50 * time.Millisecond)
+	if got := stats.TCP.ListenOverflowSynDrop.Value(); got != want {
+		t.Errorf("got stats.TCP.ListenOverflowSynDrop.Value() = %d, want = %d", got, want)
+	}
+	if got := c.EP.Stats().(*tcp.Stats).ReceiveErrors.ListenOverflowSynDrop.Value(); got != want {
+		t.Errorf("got EP stats Stats.ReceiveErrors.ListenOverflowSynDrop = %d, want = %d", got, want)
+	}
+
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	// Now check that there is one acceptable connection.
+	_, _, err = c.EP.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			_, _, err = c.EP.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+}
+
+// TestEndpointBindListenAcceptState checks the endpoint state reported after
+// Bind, Listen, Accept and Close, for both listening and accepted endpoints.
+func TestEndpointBindListenAcceptState(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	if _, _, err := ep.Read(nil); err != tcpip.ErrNotConnected {
+		t.Errorf("got ep.Read(nil) = %s, want = %s", err, tcpip.ErrNotConnected)
+	}
+	if got := ep.Stats().(*tcp.Stats).ReadErrors.NotConnected.Value(); got != 1 {
+		t.Errorf("got EP stats Stats.ReadErrors.NotConnected got %d want %d", got, 1)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	c.PassiveConnectWithOptions(100, 5, header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	aep, _, err := ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			aep, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil {
+		t.Fatalf("Accept failed: %s", err)
+	}
+	if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+	if err := aep.Connect(tcpip.FullAddress{Addr: context.TestAddr, Port: context.TestPort}); err != tcpip.ErrAlreadyConnected {
+		t.Errorf("unexpected error attempting to call connect on an established endpoint, got: %s, want: %s", err, tcpip.ErrAlreadyConnected)
+	}
+	// Listening endpoint remains in listen state.
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+
+	ep.Close()
+	// Give worker goroutines time to receive the close notification.
+	time.Sleep(1 * time.Second)
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateClose; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+	// Accepted endpoint remains open when the listen endpoint is closed.
+	if got, want := tcp.EndpointState(aep.State()), tcp.StateEstablished; got != want {
+		t.Errorf("unexpected endpoint state: want %s, got %s", want, got)
+	}
+}
+
+// This test verifies that the auto tuning does not grow the receive buffer if
+// the application is not reading the data actively.
+func TestReceiveBufferAutoTuningApplicationLimited(t *testing.T) {
+	const mtu = 1500
+	const mss = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
+
+	c := context.New(t, mtu)
+	defer c.Cleanup()
+
+	stk := c.Stack()
+	// Set lower limits for auto-tuning tests. This is required because the
+	// test stops the worker which can cause packets to be dropped because
+	// the segment queue holding unprocessed packets is limited to 500.
+	const receiveBufferSize = 80 << 10 // 80KB.
+	const maxReceiveBufferSize = receiveBufferSize * 10
+	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+
+	// Enable auto-tuning.
+	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+	// Change the expected window scale to match the value needed for the
+	// maximum buffer size defined above.
+	c.WindowScale = uint8(tcp.FindWndScale(maxReceiveBufferSize))
+
+	rawEP := c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, WS: 4})
+
+	// NOTE: The timestamp values in the sent packets are meaningless to the peer so
+	// we just increment the timestamp value by 1 every batch as we are not really
+	// using them for anything. Send a single byte to verify the advertised window.
+	tsVal := rawEP.TSVal + 1
+
+	// Introduce a 25ms latency by delaying the first byte.
+	latency := 25 * time.Millisecond
+	time.Sleep(latency)
+	rawEP.SendPacketWithTS([]byte{1}, tsVal)
+
+	// Verify that the ACK has the expected window.
+	wantRcvWnd := receiveBufferSize
+	wantRcvWnd = (wantRcvWnd >> uint32(c.WindowScale))
+	rawEP.VerifyACKRcvWnd(uint16(wantRcvWnd - 1))
+	time.Sleep(25 * time.Millisecond)
+
+	// Allocate a large enough payload for the test.
+	b := make([]byte, int(receiveBufferSize)*2)
+	offset := 0
+	payloadSize := receiveBufferSize - 1
+	worker := (c.EP).(interface {
+		StopWork()
+		ResumeWork()
+	})
+	tsVal++
+
+	// Stop the worker goroutine.
+	worker.StopWork()
+	start := offset
+	end := offset + payloadSize
+	packetsSent := 0
+	for ; start < end; start += mss {
+		rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
+		packetsSent++
+	}
+
+	// Resume the worker so that it only sees the packets once all of them
+	// are waiting to be read.
+	worker.ResumeWork()
+
+	// Since we read no bytes the window should goto zero till the application
+	// reads some of the data. Discard all intermediate acks except the last one.
+	if packetsSent > 100 {
+		for i := 0; i < (packetsSent / 100); i++ {
+			_ = c.GetPacket()
+		}
+	}
+	rawEP.VerifyACKRcvWnd(0)
+
+	time.Sleep(25 * time.Millisecond)
+	// Verify that sending more data when window is closed is dropped and
+	// not acked.
+	rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
+
+	// Verify that the stack sends us back an ACK with the sequence number
+	// of the last packet sent indicating it was dropped.
+	p := c.GetPacket()
+	checker.IPv4(t, p, checker.TCP(
+		checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)),
+		checker.Window(0),
+	))
+
+	// Now read all the data from the endpoint and verify that advertised
+	// window increases to the full available buffer size.
+	for {
+		_, _, err := c.EP.Read(nil)
+		if err == tcpip.ErrWouldBlock {
+			break
+		}
+		if err != nil {
+			t.Fatalf("Read failed: %s", err)
+		}
+	}
+
+	// Verify that we receive a non-zero window update ACK. When running under
+	// thread sanitizer this test can end up sending more than one ack.
+	p = c.GetPacket()
+	checker.IPv4(t, p, checker.TCP(
+		checker.AckNum(uint32(rawEP.NextSeqNum)-uint32(mss)),
+		func(t *testing.T, h header.Transport) {
+			tcp, ok := h.(header.TCP)
+			if !ok {
+				return
+			}
+			if w := tcp.WindowSize(); w == 0 || w > uint16(wantRcvWnd) {
+				t.Errorf("expected a non-zero window: got %d, want <= %d", w, wantRcvWnd)
+			}
+		},
+	))
+}
+
+// This test verifies that the auto tuning grows the advertised receive window
+// when the application actively reads the data it is sent.
+func TestReceiveBufferAutoTuning(t *testing.T) {
+	const mtu = 1500
+	const mss = mtu - header.IPv4MinimumSize - header.TCPMinimumSize
+
+	c := context.New(t, mtu)
+	defer c.Cleanup()
+
+	// Configure the receive buffer limits before enabling auto-tuning.
+	stk := c.Stack()
+	// Set lower limits for auto-tuning tests. This is required because the
+	// test stops the worker which can cause packets to be dropped because
+	// the segment queue holding unprocessed packets is limited to 300.
+	const receiveBufferSize = 80 << 10 // 80KB.
+	const maxReceiveBufferSize = receiveBufferSize * 10
+	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: receiveBufferSize, Max: maxReceiveBufferSize}); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+
+	// Enable auto-tuning.
+	if err := stk.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+	// Change the expected window scale to match the value needed for the
+	// maximum buffer size used by stack.
+	c.WindowScale = uint8(tcp.FindWndScale(maxReceiveBufferSize))
+
+	rawEP := c.CreateConnectedWithOptions(header.TCPSynOptions{TS: true, WS: 4})
+
+	wantRcvWnd := receiveBufferSize
+	scaleRcvWnd := func(rcvWnd int) uint16 {
+		return uint16(rcvWnd >> uint16(c.WindowScale))
+	}
+	// Allocate a large array to send to the endpoint.
+	b := make([]byte, receiveBufferSize*48)
+
+	// In every iteration we will send double the number of bytes sent in
+	// the previous iteration and read the same from the app. The received
+	// window should grow by at least 2x of bytes read by the app in every
+	// RTT.
+	offset := 0
+	payloadSize := receiveBufferSize / 8
+	worker := (c.EP).(interface {
+		StopWork()
+		ResumeWork()
+	})
+	tsVal := rawEP.TSVal
+	// We are going to do our own computation of what the moderated receive
+	// buffer should be based on sent/copied data per RTT and verify that
+	// the advertised window by the stack matches our calculations.
+	prevCopied := 0
+	done := false
+	latency := 1 * time.Millisecond
+	for i := 0; !done; i++ {
+		tsVal++
+
+		// Stop the worker goroutine.
+		worker.StopWork()
+		start := offset
+		end := offset + payloadSize
+		totalSent := 0
+		packetsSent := 0
+		for ; start < end; start += mss {
+			rawEP.SendPacketWithTS(b[start:start+mss], tsVal)
+			totalSent += mss
+			packetsSent++
+		}
+
+		// Resume it so that it only sees the packets once all of them
+		// are waiting to be read.
+		worker.ResumeWork()
+
+		// Give 1ms for the worker to process the packets.
+		time.Sleep(1 * time.Millisecond)
+
+		// Verify that the advertised window on the ACK is reduced by
+		// the total bytes sent.
+		expectedWnd := wantRcvWnd - totalSent
+		if packetsSent > 100 {
+			for j := 0; j < (packetsSent / 100); j++ {
+				_ = c.GetPacket()
+			}
+		}
+		rawEP.VerifyACKRcvWnd(scaleRcvWnd(expectedWnd))
+
+		// Now read all the data from the endpoint and invoke the
+		// moderation API to allow for receive buffer auto-tuning
+		// to happen before we measure the new window.
+		totalCopied := 0
+		for {
+			v, _, err := c.EP.Read(nil)
+			if err == tcpip.ErrWouldBlock {
+				break
+			}
+			if err != nil {
+				t.Fatalf("Read failed: %s", err)
+			}
+			totalCopied += len(v)
+		}
+
+		// Invoke the moderation API, which is required for auto-tuning to happen.
+		// It is normally invoked from a layer above tcpip.Endpoint, so we simulate
+		// copying to userspace by calling it explicitly here.
+		c.EP.ModerateRecvBuf(totalCopied)
+
+		// Now send a keep-alive packet to trigger an ACK so that we can
+		// measure the new window.
+		rawEP.NextSeqNum--
+		rawEP.SendPacketWithTS(nil, tsVal)
+		rawEP.NextSeqNum++
+
+		if i == 0 {
+			// In the first iteration the receiver based RTT is not
+			// yet known as a result the moderation code should not
+			// increase the advertised window.
+			rawEP.VerifyACKRcvWnd(scaleRcvWnd(wantRcvWnd))
+			prevCopied = totalCopied
+		} else {
+			rttCopied := totalCopied
+			if i == 1 {
+				// The moderation code accumulates copied bytes till RTT
+				// is established. So add in the bytes sent in the first
+				// iteration to the total bytes for this RTT.
+				rttCopied += prevCopied
+				// Now reset it to the initial value used by the
+				// auto tuning logic.
+				prevCopied = tcp.InitialCwnd * mss * 2
+			}
+			newWnd := rttCopied<<1 + 16*mss
+			grow := (newWnd * (rttCopied - prevCopied)) / prevCopied
+			newWnd += (grow << 1)
+			if newWnd > maxReceiveBufferSize {
+				newWnd = maxReceiveBufferSize
+				done = true
+			}
+			rawEP.VerifyACKRcvWnd(scaleRcvWnd(newWnd))
+			wantRcvWnd = newWnd
+			prevCopied = rttCopied
+			// Increase the latency after first two iterations to establish
+			// a low RTT value in the receiver since it only tracks the
+			// lowest value. This ensures that when ModerateRecvBuf is
+			// called the elapsed time is always > rtt. Without this the
+			// test is flaky due to delays due to scheduling/wakeup etc.
+			latency += 50 * time.Millisecond
+		}
+		time.Sleep(latency)
+		offset += payloadSize
+		payloadSize *= 2
+	}
+}
+
+// TestDelayEnabled checks that tcp.DelayEnabled sets new endpoints' DelayOption.
+func TestDelayEnabled(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	checkDelayOption(t, c, false, false) // Delay is disabled by default.
+	cases := []struct {
+		delayEnabled    tcp.DelayEnabled
+		wantDelayOption bool
+	}{{delayEnabled: false, wantDelayOption: false}, {delayEnabled: true, wantDelayOption: true}}
+	for _, v := range cases {
+		func() {
+			caseCtx := context.New(t, defaultMTU)
+			defer caseCtx.Cleanup() // Runs when this iteration's closure returns.
+			if err := caseCtx.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, v.delayEnabled); err != nil {
+				t.Fatalf("SetTransportProtocolOption(tcp, %t) failed: %s", v.delayEnabled, err)
+			}
+			checkDelayOption(t, caseCtx, v.delayEnabled, v.wantDelayOption)
+		}()
+	}
+}
+
+// checkDelayOption asserts that the stack's tcp.DelayEnabled option equals
+// wantDelayEnabled and that tcpip.DelayOption on a newly created TCP endpoint
+// equals wantDelayOption.
+func checkDelayOption(t *testing.T, c *context.Context, wantDelayEnabled tcp.DelayEnabled, wantDelayOption bool) {
+	t.Helper()
+
+	var stackDelay tcp.DelayEnabled
+	if err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &stackDelay); err != nil {
+		t.Fatalf("TransportProtocolOption(tcp, &gotDelayEnabled) failed: %s", err)
+	} else if stackDelay != wantDelayEnabled {
+		t.Errorf("TransportProtocolOption(tcp, &gotDelayEnabled) got %t, want %t", stackDelay, wantDelayEnabled)
+	}
+
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, new(waiter.Queue))
+	if err != nil {
+		t.Fatalf("NewEndPoint(tcp, ipv4, new(waiter.Queue)) failed: %s", err)
+	}
+	if epDelay, err := ep.GetSockOptBool(tcpip.DelayOption); err != nil {
+		t.Fatalf("ep.GetSockOptBool(tcpip.DelayOption) failed: %s", err)
+	} else if epDelay != wantDelayOption {
+		t.Errorf("ep.GetSockOptBool(tcpip.DelayOption) got: %t, want: %t", epDelay, wantDelayOption)
+	}
+}
+
+func TestTCPLingerTimeout(t *testing.T) {
+	c := context.New(t, 1500 /* mtu */)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	// Each case sets the option on the shared endpoint and reads it back.
+	// Negative values are expected to read back as zero, and values above the
+	// stack's limit (tcp.DefaultTCPLingerTimeout, 60 seconds) as that limit.
+	for _, tc := range []struct {
+		name      string
+		requested time.Duration
+		effective time.Duration
+	}{
+		{name: "NegativeLingerTimeout", requested: -123123, effective: 0},
+		{name: "ZeroLingerTimeout", requested: 0, effective: 0},
+		{name: "InRangeLingerTimeout", requested: 10 * time.Second, effective: 10 * time.Second},
+		{name: "AboveMaxLingerTimeout", requested: 65 * time.Second, effective: 60 * time.Second},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			if err := c.EP.SetSockOpt(tcpip.TCPLingerTimeoutOption(tc.requested)); err != nil {
+				t.Fatalf("SetSockOpt(%s) = %s", tc.requested, err)
+			}
+			var opt tcpip.TCPLingerTimeoutOption
+			if err := c.EP.GetSockOpt(&opt); err != nil {
+				t.Fatalf("GetSockOpt(tcpip.TCPLingerTimeoutOption) = %s", err)
+			}
+			if got := time.Duration(opt); got != tc.effective {
+				t.Fatalf("unexpected linger timeout got: %s, want: %s", got, tc.effective)
+			}
+		})
+	}
+}
+
+// TestTCPTimeWaitRSTIgnored checks that a RST received in TIME_WAIT is ignored while an out of order ACK is still answered.
+func TestTCPTimeWaitRSTIgnored(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil { // Covers both the initial and the retried Accept.
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Now send a RST and this should be ignored and not
+	// generate an ACK.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagRst,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	})
+
+	c.CheckNoPacketTimeout("unexpected packet received in TIME_WAIT state", 1*time.Second)
+
+	// Out of order ACK should generate an immediate ACK in
+	// TIME_WAIT.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 3,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+}
+
+// TestTCPTimeWaitOutOfOrder checks that an out of order ACK received in TIME_WAIT triggers an immediate ACK.
+func TestTCPTimeWaitOutOfOrder(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil { // Covers both the initial and the retried Accept.
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Out of order ACK should generate an immediate ACK in
+	// TIME_WAIT.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 3,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+}
+
+// TestTCPTimeWaitNewSyn checks that in TIME_WAIT a SYN reusing an old sequence number is
+// ignored, while a SYN with a higher sequence number starts a new connection.
+func TestTCPTimeWaitNewSyn(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil { // Covers both the initial and the retried Accept.
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Send a SYN request w/ sequence number lower than
+	// the highest sequence number sent. We just reuse
+	// the same number.
+	iss = seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	c.CheckNoPacketTimeout("unexpected packet received in response to SYN", 1*time.Second)
+
+	// Send a SYN request w/ sequence number higher than
+	// the highest sequence number sent.
+	iss = seqnum.Value(792)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b = c.GetPacket()
+	tcpHdr = header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil { // Covers both the initial and the retried Accept.
+		t.Fatalf("Accept failed: %s", err)
+	}
+}
+
+// TestTCPTimeWaitDuplicateFINExtendsTimeWait checks that a duplicate FIN received in TIME_WAIT restarts the TIME_WAIT timer.
+func TestTCPTimeWaitDuplicateFINExtendsTimeWait(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
+	// after 5 seconds in TIME_WAIT state.
+	tcpTimeWaitTimeout := 5 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeoutOption(%s)) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	want := c.Stack().Stats().TCP.EstablishedClosed.Value() + 1
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil { // Covers both the initial and the retried Accept.
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	c.EP.Close()
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+1),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	time.Sleep(2 * time.Second)
+
+	// Now send a duplicate FIN. This should cause the TIME_WAIT to extend
+	// by another 5 seconds and also send us a duplicate ACK as it should
+	// indicate that the final ACK was potentially lost.
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+2)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Sleep for 4 seconds so at this point we are 1 second past the
+	// original tcpTimeWaitTimeout of 5 seconds.
+	time.Sleep(4 * time.Second)
+
+	// Send an ACK and it should not generate any packet as the socket
+	// should still be in TIME_WAIT, which was extended by the duplicate
+	// FIN we sent earlier.
+	*ackHeaders = *finHeaders
+	ackHeaders.SeqNum = ackHeaders.SeqNum + 1
+	ackHeaders.Flags = header.TCPFlagAck
+	c.SendPacket(nil, ackHeaders)
+
+	c.CheckNoPacketTimeout("unexpected packet received from endpoint in TIME_WAIT", 1*time.Second)
+	// Now sleep for another 2 seconds so that we are past the
+	// extended TIME_WAIT of 7 seconds (2 + 5).
+	time.Sleep(2 * time.Second)
+
+	// Resend the same ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Receive the RST that should be generated as there is no valid
+	// endpoint.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(ackHeaders.AckNum)),
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+
+	if got := c.Stack().Stats().TCP.EstablishedClosed.Value(); got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedClosed = %d, want = %d", got, want)
+	}
+	if got := c.Stack().Stats().TCP.CurrentEstablished.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentEstablished.Value() = %d, want = 0", got)
+	}
+}
+
+// TestTCPCloseWithData checks that after a passive close the endpoint can still send data, then closes and ends up CLOSED.
+func TestTCPCloseWithData(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	// Set TCPTimeWaitTimeout to 5 seconds so that sockets are marked closed
+	// after 5 seconds in TIME_WAIT state.
+	tcpTimeWaitTimeout := 5 * time.Second
+	if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPTimeWaitTimeoutOption(tcpTimeWaitTimeout)); err != nil {
+		t.Fatalf("c.Stack().SetTransportProtocolOption(tcp, tcpip.TCPTimeWaitTimeoutOption(%s)) failed: %s", tcpTimeWaitTimeout, err)
+	}
+
+	wq := &waiter.Queue{}
+	ep, err := c.Stack().NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		t.Fatalf("NewEndpoint failed: %s", err)
+	}
+	if err := ep.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		t.Fatalf("Listen failed: %s", err)
+	}
+
+	// Send a SYN request.
+	iss := seqnum.Value(789)
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+	})
+
+	// Receive the SYN-ACK reply.
+	b := c.GetPacket()
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	ackHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+		RcvWnd:  30000,
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+		case <-time.After(1 * time.Second):
+			t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil { // Covers both the initial and the retried Accept.
+		t.Fatalf("Accept failed: %s", err)
+	}
+
+	// Now trigger a passive close by sending a FIN.
+	finHeaders := &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck | header.TCPFlagFin,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 2,
+		RcvWnd:  30000,
+	}
+
+	c.SendPacket(nil, finHeaders)
+
+	// Get the ACK to the FIN we just sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(uint32(iss)+2),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Now write a few bytes and then close the endpoint.
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	// Check that data is received.
+	b = c.GetPacket()
+	checker.IPv4(t, b,
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(iss)+2), // Acknum is initial sequence number + 2 (the peer's SYN and FIN).
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	if p := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(data, p) {
+		t.Errorf("got data = %x, want = %x", p, data)
+	}
+
+	c.EP.Close()
+	// Check the FIN.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)+uint32(len(data))),
+		checker.AckNum(uint32(iss+2)),
+		checker.TCPFlags(header.TCPFlagFin|header.TCPFlagAck)))
+
+	// First send a partial ACK.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)-1),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Now send a full ACK.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Now ACK the FIN.
+	ackHeaders.AckNum++
+	c.SendPacket(nil, ackHeaders)
+
+	// Now send an ACK and we should get a RST back as the endpoint should
+	// be in CLOSED state.
+	ackHeaders = &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 2,
+		AckNum:  c.IRS + 1 + seqnum.Value(len(data)),
+		RcvWnd:  30000,
+	}
+	c.SendPacket(nil, ackHeaders)
+
+	// Check the RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(ackHeaders.AckNum)),
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+}
+
+// TestTCPUserTimeout checks that unacknowledged data causes the connection to be closed once TCPUserTimeoutOption expires.
+func TestTCPUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventHUp)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
+
+	// Ensure that on the next retransmit timer fire, the user timeout has
+	// expired.
+	initRTO := 1 * time.Second
+	userTimeout := initRTO / 2
+	if err := c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)); err != nil {
+		t.Fatalf("SetSockOpt(TCPUserTimeoutOption(%s)) failed: %s", userTimeout, err)
+	}
+
+	// Send some data and wait before ACKing it.
+	view := buffer.NewView(3)
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Write failed: %s", err)
+	}
+
+	next := uint32(c.IRS) + 1
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(len(view)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(next),
+			checker.AckNum(790),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Wait for the retransmit timer to be fired and the user timeout to cause
+	// close of the connection.
+	select {
+	case <-notifyCh:
+	case <-time.After(2 * initRTO):
+		t.Fatalf("connection still alive after %s, should have been closed after :%s", 2*initRTO, userTimeout)
+	}
+
+	// No packet should be received as the connection should be silently
+	// closed due to timeout.
+	c.CheckNoPacket("unexpected packet received after userTimeout has expired")
+
+	next += uint32(len(view))
+
+	// The connection should be terminated after userTimeout has expired.
+	// Send an ACK to trigger a RST from the stack as the endpoint should
+	// be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(next),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(next),
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrTimeout)
+	}
+
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %d, want = %d", got, want)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got)
+	}
+}
+
+func TestKeepaliveWithUserTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnected(789, 30000, -1 /* epRcvBuf */)
+
+	origEstablishedTimedout := c.Stack().Stats().TCP.EstablishedTimedout.Value()
+
+	const keepAliveInterval = 3 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.KeepaliveIdleOption(100 * time.Millisecond)); err != nil {
+		t.Fatalf("SetSockOpt(KeepaliveIdleOption) failed: %s", err)
+	}
+	if err := c.EP.SetSockOpt(tcpip.KeepaliveIntervalOption(keepAliveInterval)); err != nil {
+		t.Fatalf("SetSockOpt(KeepaliveIntervalOption) failed: %s", err)
+	}
+	if err := c.EP.SetSockOptInt(tcpip.KeepaliveCountOption, 10); err != nil {
+		t.Fatalf("SetSockOptInt(KeepaliveCountOption, 10) failed: %s", err)
+	}
+	if err := c.EP.SetSockOptBool(tcpip.KeepaliveEnabledOption, true); err != nil {
+		t.Fatalf("SetSockOptBool(KeepaliveEnabledOption, true) failed: %s", err)
+	}
+
+	// Set userTimeout to the duration of one keepalive interval, so that after
+	// the first probe is sent the second one should cause the connection to be
+	// closed due to userTimeout being hit.
+	userTimeout := 1 * keepAliveInterval
+	if err := c.EP.SetSockOpt(tcpip.TCPUserTimeoutOption(userTimeout)); err != nil {
+		t.Fatalf("SetSockOpt(TCPUserTimeoutOption(%s)) failed: %s", userTimeout, err)
+	}
+
+	// Check that the connection is still alive.
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrWouldBlock {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Now receive one keepalive, but don't ACK it.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS)),
+		checker.AckNum(790),
+		checker.TCPFlags(header.TCPFlagAck)))
+
+	// Sleep for a little over the keepalive interval to make sure the timer
+	// has time to fire after the last ACK and close the socket.
+	time.Sleep(keepAliveInterval + keepAliveInterval/2)
+
+	// The connection should be closed with a timeout. Send an ACK to trigger
+	// a RST from the stack as the endpoint should be dead.
+	c.SendPacket(nil, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  790,
+		AckNum:  seqnum.Value(c.IRS + 1),
+		RcvWnd:  30000,
+	})
+
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.SeqNum(uint32(c.IRS+1)),
+		checker.AckNum(0),
+		checker.TCPFlags(header.TCPFlagRst)))
+
+	if _, _, err := c.EP.Read(nil); err != tcpip.ErrTimeout {
+		t.Fatalf("got c.EP.Read(nil) = %s, want = %s", err, tcpip.ErrTimeout)
+	}
+	if got, want := c.Stack().Stats().TCP.EstablishedTimedout.Value(), origEstablishedTimedout+1; got != want {
+		t.Errorf("got c.Stack().Stats().TCP.EstablishedTimedout = %d, want = %d", got, want)
+	}
+	if got := c.Stack().Stats().TCP.CurrentConnected.Value(); got != 0 {
+		t.Errorf("got stats.TCP.CurrentConnected.Value() = %d, want = 0", got)
+	}
+}
+
+func TestIncreaseWindowOnReceive(t *testing.T) {
+	// This test ensures that the endpoint sends an ACK after a read
+	// when the receive window grows to more than 1 MSS.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const rcvBuf = 65535 * 10
+	c.CreateConnected(789, 30000, rcvBuf)
+
+	// Write chunks of defaultMTU/2 bytes. It's important that two
+	// payloads make it equal or longer than MSS.
+	remain := rcvBuf
+	sent := 0
+	data := make([]byte, defaultMTU/2)
+	lastWnd := uint16(0)
+
+	for remain > len(data) {
+		c.SendPacket(data, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + sent),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		sent += len(data)
+		remain -= len(data)
+		// The ACK is expected to advertise the remaining buffer space, capped at 0xffff.
+		lastWnd = uint16(remain)
+		if remain > 0xffff {
+			lastWnd = 0xffff
+		}
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+sent)),
+				checker.Window(lastWnd),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	if lastWnd == 0xffff || lastWnd == 0 {
+		t.Fatalf("expected small, non-zero window: %d", lastWnd)
+	}
+
+	// We now have < 1 MSS in the buffer space. Read the data! An
+	// ACK should be sent in response to that. The window was not
+	// zero, but it grew to larger than MSS.
+	if _, _, err := c.EP.Read(nil); err != nil {
+		t.Fatalf("Read failed: %s", err)
+	}
+
+	if _, _, err := c.EP.Read(nil); err != nil {
+		t.Fatalf("Read failed: %s", err)
+	}
+
+	// After reading two packets, we surely crossed MSS. See the ACK:
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+sent)),
+			checker.Window(uint16(0xffff)),
+			checker.TCPFlags(header.TCPFlagAck),
+		),
+	)
+}
+
+func TestIncreaseWindowOnBufferResize(t *testing.T) {
+	// This test ensures that the endpoint sends an ACK after the
+	// available receive buffer grows to more than 1 MSS.
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	const rcvBuf = 65535 * 10
+	c.CreateConnected(789, 30000, rcvBuf)
+
+	// Write chunks of defaultMTU/2 bytes. It's important that two
+	// payloads make it equal or longer than MSS.
+	remain := rcvBuf
+	sent := 0
+	data := make([]byte, defaultMTU/2)
+	lastWnd := uint16(0)
+
+	for remain > len(data) {
+		c.SendPacket(data, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			Flags:   header.TCPFlagAck,
+			SeqNum:  seqnum.Value(790 + sent),
+			AckNum:  c.IRS.Add(1),
+			RcvWnd:  30000,
+		})
+		sent += len(data)
+		remain -= len(data)
+
+		lastWnd = uint16(remain)
+		if remain > 0xffff {
+			lastWnd = 0xffff
+		}
+		checker.IPv4(t, c.GetPacket(),
+			checker.PayloadLen(header.TCPMinimumSize),
+			checker.TCP(
+				checker.DstPort(context.TestPort),
+				checker.SeqNum(uint32(c.IRS)+1),
+				checker.AckNum(uint32(790+sent)),
+				checker.Window(lastWnd),
+				checker.TCPFlags(header.TCPFlagAck),
+			),
+		)
+	}
+
+	if lastWnd == 0xffff || lastWnd == 0 {
+		t.Fatalf("expected small, non-zero window: %d", lastWnd)
+	}
+
+	// Increasing the buffer size should generate an ACK, since the
+	// window grew from a small value to at least one MSS.
+	if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, rcvBuf*2); err != nil {
+		t.Fatalf("SetSockOptInt(ReceiveBufferSizeOption, %d) failed: %s", rcvBuf*2, err)
+	}
+
+	// No data was read; the resize alone must have opened the window. See the ACK:
+	checker.IPv4(t, c.GetPacket(),
+		checker.PayloadLen(header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(790+sent)),
+			checker.Window(uint16(0xffff)),
+			checker.TCPFlags(header.TCPFlagAck)))
+}
+
+// TestTCPDeferAccept checks that with TCPDeferAcceptOption set a connection only becomes acceptable once data arrives.
+func TestTCPDeferAccept(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) // Below, irs numbers the test's packets and iss the stack's.
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give a bit of time for the socket to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+// TestTCPDeferAcceptTimeout checks that once the defer-accept period expires the SYN-ACK is retransmitted and data still completes the accept.
+func TestTCPDeferAcceptTimeout(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	c.Create(-1)
+
+	if err := c.EP.Bind(tcpip.FullAddress{Port: context.StackPort}); err != nil {
+		t.Fatal("Bind failed:", err)
+	}
+
+	if err := c.EP.Listen(10); err != nil {
+		t.Fatal("Listen failed:", err)
+	}
+
+	const tcpDeferAccept = 1 * time.Second
+	if err := c.EP.SetSockOpt(tcpip.TCPDeferAcceptOption(tcpDeferAccept)); err != nil {
+		t.Fatalf("c.EP.SetSockOpt(TCPDeferAcceptOption(%s) failed: %s", tcpDeferAccept, err)
+	}
+
+	irs, iss := executeHandshake(t, c, context.TestPort, false /* synCookiesInUse */) // Below, irs numbers the test's packets and iss the stack's.
+
+	if _, _, err := c.EP.Accept(); err != tcpip.ErrWouldBlock {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: %s", err, tcpip.ErrWouldBlock)
+	}
+
+	// Sleep for a little over the tcpDeferAccept timeout.
+	time.Sleep(tcpDeferAccept + 100*time.Millisecond)
+
+	// On timeout expiry we should get a SYN-ACK retransmission.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck|header.TCPFlagSyn),
+		checker.AckNum(uint32(irs)+1)))
+
+	// Send data. This should result in an acceptable endpoint.
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: context.StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  irs + 1,
+		AckNum:  iss + 1,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+
+	// Give some time for the endpoint to be delivered to the accept queue.
+	time.Sleep(50 * time.Millisecond)
+	aep, _, err := c.EP.Accept()
+	if err != nil {
+		t.Fatalf("c.EP.Accept() returned unexpected error got: %s, want: nil", err)
+	}
+
+	aep.Close()
+	// Closing aep without reading the data should trigger a RST.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.SrcPort(context.StackPort),
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagRst|header.TCPFlagAck),
+		checker.SeqNum(uint32(iss+1)),
+		checker.AckNum(uint32(irs+5))))
+}
+
+// TestResetDuringClose races an incoming RST against Close() on an endpoint with unread data; neither order may panic.
+func TestResetDuringClose(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+	iss := seqnum.Value(789)
+	c.CreateConnected(iss, 30000, -1 /* epRecvBuf */)
+	// Send some data to make sure there is some unread
+	// data to trigger a reset on c.Close.
+	irs := c.IRS
+	c.SendPacket([]byte{1, 2, 3, 4}, &context.Headers{
+		SrcPort: context.TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss.Add(1),
+		AckNum:  irs.Add(1),
+		RcvWnd:  30000,
+	})
+
+	// Receive ACK for the data we sent.
+	checker.IPv4(t, c.GetPacket(), checker.TCP(
+		checker.DstPort(context.TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(irs.Add(1))),
+		checker.AckNum(uint32(iss.Add(5)))))
+
+	// Close in a separate goroutine so that we can trigger
+	// a race with the RST we send below. This should not
+	// panic due to the route being released depending on
+	// whether Close() sends an active RST or the RST sent
+	// below is processed by the worker first.
+	var wg sync.WaitGroup
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.SendPacket(nil, &context.Headers{
+			SrcPort: context.TestPort,
+			DstPort: c.Port,
+			SeqNum:  iss.Add(5),
+			AckNum:  c.IRS.Add(5),
+			RcvWnd:  30000,
+			Flags:   header.TCPFlagRst,
+		})
+	}()
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		c.EP.Close()
+	}()
+
+	wg.Wait() // Both goroutines must finish before the deferred c.Cleanup() runs.
+}
diff --git a/pkg/tcpip/transport/tcp/tcp_timestamp_test.go b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
new file mode 100644
index 000000000..8edbff964
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/tcp_timestamp_test.go
@@ -0,0 +1,291 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp_test
+
+import (
+	"bytes"
+	"math/rand"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp/testing/context"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// createConnectedWithTimestampOption connects c.EP through
+// CreateConnectedWithOptions using SYN options that have the timestamp option
+// turned on with a TSVal of 1, and returns the resulting raw endpoint.
+func createConnectedWithTimestampOption(c *context.Context) *context.RawEndpoint {
+	synOpts := header.TCPSynOptions{
+		TS:    true,
+		TSVal: 1,
+	}
+	return c.CreateConnectedWithOptions(synOpts)
+}
+
+// TestTimeStampEnabledConnect tests that netstack sends the timestamp option on
+// an active connect and sets the TS Echo Reply fields correctly when the
+// SYN-ACK also indicates support for the TS option and provides a TSVal.
+//
+// After the handshake it sends five 3-byte segments (one of them out of
+// order) with varying TSVal values, checks the TSEcr echoed in each ACK, and
+// finally reads the five payloads back from the endpoint.
+func TestTimeStampEnabledConnect(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	rep := createConnectedWithTimestampOption(c)
+
+	// Register for read and validate that we have data to read.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	// The following tests ensure that TS option once enabled behaves
+	// correctly as described in
+	// https://tools.ietf.org/html/rfc7323#section-4.3.
+	//
+	// We are not testing delayed ACKs here, but we do test out of order
+	// packet delivery and filling the sequence number hole created due to
+	// the out of order packet.
+	//
+	// The test also verifies that the sequence numbers and timestamps are
+	// as expected.
+	data := []byte{1, 2, 3}
+
+	// First we increment tsVal by a small amount.
+	tsVal := rep.TSVal + 100
+	rep.SendPacketWithTS(data, tsVal)
+	rep.VerifyACKWithTS(tsVal)
+
+	// Next we send an out of order packet.
+	rep.NextSeqNum += 3
+	tsVal += 200
+	rep.SendPacketWithTS(data, tsVal)
+
+	// The ACK should contain the original sequenceNumber and an older TS.
+	rep.NextSeqNum -= 6
+	rep.VerifyACKWithTS(tsVal - 200)
+
+	// Next we fill the hole and the returned ACK should contain the
+	// cumulative sequence number acking all data sent till now and have the
+	// latest timestamp sent below in its TSEcr field.
+	tsVal -= 100
+	rep.SendPacketWithTS(data, tsVal)
+	rep.NextSeqNum += 3
+	rep.VerifyACKWithTS(tsVal)
+
+	// Increment tsVal by a large value that doesn't result in a wrap around.
+	tsVal += 0x7fffffff
+	rep.SendPacketWithTS(data, tsVal)
+	rep.VerifyACKWithTS(tsVal)
+
+	// Increment tsVal again by a large value which should cause the
+	// timestamp value to wrap around. The returned ACK should contain the
+	// wrapped around timestamp in its tsEcr field and not the tsVal from
+	// the previous packet sent above.
+	tsVal += 0x7fffffff
+	rep.SendPacketWithTS(data, tsVal)
+	rep.VerifyACKWithTS(tsVal)
+
+	select {
+	case <-ch:
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// There should be 5 views to read and each of them should
+	// contain the same data.
+	for i := 0; i < 5; i++ {
+		got, _, err := c.EP.Read(nil)
+		if err != nil {
+			t.Fatalf("Unexpected error from Read: %v", err)
+		}
+		// bytes.Equal states the intent (equality, not ordering) directly.
+		if want := data; !bytes.Equal(got, want) {
+			t.Fatalf("Data is different: got: %v, want: %v", got, want)
+		}
+	}
+}
+
+// TestTimeStampDisabledConnect tests that netstack sends timestamp option on an
+// active connect but if the SYN-ACK doesn't specify the TS option then
+// timestamp option is not enabled and future packets do not contain a
+// timestamp.
+func TestTimeStampDisabledConnect(t *testing.T) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	c.CreateConnectedWithOptions(header.TCPSynOptions{})
+}
+
+// timeStampEnabledAccept accepts a connection whose SYN carries the timestamp
+// option with a random TSVal, writes three bytes on the accepted endpoint and
+// checks that the outgoing data segment carries a timestamp option.
+//
+// If cookieEnabled is true, TCPSynRcvdCountThresholdOption is set to 0 before
+// accepting. wndScale is passed to AcceptWithOptions and wndSize is the window
+// value expected in the outgoing segment.
+func timeStampEnabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndSize uint16) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	if cookieEnabled {
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		}
+	}
+
+	t.Logf("Test w/ CookieEnabled = %v", cookieEnabled)
+	tsVal := rand.Uint32()
+	c.AcceptWithOptions(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS, TS: true, TSVal: tsVal})
+
+	// Now send some data and validate that timestamp is echoed correctly in the ACK.
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Unexpected error from Write: %s", err)
+	}
+
+	// Check that data is received and that the timestamp option TSEcr field
+	// matches the expected value.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		// Add 12 bytes for the timestamp option + 2 NOPs to align at 4
+		// byte boundary.
+		checker.PayloadLen(len(data)+header.TCPMinimumSize+12),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			// NOTE(review): 790 is presumably the test peer's initial
+			// sequence number (789) plus one; confirm against
+			// AcceptWithOptions.
+			checker.AckNum(790),
+			checker.Window(wndSize),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+			// NOTE(review): the expected TSEcr is tsVal+1, presumably
+			// because AcceptWithOptions sends a later segment with an
+			// incremented TSVal; confirm against AcceptWithOptions.
+			checker.TCPTimestampChecker(true, 0, tsVal+1),
+		),
+	)
+}
+
+// TestTimeStampEnabledAccept tests that if the SYN on a passive connect
+// specifies the Timestamp option then the Timestamp option is sent on a SYN-ACK
+// and echoes the tsVal field of the original SYN in the tsEcr field of the
+// SYN-ACK. We cover the cases where SYN cookies are enabled/disabled and verify
+// that Timestamp option is enabled in both cases if requested in the original
+// SYN.
+//
+// Each case runs as its own subtest so that a fatal failure in one case does
+// not prevent the other from running.
+func TestTimeStampEnabledAccept(t *testing.T) {
+	testCases := []struct {
+		name          string
+		cookieEnabled bool
+		wndScale      int
+		wndSize       uint16
+	}{
+		// When cookie is used window scaling is disabled.
+		{"CookieEnabled", true, -1, 0xffff},
+		// DefaultReceiveBufferSize is 1MB >> 5.
+		{"CookieDisabled", false, 5, 0x8000},
+	}
+	for _, tc := range testCases {
+		// t.Run executes the closure synchronously here, so tc may be
+		// captured directly.
+		t.Run(tc.name, func(t *testing.T) {
+			timeStampEnabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize)
+		})
+	}
+}
+
+// timeStampDisabledAccept accepts a connection whose SYN carries no timestamp
+// option, writes three bytes on the accepted endpoint and checks that the
+// outgoing data segment has no timestamp option and no option bytes.
+//
+// If cookieEnabled is true, TCPSynRcvdCountThresholdOption is set to 0 before
+// accepting. wndScale is passed to AcceptWithOptions and wndSize is the window
+// value expected in the outgoing segment.
+func timeStampDisabledAccept(t *testing.T, cookieEnabled bool, wndScale int, wndSize uint16) {
+	c := context.New(t, defaultMTU)
+	defer c.Cleanup()
+
+	if cookieEnabled {
+		if err := c.Stack().SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPSynRcvdCountThresholdOption(0)); err != nil {
+			t.Fatalf("setting TCPSynRcvdCountThresholdOption to 0 failed: %s", err)
+		}
+	}
+
+	t.Logf("Test w/ CookieEnabled = %v", cookieEnabled)
+	c.AcceptWithOptions(wndScale, header.TCPSynOptions{MSS: defaultIPv4MSS})
+
+	// Now send some data with the accepted connection endpoint and validate
+	// that no timestamp option is sent in the TCP segment.
+	data := []byte{1, 2, 3}
+	view := buffer.NewView(len(data))
+	copy(view, data)
+
+	if _, _, err := c.EP.Write(tcpip.SlicePayload(view), tcpip.WriteOptions{}); err != nil {
+		t.Fatalf("Unexpected error from Write: %s", err)
+	}
+
+	// Check that data is received and that the timestamp option is disabled
+	// when SYN cookies are enabled/disabled.
+	b := c.GetPacket()
+	checker.IPv4(t, b,
+		// No option bytes are expected: payload plus the minimum TCP header.
+		checker.PayloadLen(len(data)+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(context.TestPort),
+			checker.SeqNum(uint32(c.IRS)+1),
+			// NOTE(review): 790 is presumably the test peer's initial
+			// sequence number (789) plus one; confirm against
+			// AcceptWithOptions.
+			checker.AckNum(790),
+			checker.Window(wndSize),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+			checker.TCPTimestampChecker(false, 0, 0),
+		),
+	)
+}
+
+// TestTimeStampDisabledAccept tests that Timestamp option is not used when the
+// peer doesn't advertise it and connection is established with Accept().
+//
+// Each case runs as its own subtest so that a fatal failure in one case does
+// not prevent the other from running.
+func TestTimeStampDisabledAccept(t *testing.T) {
+	testCases := []struct {
+		name          string
+		cookieEnabled bool
+		wndScale      int
+		wndSize       uint16
+	}{
+		// When cookie is used window scaling is disabled.
+		{"CookieEnabled", true, -1, 0xffff},
+		// DefaultReceiveBufferSize is 1MB >> 5.
+		{"CookieDisabled", false, 5, 0x8000},
+	}
+	for _, tc := range testCases {
+		// t.Run executes the closure synchronously here, so tc may be
+		// captured directly.
+		t.Run(tc.name, func(t *testing.T) {
+			timeStampDisabledAccept(t, tc.cookieEnabled, tc.wndScale, tc.wndSize)
+		})
+	}
+}
+
+// TestSendGreaterThanMTUWithOptions runs testBrokenUpWrite on a connection
+// with the timestamp option enabled, over a link whose MTU is the minimum TCP
+// and IPv4 header sizes plus maxPayload bytes.
+func TestSendGreaterThanMTUWithOptions(t *testing.T) {
+	const maxPayload = 100
+
+	mtu := uint32(header.TCPMinimumSize + header.IPv4MinimumSize + maxPayload)
+	c := context.New(t, mtu)
+	defer c.Cleanup()
+
+	createConnectedWithTimestampOption(c)
+	testBrokenUpWrite(t, c, maxPayload)
+}
+
+// TestSegmentNotDroppedWhenTimestampMissing connects with the timestamp option
+// enabled, then sends a data segment that carries no TCP options at all. It
+// verifies that the stack's DroppedPackets counter is unchanged and that the
+// payload can be read from the endpoint.
+func TestSegmentNotDroppedWhenTimestampMissing(t *testing.T) {
+	const maxPayload = 100
+	c := context.New(t, uint32(header.TCPMinimumSize+header.IPv4MinimumSize+maxPayload))
+	defer c.Cleanup()
+
+	rep := createConnectedWithTimestampOption(c)
+
+	// Register for read.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&we, waiter.EventIn)
+	defer c.WQ.EventUnregister(&we)
+
+	// Snapshot the counter so the check below is relative to this test.
+	droppedPacketsStat := c.Stack().Stats().DroppedPackets
+	droppedPackets := droppedPacketsStat.Value()
+	data := []byte{1, 2, 3}
+	// Send a packet with no TCP options/timestamp.
+	rep.SendPacket(data, nil)
+
+	select {
+	case <-ch:
+	case <-time.After(1 * time.Second):
+		t.Fatalf("Timed out waiting for data to arrive")
+	}
+
+	// Assert that DroppedPackets was not incremented.
+	if got, want := droppedPacketsStat.Value(), droppedPackets; got != want {
+		t.Fatalf("incorrect number of dropped packets, got: %v, want: %v", got, want)
+	}
+
+	// Issue a read and we should get the data back.
+	got, _, err := c.EP.Read(nil)
+	if err != nil {
+		t.Fatalf("Unexpected error from Read: %v", err)
+	}
+	// bytes.Equal states the intent (equality, not ordering) directly.
+	if want := data; !bytes.Equal(got, want) {
+		t.Fatalf("Data is different: got: %v, want: %v", got, want)
+	}
+}
diff --git a/pkg/tcpip/transport/tcp/testing/context/BUILD b/pkg/tcpip/transport/tcp/testing/context/BUILD
new file mode 100644
index 000000000..ce6a2c31d
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/testing/context/BUILD
@@ -0,0 +1,26 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+# Test-only (testonly = 1) helper library built from context.go; publicly
+# visible so that tests in other packages can depend on it.
+go_library(
+    name = "context",
+    testonly = 1,
+    srcs = ["context.go"],
+    visibility = [
+        "//visibility:public",
+    ],
+    deps = [
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/seqnum",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/tcp",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/transport/tcp/testing/context/context.go b/pkg/tcpip/transport/tcp/testing/context/context.go
new file mode 100644
index 000000000..06fde2a79
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/testing/context/context.go
@@ -0,0 +1,1121 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package context provides a test context for use in tcp tests. It also
+// provides helper methods to assert/check certain behaviours.
+package context
+
+import (
+	"bytes"
+	"context"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// StackAddr is the IPv4 address assigned to the stack (10.0.0.1).
+	StackAddr = "\x0a\x00\x00\x01"
+
+	// StackPort is used as the listening port in tests for passive
+	// connects.
+	StackPort = 1234
+
+	// TestAddr is the source address for packets sent to the stack via the
+	// link layer endpoint (10.0.0.2).
+	TestAddr = "\x0a\x00\x00\x02"
+
+	// TestPort is the TCP port used for packets sent to the stack
+	// via the link layer endpoint.
+	TestPort = 4096
+
+	// StackV6Addr is the IPv6 address assigned to the stack (::1).
+	StackV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+
+	// TestV6Addr is the source address for packets sent to the stack via
+	// the link layer endpoint (::2).
+	TestV6Addr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+
+	// StackV4MappedAddr is StackAddr as a mapped v6 address
+	// (::ffff:10.0.0.1).
+	StackV4MappedAddr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + StackAddr
+
+	// TestV4MappedAddr is TestAddr as a mapped v6 address
+	// (::ffff:10.0.0.2).
+	TestV4MappedAddr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff" + TestAddr
+
+	// V4MappedWildcardAddr is the mapped v6 representation of 0.0.0.0.
+	V4MappedWildcardAddr = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff\x00\x00\x00\x00"
+
+	// testInitialSequenceNumber is the initial sequence number sent in packets that
+	// are sent in response to a SYN or in the initial SYN sent to the stack.
+	testInitialSequenceNumber = 789
+)
+
+// Headers is used to represent the TCP header fields when building a
+// new packet.
+type Headers struct {
+	// SrcPort holds the src port value to be used in the packet.
+	SrcPort uint16
+
+	// DstPort holds the destination port value to be used in the packet.
+	DstPort uint16
+
+	// SeqNum is the value of the sequence number field in the TCP header.
+	SeqNum seqnum.Value
+
+	// AckNum represents the acknowledgement number field in the TCP header.
+	AckNum seqnum.Value
+
+	// Flags are the TCP flags in the TCP header. The value is converted to
+	// uint8 when the header is encoded.
+	Flags int
+
+	// RcvWnd is the window to be advertised in the ReceiveWindow field of
+	// the TCP header. The value is converted to uint16 when the header is
+	// encoded.
+	RcvWnd seqnum.Size
+
+	// TCPOpts holds the options to be sent in the option field of the TCP
+	// header. The bytes are copied verbatim between the fixed TCP header
+	// and the payload when an IPv4 segment is built.
+	TCPOpts []byte
+}
+
+// Context provides an initialized Network stack and a link layer endpoint
+// for use in TCP tests.
+type Context struct {
+	// t is the test that owns this context; failures are reported through it.
+	t *testing.T
+	// linkEP is the channel endpoint used to inject packets into the stack
+	// and to read the packets the stack sends.
+	linkEP *channel.Endpoint
+	// s is the network stack under test.
+	s *stack.Stack
+
+	// IRS holds the initial sequence number in the SYN sent by endpoint in
+	// case of an active connect or the sequence number sent by the endpoint
+	// in the SYN-ACK sent in response to a SYN when listening in passive
+	// mode.
+	IRS seqnum.Value
+
+	// Port holds the port bound by EP below in case of an active connect or
+	// the listening port number in case of a passive connect.
+	Port uint16
+
+	// EP is the test endpoint in the stack owned by this context. This endpoint
+	// is used in various tests to either initiate an active connect or is used
+	// as a passive listening endpoint to accept inbound connections.
+	EP tcpip.Endpoint
+
+	// WQ is the wait queue associated with EP and is used to block for events
+	// on EP.
+	WQ waiter.Queue
+
+	// TimeStampEnabled is true if EP is connected with the timestamp option
+	// enabled.
+	TimeStampEnabled bool
+
+	// WindowScale is the expected window scale in SYN packets sent by
+	// the stack.
+	WindowScale uint8
+}
+
+// New allocates and initializes a test context containing a new
+// stack and a link-layer endpoint.
+//
+// The stack supports IPv4, IPv6 and TCP and gets two NICs, each backed by a
+// channel endpoint with the given mtu. StackAddr and StackV6Addr are assigned
+// to NIC 1, and the default IPv4 and IPv6 routes point at NIC 1. Only the
+// channel endpoint behind NIC 1 is retained in the returned Context. Any setup
+// failure aborts the test via t.Fatalf.
+func New(t *testing.T, mtu uint32) *Context {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{tcp.NewProtocol()},
+	})
+
+	// Allow minimum send/receive buffer sizes to be 1 during tests.
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SendBufferSizeOption{Min: 1, Default: tcp.DefaultSendBufferSize, Max: 10 * tcp.DefaultSendBufferSize}); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.ReceiveBufferSizeOption{Min: 1, Default: tcp.DefaultReceiveBufferSize, Max: 10 * tcp.DefaultReceiveBufferSize}); err != nil {
+		t.Fatalf("SetTransportProtocolOption failed: %s", err)
+	}
+
+	// Increase minimum RTO in tests to avoid test flakes due to early
+	// retransmit in case the test executors are overloaded and cause timers
+	// to fire earlier than expected.
+	if err := s.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.TCPMinRTOOption(3*time.Second)); err != nil {
+		t.Fatalf("failed to set stack-wide minRTO: %s", err)
+	}
+
+	// Some of the congestion control tests send up to 640 packets, so we
+	// set the channel size to 1000.
+	ep := channel.New(1000, mtu, "")
+	wep := stack.LinkEndpoint(ep)
+	// In verbose mode wrap the endpoint in a sniffer.
+	if testing.Verbose() {
+		wep = sniffer.New(ep)
+	}
+	opts := stack.NICOptions{Name: "nic1"}
+	if err := s.CreateNICWithOptions(1, wep, opts); err != nil {
+		t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
+	}
+	// The second NIC gets its own channel endpoint. In verbose mode a fresh
+	// channel endpoint is created for the sniffer and the one allocated on
+	// the line below is discarded.
+	wep2 := stack.LinkEndpoint(channel.New(1000, mtu, ""))
+	if testing.Verbose() {
+		wep2 = sniffer.New(channel.New(1000, mtu, ""))
+	}
+	opts2 := stack.NICOptions{Name: "nic2"}
+	if err := s.CreateNICWithOptions(2, wep2, opts2); err != nil {
+		t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts2, err)
+	}
+
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, StackAddr); err != nil {
+		t.Fatalf("AddAddress failed: %v", err)
+	}
+
+	if err := s.AddAddress(1, ipv6.ProtocolNumber, StackV6Addr); err != nil {
+		t.Fatalf("AddAddress failed: %v", err)
+	}
+
+	// Route all IPv4 and IPv6 traffic through NIC 1.
+	s.SetRouteTable([]tcpip.Route{
+		{
+			Destination: header.IPv4EmptySubnet,
+			NIC:         1,
+		},
+		{
+			Destination: header.IPv6EmptySubnet,
+			NIC:         1,
+		},
+	})
+
+	return &Context{
+		t:           t,
+		s:           s,
+		linkEP:      ep,
+		WindowScale: uint8(tcp.FindWndScale(tcp.DefaultReceiveBufferSize)),
+	}
+}
+
+// Cleanup closes the context's endpoint, if one has been created, and then
+// closes the stack.
+func (c *Context) Cleanup() {
+	if ep := c.EP; ep != nil {
+		ep.Close()
+	}
+	c.s.Close()
+}
+
+// Stack returns the stack owned by the Context. The same *stack.Stack is
+// returned on every call.
+func (c *Context) Stack() *stack.Stack {
+	return c.s
+}
+
+// CheckNoPacketTimeout waits up to wait for the stack to emit a packet on the
+// link endpoint and fails the test with errMsg if one shows up.
+func (c *Context) CheckNoPacketTimeout(errMsg string, wait time.Duration) {
+	c.t.Helper()
+
+	deadlineCtx, cancel := context.WithTimeout(context.Background(), wait)
+	defer cancel()
+
+	_, received := c.linkEP.ReadContext(deadlineCtx)
+	if !received {
+		return
+	}
+	c.t.Fatal(errMsg)
+}
+
+// CheckNoPacket verifies that no packet is received for 1 second. If a packet
+// does arrive, the test fails with errMsg.
+func (c *Context) CheckNoPacket(errMsg string) {
+	// Mark this wrapper as a helper so a failure is attributed to the
+	// calling test rather than to this line.
+	c.t.Helper()
+
+	c.CheckNoPacketTimeout(errMsg, 1*time.Second)
+}
+
+// GetPacket reads a packet from the link layer endpoint and verifies
+// that it is an IPv4 packet with the expected source and destination
+// addresses. It will fail with an error if no packet is received for
+// 5 seconds.
+//
+// The returned slice is a freshly allocated copy holding the packet's header
+// bytes followed by its data bytes.
+func (c *Context) GetPacket() []byte {
+	c.t.Helper()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	p, ok := c.linkEP.ReadContext(ctx)
+	if !ok {
+		c.t.Fatalf("Packet wasn't written out")
+		return nil
+	}
+
+	if p.Proto != ipv4.ProtocolNumber {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
+	}
+
+	hdr := p.Pkt.Header.View()
+	// The three-index slice caps hdr's capacity at its length, so append
+	// must allocate instead of writing into hdr's backing array.
+	b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
+
+	// When GSO metadata is present its L3 header length must match a
+	// minimum-size IPv4 header; a mismatch is reported but is not fatal.
+	if p.GSO != nil && p.GSO.L3HdrLen != header.IPv4MinimumSize {
+		c.t.Errorf("L3HdrLen %v (expected %v)", p.GSO.L3HdrLen, header.IPv4MinimumSize)
+	}
+
+	checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr))
+	return b
+}
+
+// GetPacketNonBlocking reads a packet from the link layer endpoint
+// and verifies that it is an IPv4 packet with the expected source
+// and destination address. If no packet is available it will return
+// nil immediately.
+//
+// The returned slice holds the packet's header bytes followed by its data
+// bytes.
+func (c *Context) GetPacketNonBlocking() []byte {
+	c.t.Helper()
+
+	p, ok := c.linkEP.Read()
+	if !ok {
+		return nil
+	}
+
+	if p.Proto != ipv4.ProtocolNumber {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, ipv4.ProtocolNumber)
+	}
+
+	hdr := p.Pkt.Header.View()
+	// The three-index slice caps hdr's capacity at its length, so append
+	// must allocate instead of writing into hdr's backing array.
+	b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...)
+
+	checker.IPv4(c.t, b, checker.SrcAddr(StackAddr), checker.DstAddr(TestAddr))
+	return b
+}
+
+// SendICMPPacket builds and sends an ICMPv4 packet via the link layer endpoint.
+//
+// typ and code fill the ICMP type and code fields. p1 is copied into the ICMP
+// header starting at byte offset 4, and p2 is copied in starting at
+// header.ICMPv4PayloadOffset. The packet is sent from TestAddr to StackAddr
+// and is truncated to maxTotalSize bytes if it would otherwise be longer.
+// This function does not set the ICMP checksum.
+func (c *Context) SendICMPPacket(typ header.ICMPv4Type, code uint8, p1, p2 []byte, maxTotalSize int) {
+	// Allocate a buffer for the data and headers.
+	buf := buffer.NewView(header.IPv4MinimumSize + header.ICMPv4PayloadOffset + len(p2))
+	if len(buf) > maxTotalSize {
+		buf = buf[:maxTotalSize]
+	}
+
+	// Initialize the IP header; TotalLength reflects any truncation above.
+	ip := header.IPv4(buf)
+	ip.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TotalLength: uint16(len(buf)),
+		TTL:         65,
+		Protocol:    uint8(header.ICMPv4ProtocolNumber),
+		SrcAddr:     TestAddr,
+		DstAddr:     StackAddr,
+	})
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	// Fill in the ICMP header and payload. copy silently stops at the end
+	// of the (possibly truncated) buffer.
+	icmp := header.ICMPv4(buf[header.IPv4MinimumSize:])
+	icmp.SetType(typ)
+	icmp.SetCode(code)
+	const icmpv4VariableHeaderOffset = 4
+	copy(icmp[icmpv4VariableHeaderOffset:], p1)
+	copy(icmp[header.ICMPv4PayloadOffset:], p2)
+
+	// Inject packet.
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+}
+
+// BuildSegment builds a TCP segment based on the given Headers and payload.
+// The enclosing IPv4 packet is addressed from TestAddr to StackAddr.
+func (c *Context) BuildSegment(payload []byte, h *Headers) buffer.VectorisedView {
+	return c.BuildSegmentWithAddrs(payload, h, TestAddr, StackAddr)
+}
+
+// BuildSegmentWithAddrs builds a TCP segment based on the given Headers,
+// payload and source and destination IPv4 addresses.
+//
+// The returned view holds a complete IPv4 packet: a minimum-size IPv4 header,
+// the TCP header, h.TCPOpts, and then payload. Both the IPv4 and TCP checksums
+// are filled in. The packet is only built, not sent.
+func (c *Context) BuildSegmentWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) buffer.VectorisedView {
+	// Allocate a buffer for data and headers.
+	buf := buffer.NewView(header.TCPMinimumSize + header.IPv4MinimumSize + len(h.TCPOpts) + len(payload))
+	// The payload occupies the tail of the buffer and the options sit
+	// immediately before it.
+	copy(buf[len(buf)-len(payload):], payload)
+	copy(buf[len(buf)-len(payload)-len(h.TCPOpts):], h.TCPOpts)
+
+	// Initialize the IP header.
+	ip := header.IPv4(buf)
+	ip.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TotalLength: uint16(len(buf)),
+		TTL:         65,
+		Protocol:    uint8(tcp.ProtocolNumber),
+		SrcAddr:     src,
+		DstAddr:     dst,
+	})
+	ip.SetChecksum(^ip.CalculateChecksum())
+
+	// Initialize the TCP header.
+	t := header.TCP(buf[header.IPv4MinimumSize:])
+	t.Encode(&header.TCPFields{
+		SrcPort:    h.SrcPort,
+		DstPort:    h.DstPort,
+		SeqNum:     uint32(h.SeqNum),
+		AckNum:     uint32(h.AckNum),
+		DataOffset: uint8(header.TCPMinimumSize + len(h.TCPOpts)),
+		Flags:      uint8(h.Flags),
+		WindowSize: uint16(h.RcvWnd),
+	})
+
+	// Calculate the TCP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(tcp.ProtocolNumber, src, dst, uint16(len(t)))
+
+	// Calculate the TCP checksum and set it.
+	xsum = header.Checksum(payload, xsum)
+	t.SetChecksum(^t.CalculateChecksum(xsum))
+
+	// Hand the finished packet back to the caller; nothing is injected here.
+	return buf.ToVectorisedView()
+}
+
+// SendSegment injects an already-built segment, held in a
+// buffer.VectorisedView, into the stack as an inbound IPv4 packet.
+func (c *Context) SendSegment(s buffer.VectorisedView) {
+	pkt := &stack.PacketBuffer{Data: s}
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, pkt)
+}
+
+// SendPacket builds a TCP segment from payload and the header fields in h,
+// wraps it in an IPv4 packet and injects it via the link layer endpoint.
+func (c *Context) SendPacket(payload []byte, h *Headers) {
+	segment := c.BuildSegment(payload, h)
+	pkt := &stack.PacketBuffer{Data: segment}
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, pkt)
+}
+
+// SendPacketWithAddrs builds a TCP segment from payload and the header fields
+// in h, wraps it in an IPv4 packet going from src to dst, and injects it via
+// the link layer endpoint.
+func (c *Context) SendPacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
+	segment := c.BuildSegmentWithAddrs(payload, h, src, dst)
+	pkt := &stack.PacketBuffer{Data: segment}
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, pkt)
+}
+
+// SendAck sends an ACK packet with sequence number seq that acknowledges
+// bytesReceived bytes of data from the endpoint. No SACK blocks are included.
+func (c *Context) SendAck(seq seqnum.Value, bytesReceived int) {
+	c.SendAckWithSACK(seq, bytesReceived, nil)
+}
+
+// SendAckWithSACK sends an ACK packet which includes the sackBlocks specified.
+//
+// The packet goes from TestPort to c.Port with sequence number seq, an
+// acknowledgement number of c.IRS + 1 + bytesReceived and a receive window of
+// 30000. If sackBlocks is empty, no options are sent.
+func (c *Context) SendAckWithSACK(seq seqnum.Value, bytesReceived int, sackBlocks []header.SACKBlock) {
+	// Scratch space for the encoded options; presumably sized to the
+	// 40-byte maximum TCP option space.
+	options := make([]byte, 40)
+	offset := 0
+	if len(sackBlocks) > 0 {
+		// Two NOPs precede the SACK option.
+		offset += header.EncodeNOP(options[offset:])
+		offset += header.EncodeNOP(options[offset:])
+		offset += header.EncodeSACKBlocks(sackBlocks, options[offset:])
+	}
+
+	c.SendPacket(nil, &Headers{
+		SrcPort: TestPort,
+		DstPort: c.Port,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  seq,
+		AckNum:  c.IRS.Add(1 + seqnum.Size(bytesReceived)),
+		RcvWnd:  30000,
+		// Only the bytes actually encoded above are sent.
+		TCPOpts: options[:offset],
+	})
+}
+
+// ReceiveAndCheckPacket reads a packet from the link layer endpoint and
+// verifies that the payload of the packet matches the slice of data
+// indicated by offset & size. The packet is expected to carry no TCP options.
+func (c *Context) ReceiveAndCheckPacket(data []byte, offset, size int) {
+	c.t.Helper()
+
+	c.ReceiveAndCheckPacketWithOptions(data, offset, size, 0)
+}
+
+// ReceiveAndCheckPacketWithOptions reads a packet from the link layer endpoint
+// and verifies that the payload of the packet matches the slice of data
+// indicated by offset & size. It skips optlen bytes of TCP options in addition
+// to the IP and TCP headers when comparing the data.
+//
+// The packet must be addressed to TestPort, carry sequence number
+// c.IRS + 1 + offset, acknowledge testInitialSequenceNumber + 1, and have the
+// ACK flag set (PSH is ignored).
+func (c *Context) ReceiveAndCheckPacketWithOptions(data []byte, offset, size, optlen int) {
+	c.t.Helper()
+
+	b := c.GetPacket()
+	checker.IPv4(c.t, b,
+		checker.PayloadLen(size+header.TCPMinimumSize+optlen),
+		checker.TCP(
+			checker.DstPort(TestPort),
+			checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
+			checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Compare the expected slice against everything after the IP header,
+	// the fixed TCP header and the options.
+	pdata := data[offset:][:size]
+	if p := b[header.IPv4MinimumSize+header.TCPMinimumSize+optlen:]; !bytes.Equal(pdata, p) {
+		c.t.Fatalf("Data is different: expected %v, got %v", pdata, p)
+	}
+}
+
+// ReceiveNonBlockingAndCheckPacket reads a packet from the link layer endpoint
+// without blocking and verifies that the payload of the packet matches the
+// slice of data indicated by offset & size. It returns true if a packet was
+// received and processed, and false if no packet was available.
+//
+// The packet must be addressed to TestPort, carry sequence number
+// c.IRS + 1 + offset, acknowledge testInitialSequenceNumber + 1, have the ACK
+// flag set (PSH is ignored) and carry no TCP options.
+func (c *Context) ReceiveNonBlockingAndCheckPacket(data []byte, offset, size int) bool {
+	c.t.Helper()
+
+	b := c.GetPacketNonBlocking()
+	if b == nil {
+		return false
+	}
+	checker.IPv4(c.t, b,
+		checker.PayloadLen(size+header.TCPMinimumSize),
+		checker.TCP(
+			checker.DstPort(TestPort),
+			checker.SeqNum(uint32(c.IRS.Add(seqnum.Size(1+offset)))),
+			checker.AckNum(uint32(seqnum.Value(testInitialSequenceNumber).Add(1))),
+			checker.TCPFlagsMatch(header.TCPFlagAck, ^uint8(header.TCPFlagPsh)),
+		),
+	)
+
+	// Compare the expected slice against everything after the IP header
+	// and the fixed TCP header.
+	pdata := data[offset:][:size]
+	if p := b[header.IPv4MinimumSize+header.TCPMinimumSize:]; !bytes.Equal(pdata, p) {
+		c.t.Fatalf("Data is different: expected %v, got %v", pdata, p)
+	}
+	return true
+}
+
+// CreateV6Endpoint creates and initializes c.EP as an IPv6 TCP endpoint bound
+// to the wait queue c.WQ, and sets its V6OnlyOption to v6only. If v6only is
+// true the endpoint is an IPv6-only endpoint instead of a default dual stack
+// socket. Any failure aborts the test.
+func (c *Context) CreateV6Endpoint(v6only bool) {
+	// Attribute failures to the calling test rather than to this helper.
+	c.t.Helper()
+
+	var err *tcpip.Error
+	c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv6.ProtocolNumber, &c.WQ)
+	if err != nil {
+		c.t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	if err := c.EP.SetSockOptBool(tcpip.V6OnlyOption, v6only); err != nil {
+		c.t.Fatalf("SetSockOptBool(V6OnlyOption, %t) failed: %v", v6only, err)
+	}
+}
+
+// GetV6Packet waits up to 2 seconds for a packet on the context's link layer
+// endpoint, asserts that it is an IPv6 packet going from StackV6Addr to
+// TestV6Addr, and returns its header and data bytes as one contiguous slice.
+func (c *Context) GetV6Packet() []byte {
+	c.t.Helper()
+
+	readCtx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+
+	pktInfo, ok := c.linkEP.ReadContext(readCtx)
+	if !ok {
+		c.t.Fatalf("Packet wasn't written out")
+		return nil
+	}
+	if pktInfo.Proto != ipv6.ProtocolNumber {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", pktInfo.Proto, ipv6.ProtocolNumber)
+	}
+
+	// Flatten the header and the data into a single freshly allocated slice.
+	hdrLen := pktInfo.Pkt.Header.UsedLength()
+	b := make([]byte, hdrLen+pktInfo.Pkt.Data.Size())
+	copy(b, pktInfo.Pkt.Header.View())
+	copy(b[hdrLen:], pktInfo.Pkt.Data.ToView())
+
+	checker.IPv6(c.t, b, checker.SrcAddr(StackV6Addr), checker.DstAddr(TestV6Addr))
+	return b
+}
+
+// SendV6Packet builds and sends an IPv6 Packet via the link layer endpoint of
+// the context. The packet is addressed from TestV6Addr to StackV6Addr.
+func (c *Context) SendV6Packet(payload []byte, h *Headers) {
+	c.SendV6PacketWithAddrs(payload, h, TestV6Addr, StackV6Addr)
+}
+
+// SendV6PacketWithAddrs builds and sends an IPv6 Packet via the link layer
+// endpoint of the context using the provided source and destination IPv6
+// addresses.
+//
+// The TCP header is always the minimum size: h.TCPOpts is not read here, so
+// any options set in h are not sent.
+func (c *Context) SendV6PacketWithAddrs(payload []byte, h *Headers, src, dst tcpip.Address) {
+	// Allocate a buffer for data and headers.
+	buf := buffer.NewView(header.TCPMinimumSize + header.IPv6MinimumSize + len(payload))
+	// The payload occupies the tail of the buffer.
+	copy(buf[len(buf)-len(payload):], payload)
+
+	// Initialize the IP header.
+	ip := header.IPv6(buf)
+	ip.Encode(&header.IPv6Fields{
+		PayloadLength: uint16(header.TCPMinimumSize + len(payload)),
+		NextHeader:    uint8(tcp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       src,
+		DstAddr:       dst,
+	})
+
+	// Initialize the TCP header.
+	t := header.TCP(buf[header.IPv6MinimumSize:])
+	t.Encode(&header.TCPFields{
+		SrcPort:    h.SrcPort,
+		DstPort:    h.DstPort,
+		SeqNum:     uint32(h.SeqNum),
+		AckNum:     uint32(h.AckNum),
+		DataOffset: header.TCPMinimumSize,
+		Flags:      uint8(h.Flags),
+		WindowSize: uint16(h.RcvWnd),
+	})
+
+	// Calculate the TCP pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(tcp.ProtocolNumber, src, dst, uint16(len(t)))
+
+	// Calculate the TCP checksum and set it.
+	xsum = header.Checksum(payload, xsum)
+	t.SetChecksum(^t.CalculateChecksum(xsum))
+
+	// Inject packet.
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+}
+
+// CreateConnected creates a connected TCP endpoint. It forwards iss, rcvWnd
+// and epRcvBuf to CreateConnectedWithRawOptions and passes no TCP options.
+func (c *Context) CreateConnected(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf int) {
+	c.CreateConnectedWithRawOptions(iss, rcvWnd, epRcvBuf, nil)
+}
+
+// Connect performs the 3-way handshake for c.EP with the provided Initial
+// Sequence Number (iss) and receive window(rcvWnd) and any options if
+// specified.
+//
+// The test side plays the peer at TestAddr:TestPort: it reads the endpoint's
+// SYN, replies with a SYN-ACK carrying iss, rcvWnd and options, and checks the
+// final ACK. On success c.IRS holds the sequence number of the endpoint's SYN
+// and c.Port holds the endpoint's source port. Any deviation aborts the test.
+//
+// PreCondition: c.EP must already be created.
+func (c *Context) Connect(iss seqnum.Value, rcvWnd seqnum.Size, options []byte) {
+	c.t.Helper()
+
+	// Start connection attempt.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	// Connect is expected to return ErrConnectStarted; any other result,
+	// including nil, is a failure.
+	if err := c.EP.Connect(tcpip.FullAddress{Addr: TestAddr, Port: TestPort}); err != tcpip.ErrConnectStarted {
+		c.t.Fatalf("Unexpected return value from Connect: %v", err)
+	}
+
+	// Receive SYN packet.
+	b := c.GetPacket()
+	checker.IPv4(c.t, b,
+		checker.TCP(
+			checker.DstPort(TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+		),
+	)
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+		c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Record the sequence number the endpoint chose for its SYN.
+	tcpHdr := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(tcpHdr.SequenceNumber())
+
+	// Reply with a SYN-ACK, swapping the ports seen in the SYN.
+	c.SendPacket(nil, &Headers{
+		SrcPort: tcpHdr.DestinationPort(),
+		DstPort: tcpHdr.SourcePort(),
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  rcvWnd,
+		TCPOpts: options,
+	})
+
+	// Receive ACK packet.
+	checker.IPv4(c.t, c.GetPacket(),
+		checker.TCP(
+			checker.DstPort(TestPort),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.SeqNum(uint32(c.IRS)+1),
+			checker.AckNum(uint32(iss)+1),
+		),
+	)
+
+	// Wait for connection to be established.
+	select {
+	case <-notifyCh:
+		if err := c.EP.GetSockOpt(tcpip.ErrorOption{}); err != nil {
+			c.t.Fatalf("Unexpected error when connecting: %v", err)
+		}
+	case <-time.After(1 * time.Second):
+		c.t.Fatalf("Timed out waiting for connection")
+	}
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+		c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Remember the endpoint's local port for later packets.
+	c.Port = tcpHdr.SourcePort()
+}
+
+// Create creates the IPv4 TCP endpoint c.EP and, unless epRcvBuf is -1, sets its receive buffer size.
+func (c *Context) Create(epRcvBuf int) {
+	// The new endpoint is attached to the context's waiter queue.
+	var err *tcpip.Error
+	c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		c.t.Fatalf("NewEndpoint failed: %v", err)
+	}
+
+	if epRcvBuf != -1 {
+		if err := c.EP.SetSockOptInt(tcpip.ReceiveBufferSizeOption, epRcvBuf); err != nil {
+			c.t.Fatalf("SetSockOptInt failed: %v", err)
+		}
+	}
+}
+
+// CreateConnectedWithRawOptions creates c.EP via Create and connects it via
+// Connect, which carries the given raw option bytes in the SYN-ACK it injects.
+//
+// The receive buffer of the endpoint is set to the value in epRcvBuf,
+// unless epRcvBuf is -1.
+func (c *Context) CreateConnectedWithRawOptions(iss seqnum.Value, rcvWnd seqnum.Size, epRcvBuf int, options []byte) {
+	c.Create(epRcvBuf)
+	c.Connect(iss, rcvWnd, options)
+}
+
+// RawEndpoint is a small wrapper around the state of the remote end of a TCP
+// connection. It makes sending data and ACK packets easy while still allowing
+// sequence numbers and timestamp values to be manipulated as needed.
+type RawEndpoint struct {
+	C          *Context
+	SrcPort    uint16
+	DstPort    uint16
+	Flags      int
+	NextSeqNum seqnum.Value
+	AckNum     seqnum.Value
+	WndSize    seqnum.Size
+	RecentTS   uint32 // RecentTS stores the latest timestamp to echo back.
+	TSVal      uint32 // TSVal stores the last timestamp sent by this endpoint.
+
+	// SACKPermitted is true if the SACKPermitted option was negotiated.
+	SACKPermitted bool
+}
+
+// SendPacketWithTS records tsVal as the endpoint's latest TSVal and sends
+// payload with a timestamp option carrying tsVal and r.RecentTS as the echo.
+func (r *RawEndpoint) SendPacketWithTS(payload []byte, tsVal uint32) {
+	opt := [12]byte{header.TCPOptionNOP, header.TCPOptionNOP} // two NOPs, then the TS option
+	r.TSVal = tsVal
+	header.EncodeTSOption(tsVal, r.RecentTS, opt[2:])
+	r.SendPacket(payload, opt[:])
+}
+
+// SendPacket sends payload using headers built from r's state, then advances NextSeqNum.
+func (r *RawEndpoint) SendPacket(payload []byte, opts []byte) {
+	r.C.SendPacket(payload, &Headers{
+		SrcPort: r.SrcPort,
+		DstPort: r.DstPort,
+		Flags:   r.Flags,
+		SeqNum:  r.NextSeqNum,
+		AckNum:  r.AckNum,
+		RcvWnd:  r.WndSize,
+		TCPOpts: opts,
+	})
+	sent := seqnum.Size(len(payload))
+	r.NextSeqNum = r.NextSeqNum.Add(sent)
+}
+
+// VerifyACKWithTS reads the next packet and verifies that it is an ACK whose
+// tsEcr field matches the provided tsVal.
+func (r *RawEndpoint) VerifyACKWithTS(tsVal uint32) {
+	// The ACK must be addressed to this endpoint, carry the sequence and
+	// acknowledgement numbers it expects, and echo tsVal back in tsEcr.
+	pkt := r.C.GetPacket()
+	checker.IPv4(r.C.t, pkt,
+		checker.TCP(
+			checker.DstPort(r.SrcPort),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.SeqNum(uint32(r.AckNum)),
+			checker.AckNum(uint32(r.NextSeqNum)),
+			checker.TCPTimestampChecker(true, 0, tsVal),
+		),
+	)
+	// Remember the TSVal carried by the ACK as RecentTS; SendPacketWithTS
+	// echoes it in the next timestamp option it sends.
+	r.RecentTS = header.TCP(header.IPv4(pkt).Payload()).ParsedOptions().TSVal
+}
+
+// VerifyACKRcvWnd reads the next packet and verifies that it is an ACK that
+// advertises a window equal to the provided rcvWnd.
+func (r *RawEndpoint) VerifyACKRcvWnd(rcvWnd uint16) {
+	wantSeq, wantAck := uint32(r.AckNum), uint32(r.NextSeqNum)
+	checker.IPv4(r.C.t, r.C.GetPacket(),
+		checker.TCP(
+			checker.DstPort(r.SrcPort),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.SeqNum(wantSeq),
+			checker.AckNum(wantAck),
+			checker.Window(rcvWnd),
+		),
+	)
+}
+
+// VerifyACKNoSACK verifies that the next ACK does not contain a SACK block.
+func (r *RawEndpoint) VerifyACKNoSACK() {
+	r.VerifyACKHasSACK(nil) // nil: no SACK blocks are expected
+}
+
+// VerifyACKHasSACK verifies that the ACK contains the specified SACKBlocks.
+func (r *RawEndpoint) VerifyACKHasSACK(sackBlocks []header.SACKBlock) {
+	// Read the next packet and verify that it is an ACK whose TCP options
+	// are accepted by the SACK block checker for sackBlocks.
+	wantSeq, wantAck := uint32(r.AckNum), uint32(r.NextSeqNum)
+	checker.IPv4(r.C.t, r.C.GetPacket(),
+		checker.TCP(
+			checker.DstPort(r.SrcPort),
+			checker.TCPFlags(header.TCPFlagAck),
+			checker.SeqNum(wantSeq),
+			checker.AckNum(wantAck),
+			checker.TCPSACKBlockChecker(sackBlocks),
+		),
+	)
+}
+
+// CreateConnectedWithOptions creates and connects c.EP, answering its SYN with
+// a SYN-ACK carrying the options enabled in wantOptions, and returns a
+// RawEndpoint which represents the other end of the connection.
+//
+// It also verifies that the ACK to the SYN-ACK echoes wantOptions.TSVal when
+// the timestamp option was requested and carries no timestamp otherwise.
+func (c *Context) CreateConnectedWithOptions(wantOptions header.TCPSynOptions) *RawEndpoint {
+	var err *tcpip.Error
+	c.EP, err = c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, &c.WQ)
+	if err != nil {
+		c.t.Fatalf("c.s.NewEndpoint(tcp, ipv4...) = %v", err)
+	}
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateInitial; got != want {
+		c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Register for EventOut before connecting, then start the attempt.
+	waitEntry, notifyCh := waiter.NewChannelEntry(nil)
+	c.WQ.EventRegister(&waitEntry, waiter.EventOut)
+	defer c.WQ.EventUnregister(&waitEntry)
+
+	testFullAddr := tcpip.FullAddress{Addr: TestAddr, Port: TestPort}
+	err = c.EP.Connect(testFullAddr)
+	if err != tcpip.ErrConnectStarted {
+		c.t.Fatalf("c.ep.Connect(%v) = %v", testFullAddr, err)
+	}
+	// Receive SYN packet.
+	b := c.GetPacket()
+	// Validate the SYN's options: the MSS derived from the link MTU, a
+	// timestamp, the context's window scale and the stack's SACK setting.
+	mss := uint16(c.linkEP.MTU() - header.IPv4MinimumSize - header.TCPMinimumSize)
+
+	checker.IPv4(c.t, b,
+		checker.TCP(
+			checker.DstPort(TestPort),
+			checker.TCPFlags(header.TCPFlagSyn),
+			checker.TCPSynOptions(header.TCPSynOptions{
+				MSS:           mss,
+				TS:            true,
+				WS:            int(c.WindowScale),
+				SACKPermitted: c.SACKEnabled(),
+			}),
+		),
+	)
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateSynSent; got != want {
+		c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	tcpSeg := header.TCP(header.IPv4(b).Payload())
+	synOptions := header.ParseSynOptions(tcpSeg.Options(), false)
+
+	// Build the options to be sent in the SYN-ACK; WS is skipped when -1.
+	synAckOptions := make([]byte, header.TCPOptionsMaximumSize)
+	offset := 0
+	if wantOptions.WS != -1 {
+		offset += header.EncodeWSOption(wantOptions.WS, synAckOptions[offset:])
+	}
+	if wantOptions.TS {
+		offset += header.EncodeTSOption(wantOptions.TSVal, synOptions.TSVal, synAckOptions[offset:])
+	}
+	if wantOptions.SACKPermitted {
+		offset += header.EncodeSACKPermittedOption(synAckOptions[offset:])
+	}
+
+	offset += header.AddTCPOptionPadding(synAckOptions, offset)
+
+	// Build SYN-ACK.
+	c.IRS = seqnum.Value(tcpSeg.SequenceNumber())
+	iss := seqnum.Value(testInitialSequenceNumber)
+	c.SendPacket(nil, &Headers{
+		SrcPort: tcpSeg.DestinationPort(),
+		DstPort: tcpSeg.SourcePort(),
+		Flags:   header.TCPFlagSyn | header.TCPFlagAck,
+		SeqNum:  iss,
+		AckNum:  c.IRS.Add(1),
+		RcvWnd:  30000,
+		TCPOpts: synAckOptions[:offset],
+	})
+
+	// Read ACK.
+	ackPacket := c.GetPacket()
+
+	// Verify TCP header fields.
+	tcpCheckers := []checker.TransportChecker{
+		checker.DstPort(TestPort),
+		checker.TCPFlags(header.TCPFlagAck),
+		checker.SeqNum(uint32(c.IRS) + 1),
+		checker.AckNum(uint32(iss) + 1),
+	}
+
+	// Verify that tsEcr of ACK packet is wantOptions.TSVal if the
+	// timestamp option was enabled, if not then we verify that
+	// there is no timestamp in the ACK packet.
+	if wantOptions.TS {
+		tcpCheckers = append(tcpCheckers, checker.TCPTimestampChecker(true, 0, wantOptions.TSVal))
+	} else {
+		tcpCheckers = append(tcpCheckers, checker.TCPTimestampChecker(false, 0, 0))
+	}
+
+	checker.IPv4(c.t, ackPacket, checker.TCP(tcpCheckers...))
+
+	ackSeg := header.TCP(header.IPv4(ackPacket).Payload())
+	ackOptions := ackSeg.ParsedOptions()
+
+	// Wait (up to one second) for the connection to be established.
+	select {
+	case <-notifyCh:
+		err = c.EP.GetSockOpt(tcpip.ErrorOption{})
+		if err != nil {
+			c.t.Fatalf("Unexpected error when connecting: %v", err)
+		}
+	case <-time.After(1 * time.Second):
+		c.t.Fatalf("Timed out waiting for connection")
+	}
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+		c.t.Fatalf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	// Store the source port in use by the endpoint.
+	c.Port = tcpSeg.SourcePort()
+
+	// Mark in context that timestamp option is enabled for this endpoint.
+	c.TimeStampEnabled = true // NOTE(review): set even when wantOptions.TS is false; confirm intended
+
+	return &RawEndpoint{
+		C:             c,
+		SrcPort:       tcpSeg.DestinationPort(),
+		DstPort:       tcpSeg.SourcePort(),
+		Flags:         header.TCPFlagAck | header.TCPFlagPsh,
+		NextSeqNum:    iss + 1,
+		AckNum:        c.IRS.Add(1),
+		WndSize:       30000,
+		RecentTS:      ackOptions.TSVal,
+		TSVal:         wantOptions.TSVal,
+		SACKPermitted: wantOptions.SACKPermitted,
+	}
+}
+
+// AcceptWithOptions initializes a listening endpoint and connects to it with the
+// provided options enabled. It also verifies that the SYN-ACK has the expected
+// values for the provided options. The accepted endpoint is stored in c.EP.
+//
+// The function returns a RawEndpoint representing the other end of the accepted
+// endpoint. The listening endpoint itself is closed before returning.
+func (c *Context) AcceptWithOptions(wndScale int, synOptions header.TCPSynOptions) *RawEndpoint {
+	// Create EP and start listening.
+	wq := &waiter.Queue{}
+	ep, err := c.s.NewEndpoint(tcp.ProtocolNumber, ipv4.ProtocolNumber, wq)
+	if err != nil {
+		c.t.Fatalf("NewEndpoint failed: %v", err)
+	}
+	defer ep.Close()
+
+	if err := ep.Bind(tcpip.FullAddress{Port: StackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %v", err)
+	}
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateBound; got != want {
+		c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	if err := ep.Listen(10); err != nil {
+		c.t.Fatalf("Listen failed: %v", err)
+	}
+	if got, want := tcp.EndpointState(ep.State()), tcp.StateListen; got != want {
+		c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	rep := c.PassiveConnectWithOptions(100, wndScale, synOptions)
+
+	// Try to accept the connection.
+	we, ch := waiter.NewChannelEntry(nil)
+	wq.EventRegister(&we, waiter.EventIn)
+	defer wq.EventUnregister(&we)
+
+	c.EP, _, err = ep.Accept()
+	if err == tcpip.ErrWouldBlock {
+		// Wait for connection to be established, then retry once.
+		select {
+		case <-ch:
+			c.EP, _, err = ep.Accept()
+
+		case <-time.After(1 * time.Second):
+			c.t.Fatalf("Timed out waiting for accept")
+		}
+	}
+	if err != nil { // covers the first attempt as well as the retry
+		c.t.Fatalf("Accept failed: %v", err)
+	}
+	if got, want := tcp.EndpointState(c.EP.State()), tcp.StateEstablished; got != want {
+		c.t.Errorf("Unexpected endpoint state: want %v, got %v", want, got)
+	}
+
+	return rep
+}
+
+// PassiveConnect just disables WindowScaling (by forcing synOptions.WS to -1)
+// and delegates the call to PassiveConnectWithOptions, discarding its result.
+func (c *Context) PassiveConnect(maxPayload, wndScale int, synOptions header.TCPSynOptions) {
+	synOptions.WS = -1 // synOptions was passed by value; the caller's copy is unaffected
+	c.PassiveConnectWithOptions(maxPayload, wndScale, synOptions)
+}
+
+// PassiveConnectWithOptions initiates a new connection (with the specified TCP
+// options enabled) to StackPort, on which an endpoint is expected to be
+// listening. It also validates that the SYN-ACK has the expected values for
+// the enabled options, and returns a RawEndpoint for the connecting side.
+//
+// NOTE: MSS is not a negotiated option and it can be asymmetric
+// in each direction. This function uses the maxPayload to set the MSS to be
+// sent to the peer on a connect and validates that the MSS in the SYN-ACK
+// response is equal to synOptions.MSS.
+//
+// wndScale is the expected window scale in the SYN-ACK. A window scale option
+// is sent in the SYN whenever synOptions.WS >= 0; the value sent is always 3.
+// NOTE(review): the final ACK's window is shifted by synOptions.WS, not by 3.
+func (c *Context) PassiveConnectWithOptions(maxPayload, wndScale int, synOptions header.TCPSynOptions) *RawEndpoint {
+	opts := make([]byte, header.TCPOptionsMaximumSize)
+	offset := 0
+	offset += header.EncodeMSSOption(uint32(maxPayload), opts)
+
+	if synOptions.WS >= 0 {
+		offset += header.EncodeWSOption(3, opts[offset:])
+	}
+	if synOptions.TS {
+		offset += header.EncodeTSOption(synOptions.TSVal, synOptions.TSEcr, opts[offset:])
+	}
+
+	if synOptions.SACKPermitted {
+		offset += header.EncodeSACKPermittedOption(opts[offset:])
+	}
+
+	paddingToAdd := (4 - offset%4) % 4
+	// Now add any padding bytes that might be required to quad align the
+	// options; none are added when offset is already a multiple of 4.
+	for i := offset; i < offset+paddingToAdd; i++ {
+		opts[i] = header.TCPOptionNOP
+	}
+	offset += paddingToAdd
+
+	// Send a SYN request.
+	iss := seqnum.Value(testInitialSequenceNumber)
+	c.SendPacket(nil, &Headers{
+		SrcPort: TestPort,
+		DstPort: StackPort,
+		Flags:   header.TCPFlagSyn,
+		SeqNum:  iss,
+		RcvWnd:  30000,
+		TCPOpts: opts[:offset],
+	})
+
+	// Receive the SYN-ACK reply. Make sure MSS and other expected options
+	// are present.
+	b := c.GetPacket()
+	synAck := header.TCP(header.IPv4(b).Payload())
+	c.IRS = seqnum.Value(synAck.SequenceNumber())
+
+	tcpCheckers := []checker.TransportChecker{
+		checker.SrcPort(StackPort),
+		checker.DstPort(TestPort),
+		checker.TCPFlags(header.TCPFlagAck | header.TCPFlagSyn),
+		checker.AckNum(uint32(iss) + 1),
+		checker.TCPSynOptions(header.TCPSynOptions{MSS: synOptions.MSS, WS: wndScale, SACKPermitted: synOptions.SACKPermitted && c.SACKEnabled()}),
+	}
+
+	// If TS option was enabled in the original SYN then add a checker to
+	// validate the Timestamp option in the SYN-ACK.
+	if synOptions.TS {
+		tcpCheckers = append(tcpCheckers, checker.TCPTimestampChecker(synOptions.TS, 0, synOptions.TSVal))
+	} else {
+		tcpCheckers = append(tcpCheckers, checker.TCPTimestampChecker(false, 0, 0))
+	}
+
+	checker.IPv4(c.t, b, checker.TCP(tcpCheckers...))
+	rcvWnd := seqnum.Size(30000)
+	ackHeaders := &Headers{
+		SrcPort: TestPort,
+		DstPort: StackPort,
+		Flags:   header.TCPFlagAck,
+		SeqNum:  iss + 1,
+		AckNum:  c.IRS + 1,
+		RcvWnd:  rcvWnd,
+	}
+
+	// If WS was expected to be in effect then scale the advertised window
+	// correspondingly.
+	if synOptions.WS > 0 {
+		ackHeaders.RcvWnd = rcvWnd >> byte(synOptions.WS)
+	}
+
+	parsedOpts := synAck.ParsedOptions()
+	if synOptions.TS {
+		// Echo the tsVal back to the peer in the tsEcr field of the
+		// timestamp option.
+		// Increment TSVal by 1 from the value sent in the SYN and echo
+		// the TSVal in the SYN-ACK in the TSEcr field.
+		tsOpt := [12]byte{header.TCPOptionNOP, header.TCPOptionNOP}
+		header.EncodeTSOption(synOptions.TSVal+1, parsedOpts.TSVal, tsOpt[2:])
+		ackHeaders.TCPOpts = tsOpt[:]
+	}
+
+	// Send ACK.
+	c.SendPacket(nil, ackHeaders)
+
+	c.Port = StackPort
+
+	return &RawEndpoint{
+		C:             c,
+		SrcPort:       TestPort,
+		DstPort:       StackPort,
+		Flags:         header.TCPFlagPsh | header.TCPFlagAck,
+		NextSeqNum:    iss + 1,
+		AckNum:        c.IRS + 1,
+		WndSize:       rcvWnd,
+		SACKPermitted: synOptions.SACKPermitted && c.SACKEnabled(),
+		RecentTS:      parsedOpts.TSVal,
+		TSVal:         synOptions.TSVal + 1,
+	}
+}
+
+// SACKEnabled returns true if the TCP Protocol option SACKEnabled is set to
+// true for the Stack in the context. It returns false when the option
+// cannot be retrieved from the stack.
+func (c *Context) SACKEnabled() bool {
+	var enabled tcp.SACKEnabled
+	err := c.Stack().TransportProtocolOption(tcp.ProtocolNumber, &enabled)
+	// An error means that the stack doesn't support SACK, in which case
+	// whatever is left in enabled is ignored.
+	return err == nil && bool(enabled)
+}
+
+// SetGSOEnabled sets or clears the hardware GSO capability bit of the link endpoint.
+func (c *Context) SetGSOEnabled(enable bool) {
+	if !enable {
+		c.linkEP.LinkEPCapabilities &^= stack.CapabilityHardwareGSO
+		return
+	}
+	c.linkEP.LinkEPCapabilities |= stack.CapabilityHardwareGSO
+}
+
+// MSSWithoutOptions returns the value for the MSS used by the stack when no
+// options are in use.
+func (c *Context) MSSWithoutOptions() uint16 {
+	return uint16(c.linkEP.MTU() - header.IPv4MinimumSize - header.TCPMinimumSize)
+}
+
+// MSSWithoutOptionsV6 returns the value for the MSS used by the stack when no
+// options are in use for IPv6 packets.
+func (c *Context) MSSWithoutOptionsV6() uint16 {
+	return uint16(c.linkEP.MTU() - header.IPv6MinimumSize - header.TCPMinimumSize)
+}
diff --git a/pkg/tcpip/transport/tcp/timer.go b/pkg/tcpip/transport/tcp/timer.go
new file mode 100644
index 000000000..7981d469b
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/timer.go
@@ -0,0 +1,142 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+)
+
+type timerState int // timerState is the state of a timer; see timer.state.
+
+const (
+	timerStateDisabled timerState = iota // the timer is disabled
+	timerStateEnabled                    // the timer is enabled
+	timerStateOrphaned                   // disabled, but the runtime timer is still enabled
+)
+
+// timer is a timer implementation that reduces the interactions with the
+// runtime timer infrastructure by letting timers run (and potentially
+// eventually expire) even if they are stopped. It makes it cheaper to
+// disable/reenable timers at the expense of spurious wakes. This is useful for
+// cases when the same timer is disabled/reenabled repeatedly with relatively
+// long timeouts farther into the future.
+//
+// TCP retransmit timers benefit from this because their timeouts are long
+// (currently at least 200ms), and get disabled when acks are received, and
+// reenabled when new pending segments are sent.
+//
+// It is advantageous to avoid interacting with the runtime because it acquires
+// a global mutex and performs O(log n) operations, where n is the global number
+// of timers, whenever a timer is enabled or disabled, and may make a syscall.
+//
+// This struct is thread-compatible.
+type timer struct {
+	// state is the current state of the timer, it can be one of the
+	// following values:
+	//     disabled - the timer is disabled.
+	//     orphaned - the timer is disabled, but the runtime timer is
+	//                enabled, which means that it will eventually cause a
+	//                spurious wake (unless it gets enabled again before
+	//                then).
+	//     enabled  - the timer is enabled, but the runtime timer may be set
+	//                to an earlier expiration time due to a previous
+	//                orphaned state.
+	state timerState
+
+	// target is the expiration time of the current timer. It is only
+	// meaningful in the enabled state.
+	target time.Time
+
+	// runtimeTarget is the expiration time of the runtime timer. It is
+	// meaningful in the enabled and orphaned states.
+	runtimeTarget time.Time
+
+	// timer is the runtime timer used to wait on. It is created by init.
+	timer *time.Timer
+}
+
+// init initializes the timer in the disabled state. Once the timer expires,
+// the given waker will be asserted.
+func (t *timer) init(w *sleep.Waker) {
+	// Create the runtime timer with a placeholder one-hour timeout and stop
+	// it right away: it only exists so that enable and checkExpiration can
+	// later arm it via Reset. When it fires, it asserts w.
+	rt := time.AfterFunc(time.Hour, w.Assert)
+	rt.Stop()
+
+	t.timer = rt
+	t.state = timerStateDisabled
+}
+
+// cleanup stops the runtime timer and resets t to its zero value.
+func (t *timer) cleanup() {
+	t.timer.Stop() // dereferences t.timer, so init must have been called first
+	*t = timer{}
+}
+
+// checkExpiration checks if the given timer has actually expired. It should be
+// called whenever a sleeper wakes up due to the waker being asserted, and is
+// used to check if it's a spurious wake (due to a previously orphaned timer)
+// or a legitimate one.
+func (t *timer) checkExpiration() bool {
+	// A wake that arrives while orphaned only consumes the stale runtime
+	// timer: move to the fully disabled state and report no expiration.
+	if t.state == timerStateOrphaned {
+		t.state = timerStateDisabled
+		return false
+	}
+
+	// If the target has been reached, this is a legitimate expiration:
+	// disable the timer for now and inform the caller.
+	now := time.Now()
+	if !now.Before(t.target) {
+		t.state = timerStateDisabled
+		return true
+	}
+
+	// The runtime timer fired ahead of the target; reprogram it for the
+	// time that remains and report no expiration.
+	t.runtimeTarget = t.target
+	t.timer.Reset(t.target.Sub(now))
+	return false
+}
+
+// disable disables the timer, leaving it orphaned unless it was already fully disabled.
+func (t *timer) disable() {
+	if t.state == timerStateDisabled {
+		return
+	}
+	t.state = timerStateOrphaned
+}
+
+// enabled returns true only in the enabled state (false when disabled or orphaned).
+func (t *timer) enabled() bool {
+	return t.state == timerStateEnabled
+}
+
+// enable arms the timer to expire after d, programming the runtime timer only if needed.
+func (t *timer) enable(d time.Duration) {
+	deadline := time.Now().Add(d)
+	t.target = deadline
+	// The runtime timer is only reprogrammed when the timer was fully
+	// disabled or when it would otherwise fire after the new deadline.
+	if t.state == timerStateDisabled || deadline.Before(t.runtimeTarget) {
+		t.runtimeTarget = deadline
+		t.timer.Reset(d)
+	}
+	t.state = timerStateEnabled
+}
diff --git a/pkg/tcpip/transport/tcp/timer_test.go b/pkg/tcpip/transport/tcp/timer_test.go
new file mode 100644
index 000000000..dbd6dff54
--- /dev/null
+++ b/pkg/tcpip/transport/tcp/timer_test.go
@@ -0,0 +1,47 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcp
+
+import (
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+)
+
+// TestCleanup checks that cleanup zeroes the timer and that the waker is not
+// asserted afterwards, even once the timer's original duration has elapsed.
+func TestCleanup(t *testing.T) {
+	const timerDurationSeconds = 2
+	const isAssertedTimeoutSeconds = timerDurationSeconds + 1
+
+	var tmr timer
+	var w sleep.Waker
+	tmr.init(&w)
+	tmr.enable(timerDurationSeconds * time.Second)
+	tmr.cleanup()
+
+	if tmr != (timer{}) {
+		t.Errorf("got tmr = %+v, want = %+v", tmr, timer{})
+	}
+
+	// Poll once a second for slightly longer than the timer's duration.
+	for elapsed := 0; elapsed < isAssertedTimeoutSeconds; elapsed++ {
+		time.Sleep(time.Second)
+		if w.IsAsserted() {
+			t.Fatalf("waker asserted unexpectedly")
+		}
+	}
+}
diff --git a/pkg/tcpip/transport/tcpconntrack/BUILD b/pkg/tcpip/transport/tcpconntrack/BUILD
new file mode 100644
index 000000000..3ad6994a7
--- /dev/null
+++ b/pkg/tcpip/transport/tcpconntrack/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "tcpconntrack",
+    srcs = ["tcp_conntrack.go"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/seqnum",
+    ],
+)
+
+go_test(
+    name = "tcpconntrack_test",
+    size = "small",
+    srcs = ["tcp_conntrack_test.go"],
+    deps = [
+        ":tcpconntrack",
+        "//pkg/tcpip/header",
+    ],
+)
diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
new file mode 100644
index 000000000..12bc1b5b5
--- /dev/null
+++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack.go
@@ -0,0 +1,352 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tcpconntrack implements a TCP connection tracking object. It allows
+// users with access to a segment stream to figure out when a connection is
+// established, reset, and closed (and in the last case, who closed first).
+package tcpconntrack
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/seqnum"
+)
+
+// Result is returned when the state of a TCB is updated in response to an
+// inbound or outbound segment.
+type Result int
+
+const (
+	// ResultDrop (the zero Result) indicates that the segment should be dropped.
+	ResultDrop Result = iota
+
+	// ResultConnecting indicates that the connection remains in a
+	// connecting state.
+	ResultConnecting
+
+	// ResultAlive indicates that the connection remains alive (connected).
+	ResultAlive
+
+	// ResultReset indicates that the connection was reset.
+	ResultReset
+
+	// ResultClosedByPeer indicates that the connection was gracefully
+	// closed, and the inbound stream was closed first.
+	ResultClosedByPeer
+
+	// ResultClosedBySelf indicates that the connection was gracefully
+	// closed, and the outbound stream was closed first.
+	ResultClosedBySelf
+)
+
+// TCB is a TCP Control Block. It holds state necessary to keep track of a TCP
+// connection and inform the caller when the connection has been closed.
+type TCB struct {
+	inbound  stream // stream advanced by inbound segments
+	outbound stream // stream advanced by outbound segments
+
+	// State handlers; Init installs the SYN-SENT pair, later replaced.
+	handlerInbound  func(*TCB, header.TCP) Result
+	handlerOutbound func(*TCB, header.TCP) Result
+
+	// firstFin holds a pointer to the first stream to send a FIN.
+	firstFin *stream
+
+	// state is the most recent Result other than ResultDrop.
+	state Result
+}
+
+// Init initializes the state of the TCB according to the initial SYN.
+func (t *TCB) Init(initialSyn header.TCP) Result {
+	// The connection starts out in SYN-SENT.
+	t.handlerInbound, t.handlerOutbound = synSentStateInbound, synSentStateOutbound
+
+	// The outbound stream begins at the ISS, and nxt is the ISS advanced
+	// by the logical length of the SYN. Until the peer advertises a
+	// window, end is simply equal to nxt.
+	iss := seqnum.Value(initialSyn.SequenceNumber())
+	next := iss.Add(logicalLen(initialSyn))
+	t.outbound.una, t.outbound.nxt, t.outbound.end = iss, next, next
+
+	// The initial receive sequence number is unknown until the peer's SYN
+	// arrives, so "end" temporarily stores the bare window size.
+	t.inbound.una, t.inbound.nxt = 0, 0
+	t.inbound.end = seqnum.Value(initialSyn.WindowSize())
+	t.state = ResultConnecting
+	return ResultConnecting
+}
+
+// UpdateStateInbound updates the TCB's state from an inbound segment and returns the result.
+func (t *TCB) UpdateStateInbound(tcp header.TCP) Result {
+	res := t.handlerInbound(t, tcp)
+	if res == ResultDrop {
+		return res // a dropped segment leaves the recorded state untouched
+	}
+	t.state = res
+	return res
+}
+
+// UpdateStateOutbound updates the TCB's state from an outbound segment and returns the result.
+func (t *TCB) UpdateStateOutbound(tcp header.TCP) Result {
+	res := t.handlerOutbound(t, tcp)
+	if res == ResultDrop {
+		return res // a dropped segment leaves the recorded state untouched
+	}
+	t.state = res
+	return res
+}
+
+// IsAlive returns true as long as no RST was seen and at least one stream is not yet closed.
+func (t *TCB) IsAlive() bool {
+	reset := t.inbound.rstSeen || t.outbound.rstSeen
+	return !reset && !(t.inbound.closed() && t.outbound.closed())
+}
+
+// OutboundSendSequenceNumber returns the snd.NXT (the nxt field) of the outbound stream.
+func (t *TCB) OutboundSendSequenceNumber() seqnum.Value {
+	return t.outbound.nxt
+}
+
+// InboundSendSequenceNumber returns the snd.NXT (the nxt field) of the inbound stream.
+func (t *TCB) InboundSendSequenceNumber() seqnum.Value {
+	return t.inbound.nxt
+}
+
+// adaptResult modifies the supplied "Result" according to the state of the TCB;
+// if r is anything other than "Alive", or if one of the streams isn't closed
+// yet, it is returned unmodified. Otherwise it's converted to either
+// ClosedBySelf or ClosedByPeer depending on which stream was closed first.
+func (t *TCB) adaptResult(r Result) Result {
+	switch {
+	// Nothing to adapt: r isn't "Alive" or a stream is still open. The
+	// conditions are tried in order, stopping at the first that holds.
+	case r != ResultAlive, !t.inbound.closed(), !t.outbound.closed():
+		return r
+	// Both streams are closed; firstFin records which of them saw its
+	// FIN first.
+	case t.firstFin == &t.outbound:
+		return ResultClosedBySelf
+	default:
+		return ResultClosedByPeer
+	}
+}
+
+// synSentStateInbound handles inbound segments while the connection is in
+// SYN-SENT; an accepted SYN moves the TCB on to the allOther* handlers.
+func synSentStateInbound(t *TCB, tcp header.TCP) Result {
+	flags := tcp.Flags()
+	ackPresent := flags&header.TCPFlagAck != 0
+	ack := seqnum.Value(tcp.AckNumber())
+
+	// Ignore segment if ack is present but not acceptable.
+	if ackPresent && !(ack-1).InRange(t.outbound.una, t.outbound.nxt) {
+		return ResultConnecting
+	}
+
+	// If reset is specified, we will let the packet through no matter what
+	// but we will also destroy the connection if the ACK is present (and
+	// implicitly acceptable).
+	if flags&header.TCPFlagRst != 0 {
+		if ackPresent {
+			t.inbound.rstSeen = true
+			return ResultReset
+		}
+		return ResultConnecting
+	}
+
+	// Ignore segment if SYN is not set.
+	if flags&header.TCPFlagSyn == 0 {
+		return ResultConnecting
+	}
+
+	// Update state informed by this SYN. Until now inbound.end only held
+	// a window size (see Init); adding irs turns it into a sequence number.
+	irs := seqnum.Value(tcp.SequenceNumber())
+	t.inbound.una = irs
+	t.inbound.nxt = irs.Add(logicalLen(tcp))
+	t.inbound.end += irs
+
+	t.outbound.end = t.outbound.una.Add(seqnum.Size(tcp.WindowSize()))
+
+	// If the ACK was set (it is acceptable), update our tracking of
+	// unacknowledged data.
+	if ackPresent {
+		// Advance the "una" and "end" indices of the outbound stream.
+		if t.outbound.una.LessThan(ack) {
+			t.outbound.una = ack
+		}
+
+		if end := ack.Add(seqnum.Size(tcp.WindowSize())); t.outbound.end.LessThan(end) {
+			t.outbound.end = end
+		}
+	}
+
+	// Update handlers so that new calls will be handled by new state.
+	t.handlerInbound = allOtherInbound
+	t.handlerOutbound = allOtherOutbound
+
+	return ResultAlive
+}
+
+// synSentStateOutbound is the state handler for outbound segments when the
+// connection is in SYN-SENT state.
+func synSentStateOutbound(t *TCB, tcp header.TCP) Result {
+	// Only retransmits of the original SYN are let through: SYN must be the only flag set.
+	isRetransmit := tcp.Flags() == header.TCPFlagSyn && tcp.SequenceNumber() == uint32(t.outbound.una)
+	if !isRetransmit {
+		return ResultDrop
+	}
+
+	// Remember the largest receive window advertised so far.
+	wnd := seqnum.Value(tcp.WindowSize())
+	if t.inbound.end < wnd {
+		t.inbound.end = wnd
+	}
+	return ResultConnecting
+}
+
+// update updates the state of inbound and outbound streams, given the supplied
+// inbound segment. For outbound segments, this same function can be called with
+// swapped inbound/outbound streams. It returns ResultReset or ResultAlive.
+func update(tcp header.TCP, inbound, outbound *stream, firstFin **stream) Result {
+	// Ignore segments out of the window.
+	s := seqnum.Value(tcp.SequenceNumber())
+	if !inbound.acceptable(s, dataLen(tcp)) {
+		return ResultAlive
+	}
+
+	flags := tcp.Flags()
+	if flags&header.TCPFlagRst != 0 {
+		inbound.rstSeen = true
+		return ResultReset
+	}
+
+	// Ignore segments that don't have the ACK flag, and those with the SYN
+	// flag.
+	if flags&header.TCPFlagAck == 0 || flags&header.TCPFlagSyn != 0 {
+		return ResultAlive
+	}
+
+	// Ignore segments that acknowledge not yet sent data.
+	ack := seqnum.Value(tcp.AckNumber())
+	if outbound.nxt.LessThan(ack) {
+		return ResultAlive
+	}
+
+	// Advance the "una" and "end" indices of the outbound stream.
+	if outbound.una.LessThan(ack) {
+		outbound.una = ack
+	}
+
+	if end := ack.Add(seqnum.Size(tcp.WindowSize())); outbound.end.LessThan(end) {
+		outbound.end = end
+	}
+
+	// Advance the "nxt" index of the inbound stream.
+	end := s.Add(logicalLen(tcp))
+	if inbound.nxt.LessThan(end) {
+		inbound.nxt = end
+	}
+
+	// Note the index of the FIN segment (the last sequence number of this
+	// segment). And stash away a pointer to the first stream to see a FIN.
+	if flags&header.TCPFlagFin != 0 && !inbound.finSeen {
+		inbound.finSeen = true
+		inbound.fin = end - 1
+
+		if *firstFin == nil {
+			*firstFin = inbound
+		}
+	}
+
+	return ResultAlive
+}
+
+// allOtherInbound is the state handler for inbound segments in all states
+// except SYN-SENT. It runs update and passes its result through adaptResult.
+func allOtherInbound(t *TCB, tcp header.TCP) Result {
+	return t.adaptResult(update(tcp, &t.inbound, &t.outbound, &t.firstFin))
+}
+
+// allOtherOutbound is the state handler for outbound segments in all states
+// except SYN-SENT. It calls update with the two streams swapped.
+func allOtherOutbound(t *TCB, tcp header.TCP) Result {
+	return t.adaptResult(update(tcp, &t.outbound, &t.inbound, &t.firstFin))
+}
+
+// stream holds the state of a TCP unidirectional stream.
+type stream struct {
+	// The interval [una, end) is the allowed interval as defined by the
+	// receiver, i.e., anything less than una has already been acknowledged
+	// and anything greater than or equal to end is beyond the receiver
+	// window. The interval [una, nxt) is the acknowledgable range, whose
+	// right edge indicates the sequence number of the next byte to be sent
+	// by the sender, i.e., anything greater than or equal to nxt hasn't
+	// been sent yet.
+	una seqnum.Value
+	nxt seqnum.Value
+	end seqnum.Value
+
+	// finSeen indicates if a FIN has already been seen on this stream.
+	finSeen bool
+
+	// fin is the sequence number of the FIN. It is only valid after finSeen
+	// is set to true.
+	fin seqnum.Value
+
+	// rstSeen indicates if a RST has already been seen on this stream.
+	rstSeen bool
+}
+
+// acceptable determines if the segment with the given sequence number and data
+// length is acceptable, i.e., if it's within the [una, end) window or, in case
+// the window is zero, if it's a packet with no payload and sequence number
+// equal to una. The check itself is delegated to header.Acceptable.
+func (s *stream) acceptable(segSeq seqnum.Value, segLen seqnum.Size) bool {
+	return header.Acceptable(segSeq, segLen, s.una, s.end)
+}
+
+// closed determines if the stream has already been closed. This happens when
+// a FIN has been sent by the sender and acknowledged by the receiver.
+func (s *stream) closed() bool {
+	return s.finSeen && s.fin.LessThan(s.una) // una has moved past the FIN
+}
+
+// dataLen returns the payload length: the segment's length minus its DataOffset.
+func dataLen(tcp header.TCP) seqnum.Size {
+	return seqnum.Size(len(tcp) - int(tcp.DataOffset()))
+}
+
+// logicalLen calculates the logical length of the TCP segment.
+func logicalLen(tcp header.TCP) seqnum.Size {
+	size := dataLen(tcp)
+	// SYN and FIN each add one to the length, on top of the payload.
+	switch set := tcp.Flags() & (header.TCPFlagSyn | header.TCPFlagFin); set {
+	case header.TCPFlagSyn | header.TCPFlagFin:
+		size += 2
+	case header.TCPFlagSyn, header.TCPFlagFin:
+		size++
+	}
+	return size
+}
+
+// IsEmpty returns true if tcb is not initialized.
+func (t *TCB) IsEmpty() bool {
+	if t.inbound != (stream{}) || t.outbound != (stream{}) {
+		return false
+	}
+
+	if t.firstFin != nil || t.state != ResultDrop {
+		return false
+	}
+
+	return true
+}
diff --git a/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go
new file mode 100644
index 000000000..5e271b7ca
--- /dev/null
+++ b/pkg/tcpip/transport/tcpconntrack/tcp_conntrack_test.go
@@ -0,0 +1,511 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tcpconntrack_test
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/tcpconntrack"
+)
+
+// connected creates a connection tracker TCB and sets it to a connected state
+// by performing a 3-way handshake.
+func connected(t *testing.T, iss, irs uint32, isw, irw uint16) *tcpconntrack.TCB {
+	// Send SYN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     iss,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: irw,
+	})
+
+	tcb := tcpconntrack.TCB{}
+	tcb.Init(tcp)
+
+	// Receive SYN-ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     irs,
+		AckNum:     iss + 1,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn | header.TCPFlagAck,
+		WindowSize: isw,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Send ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     iss + 1,
+		AckNum:     irs + 1,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck,
+		WindowSize: irw,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	return &tcb
+}
+
+func TestConnectionRefused(t *testing.T) {
+	// Send SYN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 30000,
+	})
+
+	tcb := tcpconntrack.TCB{}
+	tcb.Init(tcp)
+
+	// Receive RST.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     789,
+		AckNum:     1235,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagRst | header.TCPFlagAck,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultReset {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultReset)
+	}
+}
+
+func TestConnectionRefusedInSynRcvd(t *testing.T) {
+	// Send SYN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 30000,
+	})
+
+	tcb := tcpconntrack.TCB{}
+	tcb.Init(tcp)
+
+	// Receive SYN.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     789,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Receive RST with no ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     790,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagRst,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultReset {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultReset)
+	}
+}
+
+func TestConnectionResetInSynRcvd(t *testing.T) {
+	// Send SYN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 30000,
+	})
+
+	tcb := tcpconntrack.TCB{}
+	tcb.Init(tcp)
+
+	// Receive SYN.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     789,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Send RST with no ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1235,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagRst,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultReset {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultReset)
+	}
+}
+
+func TestRetransmitOnSynSent(t *testing.T) {
+	// Send initial SYN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 30000,
+	})
+
+	tcb := tcpconntrack.TCB{}
+	tcb.Init(tcp)
+
+	// Retransmit the same SYN.
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultConnecting {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultConnecting)
+	}
+}
+
+func TestRetransmitOnSynRcvd(t *testing.T) {
+	// Send initial SYN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 30000,
+	})
+
+	tcb := tcpconntrack.TCB{}
+	tcb.Init(tcp)
+
+	// Receive SYN. This will cause the state to go to SYN-RCVD.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     789,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Retransmit the original SYN.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 30000,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Transmit a SYN-ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     790,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn | header.TCPFlagAck,
+		WindowSize: 30000,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+}
+
+func TestClosedBySelf(t *testing.T) {
+	tcb := connected(t, 1234, 789, 30000, 50000)
+
+	// Send FIN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1235,
+		AckNum:     790,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck | header.TCPFlagFin,
+		WindowSize: 30000,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Receive FIN/ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     790,
+		AckNum:     1236,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck | header.TCPFlagFin,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Send ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1236,
+		AckNum:     791,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck,
+		WindowSize: 30000,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultClosedBySelf {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultClosedBySelf)
+	}
+}
+
+func TestClosedByPeer(t *testing.T) {
+	tcb := connected(t, 1234, 789, 30000, 50000)
+
+	// Receive FIN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     790,
+		AckNum:     1235,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck | header.TCPFlagFin,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Send FIN/ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1235,
+		AckNum:     791,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck | header.TCPFlagFin,
+		WindowSize: 30000,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Receive ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     791,
+		AckNum:     1236,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultClosedByPeer {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultClosedByPeer)
+	}
+}
+
+func TestSendAndReceiveDataClosedBySelf(t *testing.T) {
+	sseq := uint32(1234)
+	rseq := uint32(789)
+	tcb := connected(t, sseq, rseq, 30000, 50000)
+	sseq++
+	rseq++
+
+	// Allocate a segment buffer with room for a 1024-byte payload.
+	tcp := make(header.TCP, header.TCPMinimumSize+1024)
+
+	for i := uint32(0); i < 10; i++ {
+		// Send some data.
+		tcp.Encode(&header.TCPFields{
+			SeqNum:     sseq,
+			AckNum:     rseq,
+			DataOffset: header.TCPMinimumSize,
+			Flags:      header.TCPFlagAck,
+			WindowSize: 30000,
+		})
+		sseq += uint32(len(tcp)) - header.TCPMinimumSize
+
+		if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+			t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+		}
+
+		// Receive ack for data.
+		tcp.Encode(&header.TCPFields{
+			SeqNum:     rseq,
+			AckNum:     sseq,
+			DataOffset: header.TCPMinimumSize,
+			Flags:      header.TCPFlagAck,
+			WindowSize: 50000,
+		})
+
+		if r := tcb.UpdateStateInbound(tcp[:header.TCPMinimumSize]); r != tcpconntrack.ResultAlive {
+			t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+		}
+	}
+
+	for i := uint32(0); i < 10; i++ {
+		// Receive some data.
+		tcp.Encode(&header.TCPFields{
+			SeqNum:     rseq,
+			AckNum:     sseq,
+			DataOffset: header.TCPMinimumSize,
+			Flags:      header.TCPFlagAck,
+			WindowSize: 50000,
+		})
+		rseq += uint32(len(tcp)) - header.TCPMinimumSize
+
+		if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+			t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+		}
+
+		// Send ack for data.
+		tcp.Encode(&header.TCPFields{
+			SeqNum:     sseq,
+			AckNum:     rseq,
+			DataOffset: header.TCPMinimumSize,
+			Flags:      header.TCPFlagAck,
+			WindowSize: 30000,
+		})
+
+		if r := tcb.UpdateStateOutbound(tcp[:header.TCPMinimumSize]); r != tcpconntrack.ResultAlive {
+			t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+		}
+	}
+
+	// Send FIN.
+	tcp = tcp[:header.TCPMinimumSize]
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     sseq,
+		AckNum:     rseq,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck | header.TCPFlagFin,
+		WindowSize: 30000,
+	})
+	sseq++
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Receive FIN/ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     rseq,
+		AckNum:     sseq,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck | header.TCPFlagFin,
+		WindowSize: 50000,
+	})
+	rseq++
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Send ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     sseq,
+		AckNum:     rseq,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck,
+		WindowSize: 30000,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultClosedBySelf {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultClosedBySelf)
+	}
+}
+
+func TestIgnoreBadResetOnSynSent(t *testing.T) {
+	// Send SYN.
+	tcp := make(header.TCP, header.TCPMinimumSize)
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1234,
+		AckNum:     0,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn,
+		WindowSize: 30000,
+	})
+
+	tcb := tcpconntrack.TCB{}
+	tcb.Init(tcp)
+
+	// Receive RSTs whose ACK number does not acknowledge our SYN (1235);
+	// none of them may reset the connection, it must stay in connecting.
+	acks := []uint32{1234, 1236, 1000, 5000}
+	flags := []uint8{header.TCPFlagRst, header.TCPFlagRst | header.TCPFlagAck}
+	for _, a := range acks {
+		for _, f := range flags {
+			tcp.Encode(&header.TCPFields{
+				SeqNum:     789,
+				AckNum:     a,
+				DataOffset: header.TCPMinimumSize,
+				Flags:      f,
+				WindowSize: 50000,
+			})
+
+			if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultConnecting {
+				t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultConnecting)
+			}
+		}
+	}
+
+	// Complete the handshake.
+	// Receive SYN-ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     789,
+		AckNum:     1235,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagSyn | header.TCPFlagAck,
+		WindowSize: 50000,
+	})
+
+	if r := tcb.UpdateStateInbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+
+	// Send ACK.
+	tcp.Encode(&header.TCPFields{
+		SeqNum:     1235,
+		AckNum:     790,
+		DataOffset: header.TCPMinimumSize,
+		Flags:      header.TCPFlagAck,
+		WindowSize: 30000,
+	})
+
+	if r := tcb.UpdateStateOutbound(tcp); r != tcpconntrack.ResultAlive {
+		t.Fatalf("Bad result: got %v, want %v", r, tcpconntrack.ResultAlive)
+	}
+}
diff --git a/pkg/tcpip/transport/udp/BUILD b/pkg/tcpip/transport/udp/BUILD
new file mode 100644
index 000000000..b5d2d0ba6
--- /dev/null
+++ b/pkg/tcpip/transport/udp/BUILD
@@ -0,0 +1,60 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "udp_packet_list",
+    out = "udp_packet_list.go",
+    package = "udp",
+    prefix = "udpPacket",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*udpPacket",
+        "Linker": "*udpPacket",
+    },
+)
+
+go_library(
+    name = "udp",
+    srcs = [
+        "endpoint.go",
+        "endpoint_state.go",
+        "forwarder.go",
+        "protocol.go",
+        "udp_packet_list.go",
+    ],
+    imports = ["gvisor.dev/gvisor/pkg/tcpip/buffer"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/sleep",
+        "//pkg/sync",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/ports",
+        "//pkg/tcpip/stack",
+        "//pkg/tcpip/transport/raw",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "udp_x_test",
+    size = "small",
+    srcs = ["udp_test.go"],
+    deps = [
+        ":udp",
+        "//pkg/tcpip",
+        "//pkg/tcpip/buffer",
+        "//pkg/tcpip/checker",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/link/channel",
+        "//pkg/tcpip/link/loopback",
+        "//pkg/tcpip/link/sniffer",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/tcpip/network/ipv6",
+        "//pkg/tcpip/stack",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/tcpip/transport/udp/endpoint.go b/pkg/tcpip/transport/udp/endpoint.go
new file mode 100644
index 000000000..0584ec8dc
--- /dev/null
+++ b/pkg/tcpip/transport/udp/endpoint.go
@@ -0,0 +1,1497 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sleep"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/ports"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// +stateify savable
+type udpPacket struct {
+	udpPacketEntry
+	senderAddress tcpip.FullAddress
+	packetInfo    tcpip.IPPacketInfo
+	data          buffer.VectorisedView `state:".(buffer.VectorisedView)"`
+	timestamp     int64
+	// tos stores either the receiveTOS or receiveTClass value.
+	tos uint8
+}
+
+// EndpointState represents the state of a UDP endpoint.
+type EndpointState uint32
+
+// Endpoint states. Note that they are represented in a netstack-specific manner
+// and may not be meaningful externally. Specifically, they need to be translated
+// to Linux's representation for these states if presented to userspace.
+const (
+	StateInitial EndpointState = iota
+	StateBound
+	StateConnected
+	StateClosed
+)
+
+// String implements fmt.Stringer.String.
+func (s EndpointState) String() string {
+	switch s {
+	case StateInitial:
+		return "INITIAL"
+	case StateBound:
+		return "BOUND"
+	case StateConnected:
+		return "CONNECTED"
+	case StateClosed:
+		return "CLOSED"
+	default:
+		return "UNKNOWN"
+	}
+}
+
+// endpoint represents a UDP endpoint. This struct serves as the interface
+// between users of the endpoint and the protocol implementation; it is legal to
+// have concurrent goroutines make calls into the endpoint, they are properly
+// synchronized.
+//
+// It implements tcpip.Endpoint.
+//
+// +stateify savable
+type endpoint struct {
+	stack.TransportEndpointInfo
+
+	// The following fields are initialized at creation time and do not
+	// change throughout the lifetime of the endpoint.
+	stack       *stack.Stack `state:"manual"`
+	waiterQueue *waiter.Queue
+	uniqueID    uint64
+
+	// The following fields are used to manage the receive queue, and are
+	// protected by rcvMu.
+	rcvMu         sync.Mutex `state:"nosave"`
+	rcvReady      bool
+	rcvList       udpPacketList
+	rcvBufSizeMax int `state:".(int)"`
+	rcvBufSize    int
+	rcvClosed     bool
+
+	// The following fields are protected by the mu mutex.
+	mu             sync.RWMutex `state:"nosave"`
+	sndBufSize     int
+	sndBufSizeMax  int
+	state          EndpointState
+	route          stack.Route `state:"manual"`
+	dstPort        uint16
+	v6only         bool
+	ttl            uint8
+	multicastTTL   uint8
+	multicastAddr  tcpip.Address
+	multicastNICID tcpip.NICID
+	multicastLoop  bool
+	portFlags      ports.Flags
+	bindToDevice   tcpip.NICID
+	broadcast      bool
+	noChecksum     bool
+
+	lastErrorMu sync.Mutex   `state:"nosave"`
+	lastError   *tcpip.Error `state:".(string)"`
+
+	// Values used to reserve a port or register a transport endpoint
+	// (whichever happens first).
+	boundBindToDevice tcpip.NICID
+	boundPortFlags    ports.Flags
+
+	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
+	// applied while sending packets. Defaults to 0 as on Linux.
+	sendTOS uint8
+
+	// receiveTOS determines if the incoming IPv4 TOS header field is passed
+	// as ancillary data to ControlMessages on Read.
+	receiveTOS bool
+
+	// receiveTClass determines if the incoming IPv6 TClass header field is
+	// passed as ancillary data to ControlMessages on Read.
+	receiveTClass bool
+
+	// receiveIPPacketInfo determines if the packet info is returned by Read.
+	receiveIPPacketInfo bool
+
+	// shutdownFlags represent the current shutdown state of the endpoint.
+	shutdownFlags tcpip.ShutdownFlags
+
+	// multicastMemberships that need to be removed when the endpoint is
+	// closed. Protected by the mu mutex.
+	multicastMemberships []multicastMembership
+
+	// effectiveNetProtos contains the network protocols actually in use. In
+	// most cases it will only contain "netProto", but in cases like IPv6
+	// endpoints with v6only set to false, this could include multiple
+	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
+	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
+	// address).
+	effectiveNetProtos []tcpip.NetworkProtocolNumber
+
+	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
+	stats tcpip.TransportEndpointStats `state:"nosave"`
+
+	// owner is used to get uid and gid of the packet.
+	owner tcpip.PacketOwner
+}
+
+// +stateify savable
+type multicastMembership struct {
+	nicID         tcpip.NICID
+	multicastAddr tcpip.Address
+}
+
+func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
+	e := &endpoint{
+		stack: s,
+		TransportEndpointInfo: stack.TransportEndpointInfo{
+			NetProto:   netProto,
+			TransProto: header.UDPProtocolNumber,
+		},
+		waiterQueue: waiterQueue,
+		// RFC 1075 section 5.4 recommends a TTL of 1 for membership
+		// requests.
+		//
+		// RFC 5135 4.2.1 appears to assume that IGMP messages have a
+		// TTL of 1.
+		//
+		// RFC 5135 Appendix A defines TTL=1: A multicast source that
+		// wants its traffic to not traverse a router (e.g., leave a
+		// home network) may find it useful to send traffic with IP
+		// TTL=1.
+		//
+		// Linux defaults to TTL=1.
+		multicastTTL:  1,
+		multicastLoop: true,
+		rcvBufSizeMax: 32 * 1024,
+		sndBufSizeMax: 32 * 1024,
+		state:         StateInitial,
+		uniqueID:      s.UniqueID(),
+	}
+
+	// Override with stack defaults.
+	var ss stack.SendBufferSizeOption
+	if err := s.Option(&ss); err == nil {
+		e.sndBufSizeMax = ss.Default
+	}
+
+	var rs stack.ReceiveBufferSizeOption
+	if err := s.Option(&rs); err == nil {
+		e.rcvBufSizeMax = rs.Default
+	}
+
+	return e
+}
+
+// UniqueID implements stack.TransportEndpoint.UniqueID.
+func (e *endpoint) UniqueID() uint64 {
+	return e.uniqueID
+}
+
+func (e *endpoint) takeLastError() *tcpip.Error {
+	e.lastErrorMu.Lock()
+	defer e.lastErrorMu.Unlock()
+
+	err := e.lastError
+	e.lastError = nil
+	return err
+}
+
+// Abort implements stack.TransportEndpoint.Abort.
+func (e *endpoint) Abort() {
+	e.Close()
+}
+
+// Close puts the endpoint in a closed state and frees all resources
+// associated with it.
+func (e *endpoint) Close() {
+	e.mu.Lock()
+	e.shutdownFlags = tcpip.ShutdownRead | tcpip.ShutdownWrite
+
+	switch e.state {
+	case StateBound, StateConnected:
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice)
+		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{})
+		e.boundBindToDevice = 0
+		e.boundPortFlags = ports.Flags{}
+	}
+
+	for _, mem := range e.multicastMemberships {
+		e.stack.LeaveGroup(e.NetProto, mem.nicID, mem.multicastAddr)
+	}
+	e.multicastMemberships = nil
+
+	// Close the receive list and drain it.
+	e.rcvMu.Lock()
+	e.rcvClosed = true
+	e.rcvBufSize = 0
+	for !e.rcvList.Empty() {
+		p := e.rcvList.Front()
+		e.rcvList.Remove(p)
+	}
+	e.rcvMu.Unlock()
+
+	e.route.Release()
+
+	// Update the state.
+	e.state = StateClosed
+
+	e.mu.Unlock()
+
+	e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.EventIn | waiter.EventOut)
+}
+
+// ModerateRecvBuf implements tcpip.Endpoint.ModerateRecvBuf.
+func (e *endpoint) ModerateRecvBuf(copied int) {}
+
+// Read reads data from the endpoint. This method does not block if
+// there is no data pending.
+func (e *endpoint) Read(addr *tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
+	if err := e.takeLastError(); err != nil {
+		return buffer.View{}, tcpip.ControlMessages{}, err
+	}
+
+	e.rcvMu.Lock()
+
+	if e.rcvList.Empty() {
+		err := tcpip.ErrWouldBlock
+		if e.rcvClosed {
+			e.stats.ReadErrors.ReadClosed.Increment()
+			err = tcpip.ErrClosedForReceive
+		}
+		e.rcvMu.Unlock()
+		return buffer.View{}, tcpip.ControlMessages{}, err
+	}
+
+	p := e.rcvList.Front()
+	e.rcvList.Remove(p)
+	e.rcvBufSize -= p.data.Size()
+	e.rcvMu.Unlock()
+
+	if addr != nil {
+		*addr = p.senderAddress
+	}
+
+	cm := tcpip.ControlMessages{
+		HasTimestamp: true,
+		Timestamp:    p.timestamp,
+	}
+	e.mu.RLock()
+	receiveTOS := e.receiveTOS
+	receiveTClass := e.receiveTClass
+	receiveIPPacketInfo := e.receiveIPPacketInfo
+	e.mu.RUnlock()
+	if receiveTOS {
+		cm.HasTOS = true
+		cm.TOS = p.tos
+	}
+	if receiveTClass {
+		cm.HasTClass = true
+		// Although TClass is an 8-bit value it's read in the CMsg as a uint32.
+		cm.TClass = uint32(p.tos)
+	}
+	if receiveIPPacketInfo {
+		cm.HasIPPacketInfo = true
+		cm.PacketInfo = p.packetInfo
+	}
+	return p.data.ToView(), cm, nil
+}
+
+// prepareForWrite prepares the endpoint for sending data. In particular, it
+// binds it if it's still in the initial state. To do so, it must first
+// reacquire the mutex in exclusive mode.
+//
+// Returns true for retry if preparation should be retried.
+func (e *endpoint) prepareForWrite(to *tcpip.FullAddress) (retry bool, err *tcpip.Error) {
+	switch e.state {
+	case StateInitial:
+	case StateConnected:
+		return false, nil
+
+	case StateBound:
+		if to == nil {
+			return false, tcpip.ErrDestinationRequired
+		}
+		return false, nil
+	default:
+		return false, tcpip.ErrInvalidEndpointState
+	}
+
+	e.mu.RUnlock()
+	defer e.mu.RLock()
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// The state changed when we released the shared lock and re-acquired
+	// it in exclusive mode. Try again.
+	if e.state != StateInitial {
+		return true, nil
+	}
+
+	// The state is still 'initial', so try to bind the endpoint.
+	if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
+		return false, err
+	}
+
+	return true, nil
+}
+
+// connectRoute establishes a route to the specified interface or the
+// configured multicast interface if no interface is specified and the
+// specified address is a multicast address.
+func (e *endpoint) connectRoute(nicID tcpip.NICID, addr tcpip.FullAddress, netProto tcpip.NetworkProtocolNumber) (stack.Route, tcpip.NICID, *tcpip.Error) {
+	localAddr := e.ID.LocalAddress
+	if isBroadcastOrMulticast(localAddr) {
+		// A packet can only originate from a unicast address (i.e., an interface).
+		localAddr = ""
+	}
+
+	if header.IsV4MulticastAddress(addr.Addr) || header.IsV6MulticastAddress(addr.Addr) {
+		if nicID == 0 {
+			nicID = e.multicastNICID
+		}
+		if localAddr == "" && nicID == 0 {
+			localAddr = e.multicastAddr
+		}
+	}
+
+	// Find a route to the desired destination.
+	r, err := e.stack.FindRoute(nicID, localAddr, addr.Addr, netProto, e.multicastLoop)
+	if err != nil {
+		return stack.Route{}, 0, err
+	}
+	return r, nicID, nil
+}
+
+// Write writes data to the endpoint's peer. This method does not block
+// if the data cannot be written.
+func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	n, ch, err := e.write(p, opts)
+	switch err {
+	case nil:
+		e.stats.PacketsSent.Increment()
+	case tcpip.ErrMessageTooLong, tcpip.ErrInvalidOptionValue:
+		e.stats.WriteErrors.InvalidArgs.Increment()
+	case tcpip.ErrClosedForSend:
+		e.stats.WriteErrors.WriteClosed.Increment()
+	case tcpip.ErrInvalidEndpointState:
+		e.stats.WriteErrors.InvalidEndpointState.Increment()
+	case tcpip.ErrNoLinkAddress:
+		e.stats.SendErrors.NoLinkAddr.Increment()
+	case tcpip.ErrNoRoute, tcpip.ErrBroadcastDisabled, tcpip.ErrNetworkUnreachable:
+		// Errors indicating any problem with IP routing of the packet.
+		e.stats.SendErrors.NoRoute.Increment()
+	default:
+		// For all other errors when writing to the network layer.
+		e.stats.SendErrors.SendToNetworkFailed.Increment()
+	}
+	return n, ch, err
+}
+
+func (e *endpoint) write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
+	if err := e.takeLastError(); err != nil {
+		return 0, nil, err
+	}
+
+	// MSG_MORE is unimplemented. (This also means that MSG_EOR is a no-op.)
+	if opts.More {
+		return 0, nil, tcpip.ErrInvalidOptionValue
+	}
+
+	to := opts.To
+
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+
+	// If we've shutdown with SHUT_WR we are in an invalid state for sending.
+	if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
+		return 0, nil, tcpip.ErrClosedForSend
+	}
+
+	// Prepare for write.
+	for {
+		retry, err := e.prepareForWrite(to)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		if !retry {
+			break
+		}
+	}
+
+	var route *stack.Route
+	var resolve func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error)
+	var dstPort uint16
+	if to == nil {
+		route = &e.route
+		dstPort = e.dstPort
+		resolve = func(waker *sleep.Waker) (ch <-chan struct{}, err *tcpip.Error) {
+			// Promote lock to exclusive if using a shared route, given that it may
+			// need to change in Route.Resolve() call below.
+			e.mu.RUnlock()
+			e.mu.Lock()
+
+			// Recheck state after lock was re-acquired.
+			if e.state != StateConnected {
+				err = tcpip.ErrInvalidEndpointState
+			}
+			if err == nil && route.IsResolutionRequired() {
+				ch, err = route.Resolve(waker)
+			}
+
+			e.mu.Unlock()
+			e.mu.RLock()
+
+			// Recheck state after lock was re-acquired.
+			if e.state != StateConnected {
+				err = tcpip.ErrInvalidEndpointState
+			}
+			return
+		}
+	} else {
+		// Reject destination address if it goes through a different
+		// NIC than the endpoint was bound to.
+		nicID := to.NIC
+		if e.BindNICID != 0 {
+			if nicID != 0 && nicID != e.BindNICID {
+				return 0, nil, tcpip.ErrNoRoute
+			}
+
+			nicID = e.BindNICID
+		}
+
+		if to.Addr == header.IPv4Broadcast && !e.broadcast {
+			return 0, nil, tcpip.ErrBroadcastDisabled
+		}
+
+		dst, netProto, err := e.checkV4MappedLocked(*to)
+		if err != nil {
+			return 0, nil, err
+		}
+
+		r, _, err := e.connectRoute(nicID, dst, netProto)
+		if err != nil {
+			return 0, nil, err
+		}
+		defer r.Release()
+
+		route = &r
+		dstPort = dst.Port
+		resolve = route.Resolve
+	}
+
+	if route.IsResolutionRequired() {
+		if ch, err := resolve(nil); err != nil {
+			if err == tcpip.ErrWouldBlock {
+				return 0, ch, tcpip.ErrNoLinkAddress
+			}
+			return 0, nil, err
+		}
+	}
+
+	v, err := p.FullPayload()
+	if err != nil {
+		return 0, nil, err
+	}
+	if len(v) > header.UDPMaximumPacketSize {
+		// Payload can't possibly fit in a packet.
+		return 0, nil, tcpip.ErrMessageTooLong
+	}
+
+	ttl := e.ttl
+	useDefaultTTL := ttl == 0
+
+	if header.IsV4MulticastAddress(route.RemoteAddress) || header.IsV6MulticastAddress(route.RemoteAddress) {
+		ttl = e.multicastTTL
+		// Multicast allows a 0 TTL.
+		useDefaultTTL = false
+	}
+
+	if err := sendUDP(route, buffer.View(v).ToVectorisedView(), e.ID.LocalPort, dstPort, ttl, useDefaultTTL, e.sendTOS, e.owner, e.noChecksum); err != nil {
+		return 0, nil, err
+	}
+	return int64(len(v)), nil, nil
+}
+
+// Peek only returns data from a single datagram, so do nothing here.
+func (e *endpoint) Peek([][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
+	return 0, tcpip.ControlMessages{}, nil
+}
+
+// SetSockOptBool implements tcpip.Endpoint.SetSockOptBool. Recognized options are stored under e.mu; options not listed in the switch are ignored and nil is returned.
+func (e *endpoint) SetSockOptBool(opt tcpip.SockOptBool, v bool) *tcpip.Error {
+	switch opt {
+	case tcpip.BroadcastOption:
+		e.mu.Lock()
+		e.broadcast = v
+		e.mu.Unlock()
+
+	case tcpip.MulticastLoopOption:
+		e.mu.Lock()
+		e.multicastLoop = v
+		e.mu.Unlock()
+
+	case tcpip.NoChecksumOption:
+		e.mu.Lock()
+		e.noChecksum = v
+		e.mu.Unlock()
+
+	case tcpip.ReceiveTOSOption:
+		e.mu.Lock()
+		e.receiveTOS = v
+		e.mu.Unlock()
+
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrNotSupported
+		}
+
+		e.mu.Lock()
+		e.receiveTClass = v
+		e.mu.Unlock()
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.Lock()
+		e.receiveIPPacketInfo = v
+		e.mu.Unlock()
+
+	case tcpip.ReuseAddressOption:
+		e.mu.Lock()
+		e.portFlags.MostRecent = v
+		e.mu.Unlock()
+
+	case tcpip.ReusePortOption:
+		e.mu.Lock()
+		e.portFlags.LoadBalanced = v
+		e.mu.Unlock()
+
+	case tcpip.V6OnlyOption:
+		// We only recognize this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		e.mu.Lock()
+		defer e.mu.Unlock()
+
+		// v6only may only change while the endpoint is still in StateInitial.
+		if e.state != StateInitial {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		e.v6only = v
+	}
+
+	return nil
+}
+
+// SetSockOptInt implements tcpip.Endpoint.SetSockOptInt. Options not listed in the switch are ignored and nil is returned.
+func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) *tcpip.Error {
+	switch opt {
+	case tcpip.MTUDiscoverOption:
+		// Return not supported if the value is not disabling path
+		// MTU discovery.
+		if v != tcpip.PMTUDiscoveryDont {
+			return tcpip.ErrNotSupported
+		}
+
+	case tcpip.MulticastTTLOption:
+		e.mu.Lock()
+		e.multicastTTL = uint8(v)
+		e.mu.Unlock()
+
+	case tcpip.TTLOption:
+		e.mu.Lock()
+		e.ttl = uint8(v) // NOTE(review): v is narrowed to 8 bits without a range check here — confirm callers validate.
+		e.mu.Unlock()
+
+	case tcpip.IPv4TOSOption:
+		e.mu.Lock()
+		e.sendTOS = uint8(v)
+		e.mu.Unlock()
+
+	case tcpip.IPv6TrafficClassOption:
+		e.mu.Lock()
+		e.sendTOS = uint8(v)
+		e.mu.Unlock()
+
+	case tcpip.ReceiveBufferSizeOption:
+		// Clamp the requested size to the stack-wide [Min, Max] range.
+		var rs stack.ReceiveBufferSizeOption
+		if err := e.stack.Option(&rs); err != nil {
+			panic(fmt.Sprintf("e.stack.Option(%#v) = %s", rs, err))
+		}
+
+		if v < rs.Min {
+			v = rs.Min
+		}
+		if v > rs.Max {
+			v = rs.Max
+		}
+
+		// rcvBufSizeMax is read by HandlePacket under rcvMu, so write it under rcvMu, not mu.
+		e.rcvMu.Lock()
+		e.rcvBufSizeMax = v
+		e.rcvMu.Unlock()
+		return nil
+	case tcpip.SendBufferSizeOption:
+		// Make sure the send buffer size is within the min and max
+		// allowed.
+		var ss stack.SendBufferSizeOption
+		if err := e.stack.Option(&ss); err != nil {
+			panic(fmt.Sprintf("e.stack.Option(%#v) = %s", ss, err))
+		}
+
+		if v < ss.Min {
+			v = ss.Min
+		}
+		if v > ss.Max {
+			v = ss.Max
+		}
+
+		e.mu.Lock()
+		e.sndBufSizeMax = v
+		e.mu.Unlock()
+		return nil
+	}
+
+	return nil
+}
+
+// SetSockOpt implements tcpip.Endpoint.SetSockOpt. Option types not listed in the switch are ignored and nil is returned.
+func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
+	switch v := opt.(type) {
+	case tcpip.MulticastInterfaceOption:
+		e.mu.Lock()
+		defer e.mu.Unlock()
+
+		fa := tcpip.FullAddress{Addr: v.InterfaceAddr}
+		fa, netProto, err := e.checkV4MappedLocked(fa)
+		if err != nil {
+			return err
+		}
+		nic := v.NIC
+		addr := fa.Addr
+
+		if nic == 0 && addr == "" {
+			e.multicastAddr = ""
+			e.multicastNICID = 0
+			break
+		}
+
+		if nic != 0 {
+			if !e.stack.CheckNIC(nic) {
+				return tcpip.ErrBadLocalAddress
+			}
+		} else {
+			nic = e.stack.CheckLocalAddress(0, netProto, addr)
+			if nic == 0 {
+				return tcpip.ErrBadLocalAddress
+			}
+		}
+
+		if e.BindNICID != 0 && e.BindNICID != nic {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		e.multicastNICID = nic
+		e.multicastAddr = addr
+
+	case tcpip.AddMembershipOption:
+		if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) {
+			return tcpip.ErrInvalidOptionValue
+		}
+
+		nicID := v.NIC
+
+		// The interface address is considered not-set if it is empty or contains
+		// all-zeros. The former represents the zero-value in golang, the latter the
+		// same in a setsockopt(IP_ADD_MEMBERSHIP, &ip_mreqn) syscall.
+		allZeros := header.IPv4Any
+		if len(v.InterfaceAddr) == 0 || v.InterfaceAddr == allZeros {
+			if nicID == 0 {
+				r, err := e.stack.FindRoute(0, "", v.MulticastAddr, header.IPv4ProtocolNumber, false /* multicastLoop */)
+				if err == nil {
+					nicID = r.NICID()
+					r.Release()
+				}
+			}
+		} else {
+			nicID = e.stack.CheckLocalAddress(nicID, e.NetProto, v.InterfaceAddr)
+		}
+		if nicID == 0 {
+			return tcpip.ErrUnknownDevice
+		}
+
+		memToInsert := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr}
+
+		e.mu.Lock()
+		defer e.mu.Unlock()
+
+		for _, mem := range e.multicastMemberships {
+			if mem == memToInsert {
+				return tcpip.ErrPortInUse
+			}
+		}
+
+		if err := e.stack.JoinGroup(e.NetProto, nicID, v.MulticastAddr); err != nil {
+			return err
+		}
+
+		e.multicastMemberships = append(e.multicastMemberships, memToInsert)
+
+	case tcpip.RemoveMembershipOption:
+		if !header.IsV4MulticastAddress(v.MulticastAddr) && !header.IsV6MulticastAddress(v.MulticastAddr) {
+			return tcpip.ErrInvalidOptionValue
+		}
+
+		nicID := v.NIC
+		if len(v.InterfaceAddr) == 0 || v.InterfaceAddr == header.IPv4Any { // "not set", same rule as AddMembershipOption
+			if nicID == 0 {
+				r, err := e.stack.FindRoute(0, "", v.MulticastAddr, header.IPv4ProtocolNumber, false /* multicastLoop */)
+				if err == nil {
+					nicID = r.NICID()
+					r.Release()
+				}
+			}
+		} else {
+			nicID = e.stack.CheckLocalAddress(nicID, e.NetProto, v.InterfaceAddr)
+		}
+		if nicID == 0 {
+			return tcpip.ErrUnknownDevice
+		}
+
+		memToRemove := multicastMembership{nicID: nicID, multicastAddr: v.MulticastAddr}
+		memToRemoveIndex := -1
+
+		e.mu.Lock()
+		defer e.mu.Unlock()
+
+		for i, mem := range e.multicastMemberships {
+			if mem == memToRemove {
+				memToRemoveIndex = i
+				break
+			}
+		}
+		if memToRemoveIndex == -1 {
+			return tcpip.ErrBadLocalAddress
+		}
+
+		if err := e.stack.LeaveGroup(e.NetProto, nicID, v.MulticastAddr); err != nil {
+			return err
+		}
+
+		e.multicastMemberships[memToRemoveIndex] = e.multicastMemberships[len(e.multicastMemberships)-1]
+		e.multicastMemberships = e.multicastMemberships[:len(e.multicastMemberships)-1]
+
+	case tcpip.BindToDeviceOption:
+		id := tcpip.NICID(v)
+		if id != 0 && !e.stack.HasNIC(id) {
+			return tcpip.ErrUnknownDevice
+		}
+		e.mu.Lock()
+		e.bindToDevice = id
+		e.mu.Unlock()
+	}
+	return nil
+}
+
+// GetSockOptBool implements tcpip.Endpoint.GetSockOptBool. Values are read under e.mu's read lock; unknown options yield ErrUnknownProtocolOption.
+func (e *endpoint) GetSockOptBool(opt tcpip.SockOptBool) (bool, *tcpip.Error) {
+	switch opt {
+	case tcpip.BroadcastOption:
+		e.mu.RLock()
+		v := e.broadcast
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.KeepaliveEnabledOption:
+		return false, nil
+
+	case tcpip.MulticastLoopOption:
+		e.mu.RLock()
+		v := e.multicastLoop
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.NoChecksumOption:
+		e.mu.RLock()
+		v := e.noChecksum
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReceiveTOSOption:
+		e.mu.RLock()
+		v := e.receiveTOS
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReceiveTClassOption:
+		// We only support this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrNotSupported
+		}
+
+		e.mu.RLock()
+		v := e.receiveTClass
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReceiveIPPacketInfoOption:
+		e.mu.RLock()
+		v := e.receiveIPPacketInfo
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReuseAddressOption:
+		e.mu.RLock()
+		v := e.portFlags.MostRecent
+		e.mu.RUnlock()
+
+		return v, nil
+
+	case tcpip.ReusePortOption:
+		e.mu.RLock()
+		v := e.portFlags.LoadBalanced
+		e.mu.RUnlock()
+
+		return v, nil
+
+	case tcpip.V6OnlyOption:
+		// We only recognize this option on v6 endpoints.
+		if e.NetProto != header.IPv6ProtocolNumber {
+			return false, tcpip.ErrUnknownProtocolOption
+		}
+
+		e.mu.RLock()
+		v := e.v6only
+		e.mu.RUnlock()
+
+		return v, nil
+
+	default:
+		return false, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. Fields guarded by e.mu are read under its read lock, receive-queue fields under rcvMu.
+func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, *tcpip.Error) {
+	switch opt {
+	case tcpip.IPv4TOSOption:
+		e.mu.RLock()
+		v := int(e.sendTOS)
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.IPv6TrafficClassOption:
+		e.mu.RLock()
+		v := int(e.sendTOS)
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.MTUDiscoverOption:
+		// The only supported setting is path MTU discovery disabled.
+		return tcpip.PMTUDiscoveryDont, nil
+
+	case tcpip.MulticastTTLOption:
+		e.mu.RLock()
+		v := int(e.multicastTTL)
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReceiveQueueSizeOption:
+		// Reports the size of the datagram at the front of the queue, or 0 if the queue is empty.
+		v := 0
+		e.rcvMu.Lock()
+		if !e.rcvList.Empty() {
+			p := e.rcvList.Front()
+			v = p.data.Size()
+		}
+		e.rcvMu.Unlock()
+		return v, nil
+
+	case tcpip.SendBufferSizeOption:
+		e.mu.RLock()
+		v := e.sndBufSizeMax
+		e.mu.RUnlock()
+		return v, nil
+
+	case tcpip.ReceiveBufferSizeOption:
+		e.rcvMu.Lock()
+		v := e.rcvBufSizeMax
+		e.rcvMu.Unlock()
+		return v, nil
+
+	case tcpip.TTLOption:
+		e.mu.RLock()
+		v := int(e.ttl)
+		e.mu.RUnlock()
+		return v, nil
+
+	default:
+		return -1, tcpip.ErrUnknownProtocolOption
+	}
+}
+
+// GetSockOpt implements tcpip.Endpoint.GetSockOpt. Pointer options are filled in place; unknown option types yield ErrUnknownProtocolOption.
+func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
+	switch o := opt.(type) {
+	case tcpip.ErrorOption:
+		return e.takeLastError()
+	case *tcpip.MulticastInterfaceOption:
+		e.mu.RLock()
+		*o = tcpip.MulticastInterfaceOption{
+			NIC:           e.multicastNICID,
+			InterfaceAddr: e.multicastAddr,
+		}
+		e.mu.RUnlock()
+
+	case *tcpip.BindToDeviceOption:
+		e.mu.RLock()
+		*o = tcpip.BindToDeviceOption(e.bindToDevice)
+		e.mu.RUnlock()
+
+	default:
+		return tcpip.ErrUnknownProtocolOption
+	}
+	return nil
+}
+
+// sendUDP prepends a UDP header to data and writes the segment to route r,
+// bumping the route's UDP PacketsSent/PacketSendErrors stats; it returns WritePacket's error.
+func sendUDP(r *stack.Route, data buffer.VectorisedView, localPort, remotePort uint16, ttl uint8, useDefaultTTL bool, tos uint8, owner tcpip.PacketOwner, noChecksum bool) *tcpip.Error {
+	// Allocate a buffer for the UDP header.
+	hdr := buffer.NewPrependable(header.UDPMinimumSize + int(r.MaxHeaderLength()))
+
+	// Initialize the header.
+	udp := header.UDP(hdr.Prepend(header.UDPMinimumSize))
+
+	length := uint16(hdr.UsedLength() + data.Size()) // NOTE(review): narrowing to uint16 relies on callers bounding the payload size — confirm.
+	udp.Encode(&header.UDPFields{
+		SrcPort: localPort,
+		DstPort: remotePort,
+		Length:  length,
+	})
+
+	// Set the checksum field unless TX checksum offload is enabled.
+	// On IPv4, UDP checksum is optional, and a zero value indicates the
+	// transmitter skipped the checksum generation (RFC768).
+	// On IPv6, UDP checksum is not optional (RFC2460 Section 8.1).
+	if r.Capabilities()&stack.CapabilityTXChecksumOffload == 0 &&
+		(!noChecksum || r.NetProto == header.IPv6ProtocolNumber) {
+		xsum := r.PseudoHeaderChecksum(ProtocolNumber, length)
+		for _, v := range data.Views() {
+			xsum = header.Checksum(v, xsum)
+		}
+		udp.SetChecksum(^udp.CalculateChecksum(xsum))
+	}
+
+	if useDefaultTTL {
+		ttl = r.DefaultTTL()
+	}
+	if err := r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{
+		Protocol: ProtocolNumber,
+		TTL:      ttl,
+		TOS:      tos,
+	}, &stack.PacketBuffer{
+		Header:          hdr,
+		Data:            data,
+		TransportHeader: buffer.View(udp),
+		Owner:           owner,
+	}); err != nil {
+		r.Stats().UDP.PacketSendErrors.Increment()
+		return err
+	}
+
+	// Track count of packets sent.
+	r.Stats().UDP.PacketsSent.Increment()
+	return nil
+}
+
+// checkV4MappedLocked resolves addr through AddrNetProtoLocked (honouring
+// e.v6only) and returns the unwrapped address with its network protocol.
+func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, *tcpip.Error) {
+	canonical, proto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.v6only)
+	if err == nil {
+		return canonical, proto, nil
+	}
+	return tcpip.FullAddress{}, 0, err
+}
+
+// Disconnect implements tcpip.Endpoint.Disconnect. It is a no-op unless the endpoint is connected; afterwards the state is StateBound or StateInitial.
+func (e *endpoint) Disconnect() *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	if e.state != StateConnected {
+		return nil
+	}
+	var (
+		id  stack.TransportEndpointID
+		btd tcpip.NICID
+	)
+
+	// We change this value below and we need the old value to unregister
+	// the endpoint.
+	boundPortFlags := e.boundPortFlags
+
+	// Exclude ephemerally bound endpoints.
+	if e.BindNICID != 0 || e.ID.LocalAddress == "" {
+		var err *tcpip.Error
+		id = stack.TransportEndpointID{
+			LocalPort:    e.ID.LocalPort,
+			LocalAddress: e.ID.LocalAddress,
+		}
+		id, btd, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id)
+		if err != nil {
+			return err
+		}
+		e.state = StateBound
+		boundPortFlags = e.boundPortFlags // NOTE(review): overwrites the saved "old" flags with the ones registerWithStack just set — confirm intended.
+	} else {
+		if e.ID.LocalPort != 0 {
+			// Release the ephemeral port.
+			e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, boundPortFlags, e.boundBindToDevice, tcpip.FullAddress{})
+			e.boundPortFlags = ports.Flags{}
+		}
+		e.state = StateInitial
+	}
+
+	e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, boundPortFlags, e.boundBindToDevice)
+	e.ID = id
+	e.boundBindToDevice = btd
+	e.route.Release()
+	e.route = stack.Route{}
+	e.dstPort = 0
+
+	return nil
+}
+
+// Connect connects the endpoint to its peer. Specifying a NIC is optional. It registers the new 4-tuple before dropping the old registration.
+func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
+	if addr.Port == 0 {
+		// We don't support connecting to port zero.
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	nicID := addr.NIC
+	var localPort uint16
+	switch e.state {
+	case StateInitial:
+	case StateBound, StateConnected:
+		localPort = e.ID.LocalPort
+		if e.BindNICID == 0 {
+			break
+		}
+
+		if nicID != 0 && nicID != e.BindNICID {
+			return tcpip.ErrInvalidEndpointState
+		}
+
+		nicID = e.BindNICID
+	default:
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
+	r, nicID, err := e.connectRoute(nicID, addr, netProto)
+	if err != nil {
+		return err
+	}
+	defer r.Release()
+
+	id := stack.TransportEndpointID{
+		LocalAddress:  e.ID.LocalAddress,
+		LocalPort:     localPort,
+		RemotePort:    addr.Port,
+		RemoteAddress: r.RemoteAddress,
+	}
+
+	if e.state == StateInitial {
+		id.LocalAddress = r.LocalAddress
+	}
+
+	// Even if we're connected, this endpoint can still be used to send
+	// packets on a different network protocol, so we register both even if
+	// v6only is set to false and this is an ipv6 endpoint.
+	netProtos := []tcpip.NetworkProtocolNumber{netProto}
+	if netProto == header.IPv6ProtocolNumber && !e.v6only {
+		netProtos = []tcpip.NetworkProtocolNumber{
+			header.IPv4ProtocolNumber,
+			header.IPv6ProtocolNumber,
+		}
+	}
+
+	oldPortFlags := e.boundPortFlags
+
+	id, btd, err := e.registerWithStack(nicID, netProtos, id)
+	if err != nil {
+		return err
+	}
+
+	// Remove the old registration.
+	if e.ID.LocalPort != 0 {
+		e.stack.UnregisterTransportEndpoint(e.RegisterNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, oldPortFlags, e.boundBindToDevice)
+	}
+
+	e.ID = id
+	e.boundBindToDevice = btd
+	e.route = r.Clone() // NOTE(review): on reconnect the previous e.route is overwritten without Release — confirm intended.
+	e.dstPort = addr.Port
+	e.RegisterNICID = nicID
+	e.effectiveNetProtos = netProtos
+
+	e.state = StateConnected
+
+	e.rcvMu.Lock()
+	e.rcvReady = true
+	e.rcvMu.Unlock()
+
+	return nil
+}
+
+// ConnectEndpoint is not supported by UDP; it always returns tcpip.ErrInvalidEndpointState.
+func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
+	return tcpip.ErrInvalidEndpointState
+}
+
+// Shutdown records the requested shutdown flags on a bound or connected
+// endpoint and, for ShutdownRead, closes the receive side.
+func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	// Bound sockets are accepted too: they can still receive multicast
+	// messages, so their waiters must be woken on shutdown as well.
+	switch e.state {
+	case StateBound, StateConnected:
+	default:
+		return tcpip.ErrNotConnected
+	}
+
+	e.shutdownFlags |= flags
+	if flags&tcpip.ShutdownRead == 0 {
+		return nil
+	}
+	e.rcvMu.Lock()
+	alreadyClosed := e.rcvClosed
+	e.rcvClosed = true
+	e.rcvMu.Unlock()
+	if !alreadyClosed {
+		e.waiterQueue.Notify(waiter.EventIn)
+	}
+	return nil
+}
+
+// Listen is not supported by UDP; it always returns tcpip.ErrNotSupported.
+func (*endpoint) Listen(int) *tcpip.Error {
+	return tcpip.ErrNotSupported
+}
+
+// Accept is not supported by UDP; it always returns nil results and tcpip.ErrNotSupported.
+func (*endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
+	return nil, nil, tcpip.ErrNotSupported
+}
+
+func (e *endpoint) registerWithStack(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, id stack.TransportEndpointID) (stack.TransportEndpointID, tcpip.NICID, *tcpip.Error) {
+	if e.ID.LocalPort == 0 { // the endpoint records no local port: reserve id.LocalPort (or let the stack pick one)
+		port, err := e.stack.ReservePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.portFlags, e.bindToDevice, tcpip.FullAddress{})
+		if err != nil {
+			return id, e.bindToDevice, err
+		}
+		id.LocalPort = port
+	}
+	e.boundPortFlags = e.portFlags // remember the flags used for this registration
+
+	err := e.stack.RegisterTransportEndpoint(nicID, netProtos, ProtocolNumber, id, e, e.boundPortFlags, e.bindToDevice)
+	if err != nil {
+		// NOTE(review): the port is released even when it was not reserved by this call (e.ID.LocalPort != 0) — confirm intended.
+		e.stack.ReleasePort(netProtos, ProtocolNumber, id.LocalAddress, id.LocalPort, e.boundPortFlags, e.bindToDevice, tcpip.FullAddress{})
+		e.boundPortFlags = ports.Flags{}
+	}
+	return id, e.bindToDevice, err
+}
+
+func (e *endpoint) bindLocked(addr tcpip.FullAddress) *tcpip.Error {
+	// bindLocked is called with e.mu held (see Bind). Binding is only
+	// allowed while the endpoint is still in StateInitial.
+	if e.state != StateInitial {
+		return tcpip.ErrInvalidEndpointState
+	}
+
+	addr, netProto, err := e.checkV4MappedLocked(addr)
+	if err != nil {
+		return err
+	}
+
+	// Expand netProtos to include v4 and v6 if the caller is binding to a
+	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
+	// set to false.
+	netProtos := []tcpip.NetworkProtocolNumber{netProto}
+	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
+		netProtos = []tcpip.NetworkProtocolNumber{
+			header.IPv6ProtocolNumber,
+			header.IPv4ProtocolNumber,
+		}
+	}
+
+	nicID := addr.NIC
+	if len(addr.Addr) != 0 && !isBroadcastOrMulticast(addr.Addr) {
+		// A local unicast address was specified, verify that it's valid.
+		nicID = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
+		if nicID == 0 {
+			return tcpip.ErrBadLocalAddress
+		}
+	}
+
+	id := stack.TransportEndpointID{
+		LocalPort:    addr.Port,
+		LocalAddress: addr.Addr,
+	}
+	id, btd, err := e.registerWithStack(nicID, netProtos, id)
+	if err != nil {
+		return err
+	}
+
+	e.ID = id
+	e.boundBindToDevice = btd
+	e.RegisterNICID = nicID
+	e.effectiveNetProtos = netProtos
+
+	// Mark endpoint as bound.
+	e.state = StateBound
+
+	e.rcvMu.Lock()
+	e.rcvReady = true // HandlePacket drops packets until rcvReady is set
+	e.rcvMu.Unlock()
+
+	return nil
+}
+
+// Bind attaches the endpoint to the local address and port in addr while
+// holding e.mu. Specifying a NIC is optional.
+func (e *endpoint) Bind(addr tcpip.FullAddress) *tcpip.Error {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	if err := e.bindLocked(addr); err != nil {
+		return err
+	}
+
+	// bindLocked stored the NIC it settled on in RegisterNICID; keep it as
+	// the NIC this endpoint is bound to.
+	e.BindNICID = e.RegisterNICID
+
+	return nil
+}
+
+// GetLocalAddress reports the endpoint's registered NIC, local address and port; when connected, the route's local address is used.
+func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+
+	local := tcpip.FullAddress{
+		NIC:  e.RegisterNICID,
+		Addr: e.ID.LocalAddress,
+		Port: e.ID.LocalPort,
+	}
+	if e.state == StateConnected {
+		local.Addr = e.route.LocalAddress
+	}
+
+	return local, nil
+}
+
+// GetRemoteAddress reports the peer's address and port, or ErrNotConnected when the endpoint is not connected.
+func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+
+	var remote tcpip.FullAddress
+	if e.state != StateConnected {
+		return remote, tcpip.ErrNotConnected
+	}
+
+	remote.NIC = e.RegisterNICID
+	remote.Addr = e.ID.RemoteAddress
+	remote.Port = e.ID.RemotePort
+	return remote, nil
+}
+
+// Readiness reports which of the events in mask are currently ready. For
+// example, waiter.EventIn in the result means a read will not block.
+func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
+	// The endpoint is always writable, so EventOut is granted if requested.
+	ready := mask & waiter.EventOut
+	if mask&waiter.EventIn == 0 {
+		return ready
+	}
+	// Readable when a datagram is queued or the receive side is closed.
+	e.rcvMu.Lock()
+	readable := !e.rcvList.Empty() || e.rcvClosed
+	e.rcvMu.Unlock()
+	if readable {
+		ready |= waiter.EventIn
+	}
+	return ready
+}
+
+// HandlePacket is called by the stack when a packet arrives for this endpoint.
+// It validates the packet and, if accepted, queues it on rcvList and wakes readers.
+func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) {
+	// Read the UDP header and reject packets whose length field exceeds the data received.
+	hdr := header.UDP(pkt.TransportHeader)
+	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
+		// Malformed packet.
+		e.stack.Stats().UDP.MalformedPacketsReceived.Increment()
+		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
+		return
+	}
+
+	// Verify checksum unless RX checksum offload is enabled.
+	// On IPv4, UDP checksum is optional, and a zero value means
+	// the transmitter omitted the checksum generation (RFC768).
+	// On IPv6, UDP checksum is not optional (RFC2460 Section 8.1).
+	if r.Capabilities()&stack.CapabilityRXChecksumOffload == 0 &&
+		(hdr.Checksum() != 0 || r.NetProto == header.IPv6ProtocolNumber) {
+		xsum := r.PseudoHeaderChecksum(ProtocolNumber, hdr.Length())
+		for _, v := range pkt.Data.Views() {
+			xsum = header.Checksum(v, xsum)
+		}
+		if hdr.CalculateChecksum(xsum) != 0xffff {
+			// Checksum Error.
+			e.stack.Stats().UDP.ChecksumErrors.Increment()
+			e.stats.ReceiveErrors.ChecksumErrors.Increment()
+			return
+		}
+	}
+
+	e.rcvMu.Lock()
+	e.stack.Stats().UDP.PacketsReceived.Increment()
+	e.stats.PacketsReceived.Increment()
+
+	// Drop the packet if the endpoint is not ready to receive yet or its read side was shut down.
+	if !e.rcvReady || e.rcvClosed {
+		e.rcvMu.Unlock()
+		e.stack.Stats().UDP.ReceiveBufferErrors.Increment()
+		e.stats.ReceiveErrors.ClosedReceiver.Increment()
+		return
+	}
+
+	if e.rcvBufSize >= e.rcvBufSizeMax { // receive buffer is full: drop
+		e.rcvMu.Unlock()
+		e.stack.Stats().UDP.ReceiveBufferErrors.Increment()
+		e.stats.ReceiveErrors.ReceiveBufferOverflow.Increment()
+		return
+	}
+
+	wasEmpty := e.rcvBufSize == 0
+
+	// Push new packet into receive list and increment the buffer size.
+	packet := &udpPacket{
+		senderAddress: tcpip.FullAddress{
+			NIC:  r.NICID(),
+			Addr: id.RemoteAddress,
+			Port: hdr.SourcePort(),
+		},
+	}
+	packet.data = pkt.Data
+	e.rcvList.PushBack(packet)
+	e.rcvBufSize += pkt.Data.Size()
+
+	// Save any useful information from the network header to the packet.
+	switch r.NetProto {
+	case header.IPv4ProtocolNumber:
+		packet.tos, _ = header.IPv4(pkt.NetworkHeader).TOS()
+		packet.packetInfo.LocalAddr = r.LocalAddress
+		packet.packetInfo.DestinationAddr = r.RemoteAddress
+		packet.packetInfo.NIC = r.NICID()
+	case header.IPv6ProtocolNumber:
+		packet.tos, _ = header.IPv6(pkt.NetworkHeader).TOS()
+	}
+
+	packet.timestamp = e.stack.NowNanoseconds()
+
+	e.rcvMu.Unlock()
+
+	// Notify any waiters that there's data to be read now.
+	if wasEmpty {
+		e.waiterQueue.Notify(waiter.EventIn)
+	}
+}
+
+// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. A port-unreachable on a connected endpoint is recorded as ErrConnectionRefused.
+func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, pkt *stack.PacketBuffer) {
+	if typ != stack.ControlPortUnreachable {
+		return
+	}
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	if e.state != StateConnected {
+		return
+	}
+	e.lastErrorMu.Lock()
+	e.lastError = tcpip.ErrConnectionRefused
+	e.lastErrorMu.Unlock()
+}
+
+// State implements tcpip.Endpoint.State. It returns e.state as a uint32, read under e.mu's read lock.
+func (e *endpoint) State() uint32 {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	return uint32(e.state)
+}
+
+// Info returns a pointer to a private copy of the endpoint's TransportEndpointInfo.
+func (e *endpoint) Info() tcpip.EndpointInfo {
+	e.mu.RLock()
+	defer e.mu.RUnlock()
+	// Copy under the read lock so the caller never aliases the live struct.
+	snapshot := e.TransportEndpointInfo
+	return &snapshot
+}
+
+// Stats returns a pointer to the endpoint's live stats (e.stats); no copy is made and no lock is taken here.
+func (e *endpoint) Stats() tcpip.EndpointStats {
+	return &e.stats
+}
+
+// Wait implements tcpip.Endpoint.Wait. It is a no-op for UDP endpoints.
+func (*endpoint) Wait() {}
+
+func isBroadcastOrMulticast(a tcpip.Address) bool {
+	return header.IsV4MulticastAddress(a) || header.IsV6MulticastAddress(a) || a == header.IPv4Broadcast // v4/v6 multicast or the IPv4 broadcast address
+}
+
+func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
+	e.owner = owner // stored without taking e.mu; the write path passes e.owner to sendUDP as the packet owner
+}
diff --git a/pkg/tcpip/transport/udp/endpoint_state.go b/pkg/tcpip/transport/udp/endpoint_state.go
new file mode 100644
index 000000000..851e6b635
--- /dev/null
+++ b/pkg/tcpip/transport/udp/endpoint_state.go
@@ -0,0 +1,137 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// saveData saves udpPacket.data field by returning a clone of it.
+func (u *udpPacket) saveData() buffer.VectorisedView {
+	// We cannot save u.data directly as u.data.views may alias to u.views,
+	// which is not allowed by state framework (in-struct pointer).
+	return u.data.Clone(nil)
+}
+
+// loadData loads udpPacket.data field by storing the restored view as is.
+func (u *udpPacket) loadData(data buffer.VectorisedView) {
+	// NOTE: We cannot do the u.data = data.Clone(u.views[:]) optimization
+	// here because data.views is not guaranteed to be loaded by now. Plus,
+	// data.views will be allocated anyway so there really is little point
+	// of utilizing u.views for data.views.
+	u.data = data
+}
+
+// saveLastError is invoked by stateify. It encodes e.lastError as its string form, or "" when no error is set.
+func (e *endpoint) saveLastError() string {
+	if lastErr := e.lastError; lastErr != nil {
+		return lastErr.String()
+	}
+
+	return ""
+}
+
+// loadLastError is invoked by stateify. It restores e.lastError from the
+// string produced by saveLastError.
+func (e *endpoint) loadLastError(s string) {
+	// An empty string means no error was saved; leave e.lastError untouched.
+	if s != "" {
+		e.lastError = tcpip.StringToError(s)
+	}
+}
+
+// beforeSave is invoked by stateify. It acquires rcvMu and returns with it still held.
+func (e *endpoint) beforeSave() {
+	// Stop incoming packets from being handled (and mutate endpoint state).
+	// The lock will be released after saveRcvBufSizeMax(), which would have
+	// saved e.rcvBufSizeMax and set it to 0 to continue blocking incoming
+	// packets.
+	e.rcvMu.Lock()
+}
+
+// saveRcvBufSizeMax is invoked by stateify. It returns the current limit, zeroes it, and releases rcvMu taken in beforeSave.
+func (e *endpoint) saveRcvBufSizeMax() int {
+	max := e.rcvBufSizeMax
+	// Make sure no new packets will be handled regardless of the lock.
+	e.rcvBufSizeMax = 0
+	// Release the lock acquired in beforeSave() so regular endpoint closing
+	// logic can proceed after save.
+	e.rcvMu.Unlock()
+	return max
+}
+
+// loadRcvBufSizeMax is invoked by stateify. It restores the receive buffer limit saved by saveRcvBufSizeMax.
+func (e *endpoint) loadRcvBufSizeMax(max int) {
+	e.rcvBufSizeMax = max
+}
+
+// afterLoad is invoked by stateify. It registers e with stack.StackFromEnv as a restored endpoint.
+func (e *endpoint) afterLoad() {
+	stack.StackFromEnv.RegisterRestoredEndpoint(e)
+}
+
+// Resume implements tcpip.ResumableEndpoint.Resume. It attaches e to s, rejoins multicast groups and re-registers the endpoint; any failure panics.
+func (e *endpoint) Resume(s *stack.Stack) {
+	e.mu.Lock()
+	defer e.mu.Unlock()
+
+	e.stack = s
+
+	for _, m := range e.multicastMemberships {
+		if err := e.stack.JoinGroup(e.NetProto, m.nicID, m.multicastAddr); err != nil {
+			panic(err)
+		}
+	}
+
+	if e.state != StateBound && e.state != StateConnected {
+		return
+	}
+
+	netProto := e.effectiveNetProtos[0]
+	// Connect() and bindLocked() both assert
+	//
+	//     netProto == header.IPv6ProtocolNumber
+	//
+	// before creating a multi-entry effectiveNetProtos.
+	if len(e.effectiveNetProtos) > 1 {
+		netProto = header.IPv6ProtocolNumber
+	}
+
+	var err *tcpip.Error
+	if e.state == StateConnected {
+		e.route, err = e.stack.FindRoute(e.RegisterNICID, e.ID.LocalAddress, e.ID.RemoteAddress, netProto, e.multicastLoop)
+		if err != nil {
+			panic(err)
+		}
+	} else if len(e.ID.LocalAddress) != 0 && !isBroadcastOrMulticast(e.ID.LocalAddress) { // StateBound
+		// A local unicast address is specified, verify that it's valid.
+		if e.stack.CheckLocalAddress(e.RegisterNICID, netProto, e.ID.LocalAddress) == 0 {
+			panic(tcpip.ErrBadLocalAddress)
+		}
+	}
+
+	// Our saved state had a port, but we don't actually have a
+	// reservation. We need to remove the port from our state, but still
+	// pass it to the reservation machinery.
+	id := e.ID
+	e.ID.LocalPort = 0
+	e.ID, e.boundBindToDevice, err = e.registerWithStack(e.RegisterNICID, e.effectiveNetProtos, id)
+	if err != nil {
+		panic(err)
+	}
+}
diff --git a/pkg/tcpip/transport/udp/forwarder.go b/pkg/tcpip/transport/udp/forwarder.go
new file mode 100644
index 000000000..c67e0ba95
--- /dev/null
+++ b/pkg/tcpip/transport/udp/forwarder.go
@@ -0,0 +1,96 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Forwarder is a session request forwarder, which allows clients to decide
+// what to do with a session request, for example: ignore it, or process it.
+//
+// The canonical way of using it is to pass the Forwarder.HandlePacket function
+// to stack.SetTransportProtocolHandler.
+type Forwarder struct {
+	handler func(*ForwarderRequest) // invoked by HandlePacket for every packet
+
+	stack *stack.Stack // copied into each ForwarderRequest
+}
+
+// NewForwarder allocates a Forwarder that hands every request to handler and
+// attaches stack s to the requests it creates.
+func NewForwarder(s *stack.Stack, handler func(*ForwarderRequest)) *Forwarder {
+	f := new(Forwarder)
+	f.stack, f.handler = s, handler
+	return f
+}
+
+// HandlePacket wraps every packet in a ForwarderRequest, passes it to the
+// configured handler and always returns true.
+//
+// It is meant to be given to stack.SetTransportProtocolHandler.
+func (f *Forwarder) HandlePacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+	req := ForwarderRequest{
+		stack: f.stack,
+		route: r,
+		id:    id,
+		pkt:   pkt,
+	}
+	f.handler(&req)
+	return true
+}
+
+// ForwarderRequest represents a session request received by the forwarder and
+// passed to the client. Clients may optionally create an endpoint to represent
+// it via CreateEndpoint.
+type ForwarderRequest struct {
+	stack *stack.Stack
+	route *stack.Route              // route the packet arrived on, as passed to Forwarder.HandlePacket
+	id    stack.TransportEndpointID // returned by ID
+	pkt   *stack.PacketBuffer       // the packet itself; CreateEndpoint delivers it to the new endpoint
+}
+
+// ID returns the 4-tuple (src address, src port, dst address, dst port) that
+// represents the session request, exactly as it was given to the forwarder.
+func (r *ForwarderRequest) ID() stack.TransportEndpointID {
+	return r.id
+}
+
+// CreateEndpoint creates a connected UDP endpoint for the session request and delivers the triggering packet to it.
+func (r *ForwarderRequest) CreateEndpoint(queue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	netProtos := []tcpip.NetworkProtocolNumber{r.route.NetProto}
+	ep := newEndpoint(r.stack, r.route.NetProto, queue)
+	if err := r.stack.RegisterTransportEndpoint(r.route.NICID(), netProtos, ProtocolNumber, r.id, ep, ep.portFlags, ep.bindToDevice); err != nil {
+		ep.Close()
+		return nil, err
+	}
+
+	ep.ID = r.id
+	ep.route = r.route.Clone()
+	ep.dstPort = r.id.RemotePort
+	ep.RegisterNICID = r.route.NICID()
+	// Disconnect and Resume use effectiveNetProtos to unregister/re-register this endpoint.
+	ep.effectiveNetProtos = netProtos
+	ep.boundPortFlags = ep.portFlags
+	ep.state = StateConnected
+
+	ep.rcvMu.Lock()
+	ep.rcvReady = true
+	ep.rcvMu.Unlock()
+	ep.HandlePacket(r.route, r.id, r.pkt)
+	return ep, nil
+}
diff --git a/pkg/tcpip/transport/udp/protocol.go b/pkg/tcpip/transport/udp/protocol.go
new file mode 100644
index 000000000..0e7464e3a
--- /dev/null
+++ b/pkg/tcpip/transport/udp/protocol.go
@@ -0,0 +1,231 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package udp contains the implementation of the UDP transport protocol. To use
+// it in the networking stack, this package must be added to the project, and
+// activated on the stack by passing udp.NewProtocol() as one of the
+// transport protocols when calling stack.New(). Then endpoints can be created
+// by passing udp.ProtocolNumber as the transport protocol number when calling
+// Stack.NewEndpoint().
+package udp
+
+import (
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// ProtocolNumber is the UDP protocol number.
+	ProtocolNumber = header.UDPProtocolNumber
+
+	// MinBufferSize is the smallest size of a receive or send buffer.
+	MinBufferSize = 4 << 10 // 4KiB
+
+	// DefaultSendBufferSize is the default size of the send buffer for
+	// an endpoint.
+	DefaultSendBufferSize = 32 << 10 // 32KiB
+
+	// DefaultReceiveBufferSize is the default size of the receive buffer
+	// for an endpoint.
+	DefaultReceiveBufferSize = 32 << 10 // 32KiB
+
+	// MaxBufferSize is the largest size a receive/send buffer can grow to.
+	MaxBufferSize = 4 << 20 // 4MiB
+)
+
+type protocol struct { // protocol has no fields; NewProtocol returns it as a stack.TransportProtocol
+}
+
+// Number returns the UDP protocol number, ProtocolNumber.
+func (*protocol) Number() tcpip.TransportProtocolNumber {
+	return ProtocolNumber
+}
+
+// NewEndpoint creates a new UDP endpoint via newEndpoint; the returned error is always nil.
+func (*protocol) NewEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return newEndpoint(stack, netProto, waiterQueue), nil
+}
+
+// NewRawEndpoint creates a new raw UDP endpoint by delegating to
+// raw.NewEndpoint. It implements stack.TransportProtocol.NewRawEndpoint.
+func (p *protocol) NewRawEndpoint(stack *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	return raw.NewEndpoint(stack, netProto, header.UDPProtocolNumber, waiterQueue)
+}
+
+// MinimumPacketSize returns the minimum valid UDP packet size, header.UDPMinimumSize.
+func (*protocol) MinimumPacketSize() int {
+	return header.UDPMinimumSize
+}
+
+// ParsePorts interprets v as a UDP header and returns its source and
+// destination ports. The returned error is always nil.
+func (*protocol) ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error) {
+	udpHdr := header.UDP(v)
+	return udpHdr.SourcePort(), udpHdr.DestinationPort(), nil
+}
+
+// HandleUnknownDestinationPacket handles packets targeted at this protocol but
+// that don't match any existing endpoint. It always returns true.
+func (p *protocol) HandleUnknownDestinationPacket(r *stack.Route, id stack.TransportEndpointID, pkt *stack.PacketBuffer) bool {
+	hdr := header.UDP(pkt.TransportHeader)
+	if int(hdr.Length()) > pkt.Data.Size()+header.UDPMinimumSize {
+		// Malformed packet.
+		r.Stack().Stats().UDP.MalformedPacketsReceived.Increment()
+		return true
+	}
+	// TODO(b/129426613): only send an ICMP message if UDP checksum is valid.
+
+	// Only send an ICMP error if the destination is not a multicast/broadcast
+	// v4/v6 address and the source is not the unspecified address.
+	//
+	// See: point e) in https://tools.ietf.org/html/rfc4443#section-2.4
+	if id.LocalAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(id.LocalAddress) || header.IsV6MulticastAddress(id.LocalAddress) || id.RemoteAddress == header.IPv6Any || id.RemoteAddress == header.IPv4Any {
+		return true
+	}
+
+	// As per RFC: 1122 Section 3.2.2.1 A host SHOULD generate Destination
+	//   Unreachable messages with code:
+	//
+	//     2 (Protocol Unreachable), when the designated transport protocol
+	//     is not supported; or
+	//
+	//     3 (Port Unreachable), when the designated transport protocol
+	//     (e.g., UDP) is unable to demultiplex the datagram but has no
+	//     protocol mechanism to inform the sender.
+	switch len(id.LocalAddress) { // the address length selects ICMPv4 or ICMPv6
+	case header.IPv4AddressSize:
+		if !r.Stack().AllowICMPMessage() {
+			r.Stack().Stats().ICMP.V4PacketsSent.RateLimited.Increment()
+			return true
+		}
+		// As per RFC 1812 Section 4.3.2.3
+		//
+		//   ICMP datagram SHOULD contain as much of the original
+		//   datagram as possible without the length of the ICMP
+		//   datagram exceeding 576 bytes
+		//
+		// NOTE: The above RFC referenced is different from the original
+		// recommendation in RFC 1122 where it mentioned that at least 8
+		// bytes of the payload must be included. Today linux and other
+		// systems implement the RFC 1812 definition and not the original
+		// RFC 1122 requirement.
+		mtu := int(r.MTU())
+		if mtu > header.IPv4MinimumProcessableDatagramSize {
+			mtu = header.IPv4MinimumProcessableDatagramSize
+		}
+		headerLen := int(r.MaxHeaderLength()) + header.ICMPv4MinimumSize
+		available := int(mtu) - headerLen
+		payloadLen := len(pkt.NetworkHeader) + len(pkt.TransportHeader) + pkt.Data.Size()
+		if payloadLen > available {
+			payloadLen = available
+		}
+
+		// The buffers used by pkt may be used elsewhere in the system.
+		// For example, a raw or packet socket may use what UDP
+		// considers an unreachable destination. Thus we deep copy pkt
+		// to prevent multiple ownership and SR errors.
+		newHeader := append(buffer.View(nil), pkt.NetworkHeader...)
+		newHeader = append(newHeader, pkt.TransportHeader...)
+		payload := newHeader.ToVectorisedView()
+		payload.AppendView(pkt.Data.ToView())
+		payload.CapLength(payloadLen)
+
+		hdr := buffer.NewPrependable(headerLen)
+		pkt := header.ICMPv4(hdr.Prepend(header.ICMPv4MinimumSize)) // from here on pkt is the ICMPv4 header, not the received packet
+		pkt.SetType(header.ICMPv4DstUnreachable)
+		pkt.SetCode(header.ICMPv4PortUnreachable)
+		pkt.SetChecksum(header.ICMPv4Checksum(pkt, payload))
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv4ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header:          hdr,
+			TransportHeader: buffer.View(pkt),
+			Data:            payload,
+		})
+
+	case header.IPv6AddressSize:
+		if !r.Stack().AllowICMPMessage() {
+			r.Stack().Stats().ICMP.V6PacketsSent.RateLimited.Increment()
+			return true
+		}
+
+		// As per RFC 4443 section 2.4
+		//
+		//    (c) Every ICMPv6 error message (type < 128) MUST include
+		//    as much of the IPv6 offending (invoking) packet (the
+		//    packet that caused the error) as possible without making
+		//    the error message packet exceed the minimum IPv6 MTU
+		//    [IPv6].
+		mtu := int(r.MTU())
+		if mtu > header.IPv6MinimumMTU {
+			mtu = header.IPv6MinimumMTU
+		}
+		headerLen := int(r.MaxHeaderLength()) + header.ICMPv6DstUnreachableMinimumSize
+		available := int(mtu) - headerLen
+		payloadLen := len(pkt.NetworkHeader) + len(pkt.TransportHeader) + pkt.Data.Size()
+		if payloadLen > available {
+			payloadLen = available
+		}
+		payload := buffer.NewVectorisedView(len(pkt.NetworkHeader)+len(pkt.TransportHeader), []buffer.View{pkt.NetworkHeader, pkt.TransportHeader})
+		payload.Append(pkt.Data)
+		payload.CapLength(payloadLen)
+
+		hdr := buffer.NewPrependable(headerLen)
+		pkt := header.ICMPv6(hdr.Prepend(header.ICMPv6DstUnreachableMinimumSize)) // from here on pkt is the ICMPv6 header, not the received packet
+		pkt.SetType(header.ICMPv6DstUnreachable)
+		pkt.SetCode(header.ICMPv6PortUnreachable)
+		pkt.SetChecksum(header.ICMPv6Checksum(pkt, r.LocalAddress, r.RemoteAddress, payload))
+		r.WritePacket(nil /* gso */, stack.NetworkHeaderParams{Protocol: header.ICMPv6ProtocolNumber, TTL: r.DefaultTTL(), TOS: stack.DefaultTOS}, &stack.PacketBuffer{
+			Header:          hdr,
+			TransportHeader: buffer.View(pkt),
+			Data:            payload,
+		})
+	}
+	return true
+}
+
+// SetOption implements stack.TransportProtocol.SetOption; every option is rejected with ErrUnknownProtocolOption.
+func (p *protocol) SetOption(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// Option implements stack.TransportProtocol.Option; every option is rejected with ErrUnknownProtocolOption.
+func (p *protocol) Option(option interface{}) *tcpip.Error {
+	return tcpip.ErrUnknownProtocolOption
+}
+
+// Close implements stack.TransportProtocol.Close. It is a no-op.
+func (*protocol) Close() {}
+
+// Wait implements stack.TransportProtocol.Wait. It is a no-op.
+func (*protocol) Wait() {}
+
+// Parse implements stack.TransportProtocol.Parse: it moves the UDP header from pkt.Data into pkt.TransportHeader.
+func (*protocol) Parse(pkt *stack.PacketBuffer) bool {
+	udpHdr, ok := pkt.Data.PullUp(header.UDPMinimumSize)
+	if ok {
+		pkt.TransportHeader = udpHdr
+		pkt.Data.TrimFront(header.UDPMinimumSize)
+		return true
+	}
+	// PullUp could not provide header.UDPMinimumSize bytes: packet is too small.
+	return false
+}
+
+// NewProtocol returns a freshly allocated UDP transport protocol.
+func NewProtocol() stack.TransportProtocol {
+	return new(protocol)
+}
diff --git a/pkg/tcpip/transport/udp/udp_test.go b/pkg/tcpip/transport/udp/udp_test.go
new file mode 100644
index 000000000..91ba031fa
--- /dev/null
+++ b/pkg/tcpip/transport/udp/udp_test.go
@@ -0,0 +1,2072 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package udp_test
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"math/rand"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/tcpip/buffer"
+	"gvisor.dev/gvisor/pkg/tcpip/checker"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/tcpip/link/channel"
+	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
+	"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
+	"gvisor.dev/gvisor/pkg/tcpip/stack"
+	"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Addresses and ports used for testing. It is recommended that tests stick to
+// using these addresses as it allows using the testFlow helper.
+// Naming rules: 'stack*' denotes local addresses and ports, while 'test*'
+// represents the remote endpoint.
+const (
+	v4MappedAddrPrefix    = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\xff"
+	stackV6Addr           = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01"
+	testV6Addr            = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02"
+	stackV4MappedAddr     = v4MappedAddrPrefix + stackAddr
+	testV4MappedAddr      = v4MappedAddrPrefix + testAddr
+	multicastV4MappedAddr = v4MappedAddrPrefix + multicastAddr
+	broadcastV4MappedAddr = v4MappedAddrPrefix + broadcastAddr
+	v4MappedWildcardAddr  = v4MappedAddrPrefix + "\x00\x00\x00\x00"
+
+	stackAddr       = "\x0a\x00\x00\x01" // 10.0.0.1
+	stackPort       = 1234
+	testAddr        = "\x0a\x00\x00\x02" // 10.0.0.2
+	testPort        = 4096
+	multicastAddr   = "\xe8\x2b\xd3\xea" // 232.43.211.234
+	multicastV6Addr = "\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
+	broadcastAddr   = header.IPv4Broadcast
+	testTOS         = 0x80
+
+	// defaultMTU is the MTU, in bytes, used throughout the tests, except
+	// where another value is explicitly used. It is chosen to match the MTU
+	// of loopback interfaces on linux systems.
+	defaultMTU = 65536
+)
+
+// header4Tuple stores the 4-tuple {src-IP, src-port, dst-IP, dst-port} used in
+// a packet header. These values are used to populate a header or verify one.
+// Note that because they are used in packet headers, the addresses are never in
+// a V4-mapped format.
+type header4Tuple struct {
+	srcAddr tcpip.FullAddress // source address and port
+	dstAddr tcpip.FullAddress // destination address and port
+}
+
+// testFlow implements a helper type used for sending and receiving test
+// packets. A given test flow value defines 1) the socket endpoint used for the
+// test and 2) the type of packet sent or received on the endpoint. E.g., a
+// multicastV6Only flow is a V6 multicast packet passing through a V6-only
+// endpoint. The type provides helper methods to characterize the flow (e.g.,
+// isV4) as well as return a proper header4Tuple for it.
+type testFlow int
+
+const (
+	unicastV4       testFlow = iota // V4 unicast on a V4 socket
+	unicastV4in6                    // V4-mapped unicast on a V6-dual socket
+	unicastV6                       // V6 unicast on a V6 socket
+	unicastV6Only                   // V6 unicast on a V6-only socket
+	multicastV4                     // V4 multicast on a V4 socket
+	multicastV4in6                  // V4-mapped multicast on a V6-dual socket
+	multicastV6                     // V6 multicast on a V6 socket
+	multicastV6Only                 // V6 multicast on a V6-only socket
+	broadcast                       // V4 broadcast on a V4 socket
+	broadcastIn6                    // V4-mapped broadcast on a V6-dual socket
+)
+
+func (flow testFlow) String() string { // name of the flow constant, or "unknown"
+	switch flow {
+	case unicastV4:
+		return "unicastV4"
+	case unicastV4in6:
+		return "unicastV4in6"
+	case unicastV6:
+		return "unicastV6"
+	case unicastV6Only:
+		return "unicastV6Only"
+	case multicastV4:
+		return "multicastV4"
+	case multicastV4in6:
+		return "multicastV4in6"
+	case multicastV6:
+		return "multicastV6"
+	case multicastV6Only:
+		return "multicastV6Only"
+	case broadcast:
+		return "broadcast"
+	case broadcastIn6:
+		return "broadcastIn6"
+	default:
+		return "unknown"
+	}
+}
+
+// packetDirection tells whether a flow is incoming (read) or outgoing (write).
+type packetDirection int
+
+const (
+	incoming packetDirection = iota // packet is read by the endpoint
+	outgoing                        // packet is written by the endpoint
+)
+
+// header4Tuple returns the header4Tuple for the given flow and direction. Note
+// that the tuple contains no mapped addresses as those only exist at the socket
+// level but not at the packet header level.
+func (flow testFlow) header4Tuple(d packetDirection) header4Tuple {
+	var h header4Tuple
+	if flow.isV4() {
+		if d == outgoing {
+			h = header4Tuple{
+				srcAddr: tcpip.FullAddress{Addr: stackAddr, Port: stackPort},
+				dstAddr: tcpip.FullAddress{Addr: testAddr, Port: testPort},
+			}
+		} else {
+			h = header4Tuple{
+				srcAddr: tcpip.FullAddress{Addr: testAddr, Port: testPort},
+				dstAddr: tcpip.FullAddress{Addr: stackAddr, Port: stackPort},
+			}
+		}
+		if flow.isMulticast() {
+			h.dstAddr.Addr = multicastAddr
+		} else if flow.isBroadcast() {
+			h.dstAddr.Addr = broadcastAddr
+		}
+	} else { // IPv6
+		if d == outgoing {
+			h = header4Tuple{
+				srcAddr: tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort},
+				dstAddr: tcpip.FullAddress{Addr: testV6Addr, Port: testPort},
+			}
+		} else {
+			h = header4Tuple{
+				srcAddr: tcpip.FullAddress{Addr: testV6Addr, Port: testPort},
+				dstAddr: tcpip.FullAddress{Addr: stackV6Addr, Port: stackPort},
+			}
+		}
+		if flow.isMulticast() {
+			h.dstAddr.Addr = multicastV6Addr
+		}
+	}
+	return h
+}
+
+func (flow testFlow) getMcastAddr() tcpip.Address {
+	if !flow.isV4() {
+		return multicastV6Addr // V6 flows use the V6 multicast address
+	}
+	return multicastAddr // V4 and V4-mapped flows use the V4 multicast address
+}
+
+// mapAddrIfApplicable returns v4Addr prefixed with v4MappedAddrPrefix when the
+// flow is a mapped one, and v4Addr unchanged otherwise.
+func (flow testFlow) mapAddrIfApplicable(v4Addr tcpip.Address) tcpip.Address {
+	if !flow.isMapped() {
+		return v4Addr
+	}
+	return v4MappedAddrPrefix + v4Addr
+}
+
+// netProto returns the network protocol number of the flow's packets.
+func (flow testFlow) netProto() tcpip.NetworkProtocolNumber {
+	if flow.isV6() {
+		return ipv6.ProtocolNumber
+	}
+	return ipv4.ProtocolNumber
+}
+
+// sockProto returns the network protocol the socket endpoint for this flow is
+// created with. It panics on a value that is not a declared testFlow.
+func (flow testFlow) sockProto() tcpip.NetworkProtocolNumber {
+	switch flow {
+	case unicastV4, multicastV4, broadcast:
+		return ipv4.ProtocolNumber
+	case unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, multicastV6Only, broadcastIn6:
+		return ipv6.ProtocolNumber
+	default:
+		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
+	}
+}
+
+func (flow testFlow) checkerFn() func(*testing.T, []byte, ...checker.NetworkChecker) {
+	if flow.isV6() {
+		return checker.IPv6 // V6 packets are verified with the IPv6 checker
+	}
+	return checker.IPv4 // V4 and V4-mapped flows are verified with the IPv4 checker
+}
+
+func (flow testFlow) isV6() bool { return !flow.isV4() } // isV6 is the negation of isV4
+func (flow testFlow) isV4() bool {
+	return flow.sockProto() == ipv4.ProtocolNumber || flow.isMapped() // a V4 socket, or a V4-mapped flow on a V6 socket
+}
+
+func (flow testFlow) isV6Only() bool { // reports whether the flow uses a V6-only socket
+	switch flow {
+	case unicastV6Only, multicastV6Only:
+		return true
+	case unicastV4, unicastV4in6, unicastV6, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6:
+		return false
+	default: // not one of the declared testFlow constants
+		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
+	}
+}
+
+func (flow testFlow) isMulticast() bool { // reports whether the flow is one of the multicast flows
+	switch flow {
+	case multicastV4, multicastV4in6, multicastV6, multicastV6Only:
+		return true
+	case unicastV4, unicastV4in6, unicastV6, unicastV6Only, broadcast, broadcastIn6:
+		return false
+	default: // not one of the declared testFlow constants
+		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
+	}
+}
+
+func (flow testFlow) isBroadcast() bool { // reports whether the flow is one of the broadcast flows
+	switch flow {
+	case broadcast, broadcastIn6:
+		return true
+	case unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, multicastV6Only:
+		return false
+	default: // not one of the declared testFlow constants
+		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
+	}
+}
+
+func (flow testFlow) isMapped() bool { // reports whether the flow is one of the V4-mapped ("in6") flows
+	switch flow {
+	case unicastV4in6, multicastV4in6, broadcastIn6:
+		return true
+	case unicastV4, unicastV6, unicastV6Only, multicastV4, multicastV6, multicastV6Only, broadcast:
+		return false
+	default: // not one of the declared testFlow constants
+		panic(fmt.Sprintf("invalid testFlow given: %d", flow))
+	}
+}
+
+type testContext struct {
+	t      *testing.T        // test the context belongs to
+	linkEP *channel.Endpoint // link endpoint packets are injected into and read from
+	s      *stack.Stack      // stack under test
+
+	ep tcpip.Endpoint // endpoint set by createEndpoint; closed by cleanup when non-nil
+	wq waiter.Queue   // waiter queue handed to NewEndpoint for ep
+}
+
+func newDualTestContext(t *testing.T, mtu uint32) *testContext {
+	t.Helper()
+	return newDualTestContextWithOptions(t, mtu, stack.Options{ // IPv4 and IPv6 network protocols, UDP as the only transport
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+	})
+}
+
+func newDualTestContextWithOptions(t *testing.T, mtu uint32, options stack.Options) *testContext {
+	t.Helper()
+	// Build the stack from options and a channel link endpoint with the given MTU.
+	s := stack.New(options)
+	ep := channel.New(256, mtu, "")
+	wep := stack.LinkEndpoint(ep)
+
+	if testing.Verbose() { // wrap the link endpoint in a sniffer for verbose runs
+		wep = sniffer.New(ep)
+	}
+	if err := s.CreateNIC(1, wep); err != nil {
+		t.Fatalf("CreateNIC failed: %s", err)
+	}
+
+	if err := s.AddAddress(1, ipv4.ProtocolNumber, stackAddr); err != nil {
+		t.Fatalf("AddAddress failed: %s", err)
+	}
+
+	if err := s.AddAddress(1, ipv6.ProtocolNumber, stackV6Addr); err != nil {
+		t.Fatalf("AddAddress failed: %s", err)
+	}
+	// Route all IPv4 and IPv6 destinations through NIC 1.
+	s.SetRouteTable([]tcpip.Route{
+		{
+			Destination: header.IPv4EmptySubnet,
+			NIC:         1,
+		},
+		{
+			Destination: header.IPv6EmptySubnet,
+			NIC:         1,
+		},
+	})
+
+	return &testContext{
+		t:      t,
+		s:      s,
+		linkEP: ep, // the raw channel endpoint, not the sniffer wrapper
+	}
+}
+
+func (c *testContext) cleanup() {
+	if c.ep == nil {
+		return // no endpoint to close
+	}
+	c.ep.Close()
+}
+
+func (c *testContext) createEndpoint(proto tcpip.NetworkProtocolNumber) {
+	c.t.Helper()
+	// Create a UDP endpoint for proto on c.wq and store it in c.ep; fail the test on error.
+	var err *tcpip.Error
+	c.ep, err = c.s.NewEndpoint(udp.ProtocolNumber, proto, &c.wq)
+	if err != nil {
+		c.t.Fatal("NewEndpoint failed: ", err)
+	}
+}
+
+func (c *testContext) createEndpointForFlow(flow testFlow) {
+	c.t.Helper()
+	c.createEndpoint(flow.sockProto())
+	switch {
+	case flow.isV6Only(): // V6-only flows turn on tcpip.V6OnlyOption
+		if err := c.ep.SetSockOptBool(tcpip.V6OnlyOption, true); err != nil {
+			c.t.Fatalf("SetSockOptBool failed: %s", err)
+		}
+	case flow.isBroadcast(): // broadcast flows turn on tcpip.BroadcastOption
+		if err := c.ep.SetSockOptBool(tcpip.BroadcastOption, true); err != nil {
+			c.t.Fatalf("SetSockOptBool failed: %s", err)
+		}
+	}
+}
+
+// getPacketAndVerify reads a packet from the link endpoint, waiting up to two
+// seconds, and verifies its addresses and destination port against the outgoing
+// header4Tuple of flow, plus any extra checkers. It returns the packet bytes.
+func (c *testContext) getPacketAndVerify(flow testFlow, checkers ...checker.NetworkChecker) []byte {
+	c.t.Helper()
+
+	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+	defer cancel()
+	p, ok := c.linkEP.ReadContext(ctx)
+	if !ok {
+		c.t.Fatalf("Packet wasn't written out")
+		return nil // not reached: Fatalf stops the test goroutine
+	}
+
+	if p.Proto != flow.netProto() {
+		c.t.Fatalf("Bad network protocol: got %v, wanted %v", p.Proto, flow.netProto())
+	}
+
+	hdr := p.Pkt.Header.View()
+	b := append(hdr[:len(hdr):len(hdr)], p.Pkt.Data.ToView()...) // capacity capped at len(hdr) so append copies instead of writing past hdr
+
+	h := flow.header4Tuple(outgoing)
+	checkers = append(
+		checkers,
+		checker.SrcAddr(h.srcAddr.Addr),
+		checker.DstAddr(h.dstAddr.Addr),
+		checker.UDP(checker.DstPort(h.dstAddr.Port)),
+	)
+	flow.checkerFn()(c.t, b, checkers...)
+	return b
+}
+
+// injectPacket builds a packet for the incoming direction of flow carrying
+// payload and injects it into the link endpoint.
+func (c *testContext) injectPacket(flow testFlow, payload []byte) {
+	c.t.Helper()
+
+	tuple := flow.header4Tuple(incoming)
+	if !flow.isV4() {
+		v6 := c.buildV6Packet(payload, &tuple)
+		c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+			Data: v6.ToVectorisedView(),
+		})
+		return
+	}
+	v4 := c.buildV4Packet(payload, &tuple)
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: v4.ToVectorisedView(),
+	})
+}
+
+// buildV6Packet returns a buffer holding an IPv6 header, a UDP header and
+// payload, with addresses and ports taken from h.
+func (c *testContext) buildV6Packet(payload []byte, h *header4Tuple) buffer.View {
+	// Allocate room for both headers and the payload; the payload goes last.
+	pkt := buffer.NewView(header.UDPMinimumSize + header.IPv6MinimumSize + len(payload))
+	payloadOff := len(pkt) - len(payload)
+	copy(pkt[payloadOff:], payload)
+
+	// Fill in the IPv6 header.
+	ipHdr := header.IPv6(pkt)
+	ipHdr.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
+		PayloadLength: uint16(header.UDPMinimumSize + len(payload)),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       h.srcAddr.Addr,
+		DstAddr:       h.dstAddr.Addr,
+	})
+
+	// Fill in the UDP header, which starts right after the IPv6 header.
+	udpHdr := header.UDP(pkt[header.IPv6MinimumSize:])
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcAddr.Port,
+		DstPort: h.dstAddr.Port,
+		Length:  uint16(header.UDPMinimumSize + len(payload)),
+	})
+
+	// Start the UDP checksum from the pseudo-header.
+	sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, h.srcAddr.Addr, h.dstAddr.Addr, uint16(len(udpHdr)))
+
+	// Add the payload, then store the complemented result in the header.
+	sum = header.Checksum(payload, sum)
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(sum))
+
+	return pkt
+}
+
+// buildV4Packet returns a buffer holding an IPv4 header, a UDP header and
+// payload, with addresses and ports taken from h.
+func (c *testContext) buildV4Packet(payload []byte, h *header4Tuple) buffer.View {
+	// Allocate room for both headers and the payload; the payload goes last.
+	pkt := buffer.NewView(header.UDPMinimumSize + header.IPv4MinimumSize + len(payload))
+	payloadOff := len(pkt) - len(payload)
+	copy(pkt[payloadOff:], payload)
+
+	// Fill in the IPv4 header and its checksum.
+	ipHdr := header.IPv4(pkt)
+	ipHdr.Encode(&header.IPv4Fields{
+		IHL:         header.IPv4MinimumSize,
+		TOS:         testTOS,
+		TotalLength: uint16(len(pkt)),
+		TTL:         65,
+		Protocol:    uint8(udp.ProtocolNumber),
+		SrcAddr:     h.srcAddr.Addr,
+		DstAddr:     h.dstAddr.Addr,
+	})
+	ipHdr.SetChecksum(^ipHdr.CalculateChecksum())
+
+	// Fill in the UDP header, which starts right after the IPv4 header.
+	udpHdr := header.UDP(pkt[header.IPv4MinimumSize:])
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcAddr.Port,
+		DstPort: h.dstAddr.Port,
+		Length:  uint16(header.UDPMinimumSize + len(payload)),
+	})
+
+	// Start the UDP checksum from the pseudo-header.
+	sum := header.PseudoHeaderChecksum(udp.ProtocolNumber, h.srcAddr.Addr, h.dstAddr.Addr, uint16(len(udpHdr)))
+
+	// Add the payload, then store the complemented result in the header.
+	sum = header.Checksum(payload, sum)
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(sum))
+
+	return pkt
+}
+
+func newPayload() []byte {
+	return newMinPayload(30) // random payload of at least 30 bytes
+}
+
+func newMinPayload(minSize int) []byte {
+	payload := make([]byte, minSize+rand.Intn(100))
+	for i := 0; i < len(payload); i++ {
+		payload[i] = byte(rand.Intn(256))
+	}
+	return payload // between minSize and minSize+99 random bytes
+}
+
+func TestBindToDeviceOption(t *testing.T) {
+	s := stack.New(stack.Options{
+		NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol()},
+		TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()}})
+
+	ep, err := s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &waiter.Queue{})
+	if err != nil {
+		t.Fatalf("NewEndpoint failed; %s", err)
+	}
+	defer ep.Close()
+	// NIC 321 is the only device; the BindToExistent case needs it, so abort if it cannot be created.
+	opts := stack.NICOptions{Name: "my_device"}
+	if err := s.CreateNICWithOptions(321, loopback.New(), opts); err != nil {
+		t.Fatalf("CreateNICWithOptions(_, _, %+v) failed: %v", opts, err)
+	}
+
+	// nicIDPtr is used instead of taking the address of NICID literals, which is
+	// a compiler error.
+	nicIDPtr := func(s tcpip.NICID) *tcpip.NICID {
+		return &s
+	}
+	// The actions run in order against the same endpoint; a nil setBindToDevice skips the set step.
+	testActions := []struct {
+		name                 string
+		setBindToDevice      *tcpip.NICID
+		setBindToDeviceError *tcpip.Error
+		getBindToDevice      tcpip.BindToDeviceOption
+	}{
+		{"GetDefaultValue", nil, nil, 0},
+		{"BindToNonExistent", nicIDPtr(999), tcpip.ErrUnknownDevice, 0},
+		{"BindToExistent", nicIDPtr(321), nil, 321},
+		{"UnbindToDevice", nicIDPtr(0), nil, 0},
+	}
+	for _, testAction := range testActions {
+		t.Run(testAction.name, func(t *testing.T) {
+			if testAction.setBindToDevice != nil {
+				bindToDevice := tcpip.BindToDeviceOption(*testAction.setBindToDevice)
+				if gotErr, wantErr := ep.SetSockOpt(bindToDevice), testAction.setBindToDeviceError; gotErr != wantErr {
+					t.Errorf("SetSockOpt(%v) got %v, want %v", bindToDevice, gotErr, wantErr)
+				}
+			}
+			bindToDevice := tcpip.BindToDeviceOption(88888) // sentinel that GetSockOpt must overwrite
+			if err := ep.GetSockOpt(&bindToDevice); err != nil {
+				t.Errorf("GetSockOpt got %v, want %v", err, nil)
+			}
+			if got, want := bindToDevice, testAction.getBindToDevice; got != want {
+				t.Errorf("bindToDevice got %d, want %d", got, want)
+			}
+		})
+	}
+}
+
+// testReadInternal injects a packet of the given test flow into the link
+// endpoint and tries to read it from the UDP endpoint. packetShouldBeDropped
+// makes a read timeout the expected outcome; expectReadError accepts a failing
+// Read. On success the peer address, payload and control messages are checked.
+func testReadInternal(c *testContext, flow testFlow, packetShouldBeDropped, expectReadError bool, checkers ...checker.ControlMessagesChecker) {
+	c.t.Helper()
+
+	payload := newPayload()
+	c.injectPacket(flow, payload)
+
+	// Try to receive the data.
+	we, ch := waiter.NewChannelEntry(nil)
+	c.wq.EventRegister(&we, waiter.EventIn)
+	defer c.wq.EventUnregister(&we)
+
+	// Take a snapshot of the stats to validate them at the end of the test.
+	epstats := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
+
+	var addr tcpip.FullAddress
+	v, cm, err := c.ep.Read(&addr)
+	if err == tcpip.ErrWouldBlock {
+		// Wait for data to become available.
+		select {
+		case <-ch:
+			v, cm, err = c.ep.Read(&addr)
+
+		case <-time.After(300 * time.Millisecond):
+			if packetShouldBeDropped {
+				return // expected to time out
+			}
+			c.t.Fatal("timed out waiting for data")
+		}
+	}
+
+	if expectReadError && err != nil {
+		c.checkEndpointReadStats(1, epstats, err)
+		return
+	}
+
+	if err != nil {
+		c.t.Fatal("Read failed:", err)
+	}
+
+	if packetShouldBeDropped { // a successful read of a packet that should have been dropped is a failure
+		c.t.Fatalf("Read unexpectedly received data from %s", addr.Addr)
+	}
+
+	// Check the peer address.
+	h := flow.header4Tuple(incoming)
+	if addr.Addr != h.srcAddr.Addr {
+		c.t.Fatalf("unexpected remote address: got %s, want %v", addr.Addr, h.srcAddr)
+	}
+
+	// Check the payload.
+	if !bytes.Equal(payload, v) {
+		c.t.Fatalf("bad payload: got %x, want %x", v, payload)
+	}
+
+	// Run any checkers against the ControlMessages.
+	for _, f := range checkers {
+		f(c.t, cm)
+	}
+
+	c.checkEndpointReadStats(1, epstats, err)
+}
+
+// testRead injects a packet of the given test flow into the link endpoint,
+// reads it back from the UDP endpoint and verifies it, running any additional
+// control-message checkers. A dropped packet or a read error fails the test.
+func testRead(c *testContext, flow testFlow, checkers ...checker.ControlMessagesChecker) {
+	c.t.Helper()
+	testReadInternal(c, flow, false /* packetShouldBeDropped */, false /* expectReadError */, checkers...)
+}
+
+// testFailingRead injects a packet of the given test flow into the link
+// endpoint and expects it to be dropped rather than delivered to the UDP
+// endpoint. With expectReadError set, a failing Read is accepted as well.
+func testFailingRead(c *testContext, flow testFlow, expectReadError bool) {
+	c.t.Helper()
+	testReadInternal(c, flow, true /* packetShouldBeDropped */, expectReadError)
+}
+
+func TestBindEphemeralPort(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Binding a zero-value FullAddress (no address, port 0) must succeed.
+	if err := c.ep.Bind(tcpip.FullAddress{}); err != nil {
+		t.Fatalf("ep.Bind(...) failed: %s", err)
+	}
+}
+
+func TestBindReservedPort(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+	// Connect an IPv6 endpoint; the local address it ends up with is reserved.
+	c.createEndpoint(ipv6.ProtocolNumber)
+
+	if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil {
+		t.Fatalf("Connect failed: %s", err)
+	}
+
+	addr, err := c.ep.GetLocalAddress()
+	if err != nil {
+		t.Fatalf("GetLocalAddress failed: %s", err)
+	}
+
+	// We can't bind the address reserved by the connected endpoint above.
+	func() {
+		ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv6.ProtocolNumber, &c.wq)
+		if err != nil {
+			t.Fatalf("NewEndpoint failed: %s", err)
+		}
+		defer ep.Close()
+		if got, want := ep.Bind(addr), tcpip.ErrPortInUse; got != want {
+			t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want)
+		}
+	}()
+
+	func() {
+		ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &c.wq)
+		if err != nil {
+			t.Fatalf("NewEndpoint failed: %s", err)
+		}
+		defer ep.Close()
+		// We can't bind ipv4-any on the port reserved by the connected endpoint
+		// above, since the endpoint is dual-stack.
+		if got, want := ep.Bind(tcpip.FullAddress{Port: addr.Port}), tcpip.ErrPortInUse; got != want {
+			t.Fatalf("got ep.Bind(...) = %v, want = %v", got, want)
+		}
+		// We can bind an ipv4 address on this port, though.
+		if err := ep.Bind(tcpip.FullAddress{Addr: stackAddr, Port: addr.Port}); err != nil {
+			t.Fatalf("ep.Bind(...) failed: %s", err)
+		}
+	}()
+
+	// Once the connected endpoint releases its port reservation, we are able to
+	// bind ipv4-any once again.
+	c.ep.Close()
+	func() {
+		ep, err := c.s.NewEndpoint(udp.ProtocolNumber, ipv4.ProtocolNumber, &c.wq)
+		if err != nil {
+			t.Fatalf("NewEndpoint failed: %s", err)
+		}
+		defer ep.Close()
+		if err := ep.Bind(tcpip.FullAddress{Port: addr.Port}); err != nil {
+			t.Fatalf("ep.Bind(...) failed: %s", err)
+		}
+	}()
+}
+
+func TestV4ReadOnV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpointForFlow(unicastV4in6)
+
+	// Bind to the wildcard address: only the port is set.
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	// An injected V4 packet must be readable from the V6 endpoint.
+	testRead(c, unicastV4in6)
+}
+
+func TestV4ReadOnBoundToV4MappedWildcard(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpointForFlow(unicastV4in6)
+
+	// Bind to the V4-mapped wildcard address (mapped prefix followed by 0.0.0.0).
+	if err := c.ep.Bind(tcpip.FullAddress{Addr: v4MappedWildcardAddr, Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	// An injected V4 packet must be readable from the V6 endpoint.
+	testRead(c, unicastV4in6)
+}
+
+func TestV4ReadOnBoundToV4Mapped(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpointForFlow(unicastV4in6)
+
+	// Bind to the V4-mapped form of the stack's local V4 address.
+	if err := c.ep.Bind(tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	// An injected V4 packet must be readable from the V6 endpoint.
+	testRead(c, unicastV4in6)
+}
+
+// TestV6ReadOnV6 binds an endpoint created for the unicastV6 flow to the
+// wildcard address and runs testRead for that flow.
+func TestV6ReadOnV6(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpointForFlow(unicastV6)
+
+	// Only the port is set, so the bind address is left unspecified.
+	wildcard := tcpip.FullAddress{Port: stackPort}
+	err := tc.ep.Bind(wildcard)
+	if err != nil {
+		tc.t.Fatalf("Bind failed: %s", err)
+	}
+
+	testRead(tc, unicastV6)
+}
+
+// TestV4ReadSelfSource checks that packets coming from a local IP address are
+// correctly dropped when handleLocal is true and not otherwise.
+//
+// Each case builds a stack with the given HandleLocal option, injects an IPv4
+// packet whose source is overwritten with its destination, and then compares
+// the IP InvalidSourceAddressesReceived counter and the error returned by Read
+// against the expectations in the table.
+func TestV4ReadSelfSource(t *testing.T) {
+	for _, tt := range []struct {
+		name              string
+		handleLocal       bool
+		wantErr           *tcpip.Error
+		wantInvalidSource uint64
+	}{
+		// The subtest name states the HandleLocal setting being exercised.
+		{"NoHandleLocal", false, nil, 0},
+		{"HandleLocal", true, tcpip.ErrWouldBlock, 1},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			c := newDualTestContextWithOptions(t, defaultMTU, stack.Options{
+				NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+				TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				HandleLocal:        tt.handleLocal,
+			})
+			defer c.cleanup()
+
+			c.createEndpointForFlow(unicastV4)
+
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				t.Fatalf("Bind failed: %s", err)
+			}
+
+			payload := newPayload()
+			h := unicastV4.header4Tuple(incoming)
+			// Make the packet appear to originate from its own destination.
+			h.srcAddr = h.dstAddr
+
+			buf := c.buildV4Packet(payload, &h)
+			c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+				Data: buf.ToVectorisedView(),
+			})
+
+			if got := c.s.Stats().IP.InvalidSourceAddressesReceived.Value(); got != tt.wantInvalidSource {
+				t.Errorf("c.s.Stats().IP.InvalidSourceAddressesReceived got %d, want %d", got, tt.wantInvalidSource)
+			}
+
+			if _, _, err := c.ep.Read(nil); err != tt.wantErr {
+				t.Errorf("c.ep.Read() got error %v, want %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestV4ReadOnV4 binds an endpoint created for the unicastV4 flow to the
+// wildcard address and runs testRead for that flow.
+func TestV4ReadOnV4(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpointForFlow(unicastV4)
+
+	// Only the port is set, so the bind address is left unspecified.
+	wildcard := tcpip.FullAddress{Port: stackPort}
+	err := tc.ep.Bind(wildcard)
+	if err != nil {
+		tc.t.Fatalf("Bind failed: %s", err)
+	}
+
+	testRead(tc, unicastV4)
+}
+
+// TestReadOnBoundToMulticast checks that an endpoint can bind to a multicast
+// address and receive data sent to that address.
+func TestReadOnBoundToMulticast(t *testing.T) {
+	// FIXME(b/128189410): multicastV4in6 currently doesn't work as
+	// AddMembershipOption doesn't handle V4in6 addresses.
+	flows := []testFlow{multicastV4, multicastV6, multicastV6Only}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// The same group address is used for the bind and for the
+			// membership request.
+			group := flow.mapAddrIfApplicable(flow.getMcastAddr())
+			bindErr := tc.ep.Bind(tcpip.FullAddress{Addr: group, Port: stackPort})
+			if bindErr != nil {
+				tc.t.Fatal("Bind failed:", bindErr)
+			}
+
+			membership := tcpip.AddMembershipOption{NIC: 1, MulticastAddr: group}
+			joinErr := tc.ep.SetSockOpt(membership)
+			if joinErr != nil {
+				tc.t.Fatal("SetSockOpt failed:", joinErr)
+			}
+
+			// Multicast packets must be delivered; unicast and broadcast
+			// packets must not.
+			testRead(tc, flow)
+			testFailingRead(tc, broadcast, false /* expectReadError */)
+			testFailingRead(tc, unicastV4, false /* expectReadError */)
+		})
+	}
+}
+
+// TestV4ReadOnBoundToBroadcast checks that an endpoint can bind to a broadcast
+// address and can receive only broadcast data.
+func TestV4ReadOnBoundToBroadcast(t *testing.T) {
+	flows := []testFlow{broadcast, broadcastIn6}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// The broadcast address is passed through the flow's address
+			// mapping before being used as the bind address.
+			bindAddr := tcpip.FullAddress{
+				Addr: flow.mapAddrIfApplicable(broadcastAddr),
+				Port: stackPort,
+			}
+			err := tc.ep.Bind(bindAddr)
+			if err != nil {
+				tc.t.Fatalf("Bind failed: %s", err)
+			}
+
+			// Broadcast packets must be delivered; unicast ones must not.
+			testRead(tc, flow)
+			testFailingRead(tc, unicastV4, false /* expectReadError */)
+		})
+	}
+}
+
+// TestV4ReadBroadcastOnBoundToWildcard checks that an endpoint can bind to ANY
+// and receive broadcast and unicast data.
+//
+// It runs once per broadcast flow (broadcast and broadcastIn6), each time with
+// a fresh test context and endpoint.
+func TestV4ReadBroadcastOnBoundToWildcard(t *testing.T) {
+	for _, flow := range []testFlow{broadcast, broadcastIn6} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			// Bind to wildcard.
+			if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+				c.t.Fatalf("Bind failed: %s", err)
+			}
+
+			// Check that we receive both broadcast and unicast packets.
+			testRead(c, flow)
+			testRead(c, unicastV4)
+		})
+	}
+}
+
+// testFailingWrite writes a payload from the UDP endpoint to the outgoing
+// destination of the given test flow and verifies that the write returns
+// wantErr. The endpoint write stats are checked against a snapshot taken
+// before the write.
+func testFailingWrite(c *testContext, flow testFlow, wantErr *tcpip.Error) {
+	c.t.Helper()
+	// Snapshot taken up front so the stats delta can be validated afterwards.
+	before := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
+	tuple := flow.header4Tuple(outgoing)
+	dst := tcpip.FullAddress{
+		Addr: flow.mapAddrIfApplicable(tuple.dstAddr.Addr),
+		Port: tuple.dstAddr.Port,
+	}
+
+	data := buffer.View(newPayload())
+	_, _, gotErr := c.ep.Write(tcpip.SlicePayload(data), tcpip.WriteOptions{To: &dst})
+	c.checkEndpointWriteStats(1, before, gotErr)
+	if gotErr == wantErr {
+		return
+	}
+	c.t.Fatalf("Write returned unexpected error: got %v, want %v", gotErr, wantErr)
+}
+
+// testWrite sends a packet of the given test flow from the UDP endpoint to the
+// flow's destination address:port. It then receives it from the link endpoint
+// and verifies its correctness including any additional checker functions
+// provided.
+func testWrite(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
+	c.t.Helper()
+	return testWriteInternal(c, flow, true, checkers...)
+}
+
+// testWriteWithoutDestination sends a packet of the given test flow from the
+// UDP endpoint without giving a destination address:port. It then receives it
+// from the link endpoint and verifies its correctness including any additional
+// checker functions provided.
+func testWriteWithoutDestination(c *testContext, flow testFlow, checkers ...checker.NetworkChecker) uint16 {
+	c.t.Helper()
+	return testWriteInternal(c, flow, false, checkers...)
+}
+
+// testWriteInternal writes a fresh payload from the UDP endpoint and verifies
+// the packet that is read back via getPacketAndVerify.
+//
+// When setDest is true the write carries the flow's outgoing destination
+// address (mapped if applicable) and port; otherwise no destination is given.
+// The test fails if the write errors, reports a byte count different from the
+// payload length, or the received UDP payload differs from what was written.
+// It returns the source port found in the received UDP header.
+func testWriteInternal(c *testContext, flow testFlow, setDest bool, checkers ...checker.NetworkChecker) uint16 {
+	c.t.Helper()
+	// Snapshot taken up front so the stats delta can be validated afterwards.
+	before := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone()
+
+	var opts tcpip.WriteOptions
+	if setDest {
+		tuple := flow.header4Tuple(outgoing)
+		opts.To = &tcpip.FullAddress{
+			Addr: flow.mapAddrIfApplicable(tuple.dstAddr.Addr),
+			Port: tuple.dstAddr.Port,
+		}
+	}
+
+	data := buffer.View(newPayload())
+	written, _, err := c.ep.Write(tcpip.SlicePayload(data), opts)
+	if err != nil {
+		c.t.Fatalf("Write failed: %s", err)
+	}
+	if written != int64(len(data)) {
+		c.t.Fatalf("Bad number of bytes written: got %v, want %v", written, len(data))
+	}
+	c.checkEndpointWriteStats(1, before, err)
+
+	// Fetch the emitted packet and locate the UDP header behind the IP header.
+	raw := c.getPacketAndVerify(flow, checkers...)
+	var udpHdr header.UDP
+	switch {
+	case flow.isV4():
+		udpHdr = header.UDP(header.IPv4(raw).Payload())
+	default:
+		udpHdr = header.UDP(header.IPv6(raw).Payload())
+	}
+	if !bytes.Equal(data, udpHdr.Payload()) {
+		c.t.Fatalf("Bad payload: got %x, want %x", udpHdr.Payload(), data)
+	}
+
+	return udpHdr.SourcePort()
+}
+
+// testDualWrite performs one testWrite for the unicastV4in6 flow followed by
+// one for the unicastV6 flow on the same endpoint, fails the test if the two
+// writes report different source ports, and returns that port.
+func testDualWrite(c *testContext) uint16 {
+	c.t.Helper()
+
+	portV4 := testWrite(c, unicastV4in6)
+	portV6 := testWrite(c, unicastV6)
+	if portV4 == portV6 {
+		return portV4
+	}
+	c.t.Fatalf("expected v4 and v6 ports to be equal: got v4Port = %d, v6Port = %d", portV4, portV6)
+
+	return portV4
+}
+
+// TestDualWriteUnbound runs testDualWrite on an endpoint created with the
+// IPv6 network protocol number, with no Bind or Connect call beforehand.
+func TestDualWriteUnbound(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpoint(ipv6.ProtocolNumber)
+	testDualWrite(tc)
+}
+
+// TestDualWriteBoundToWildcard binds an endpoint created with the IPv6
+// network protocol number to the wildcard address on stackPort, runs
+// testDualWrite, and checks that the reported source port is stackPort.
+func TestDualWriteBoundToWildcard(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpoint(ipv6.ProtocolNumber)
+
+	// Only the port is set, so the bind address is left unspecified.
+	wildcard := tcpip.FullAddress{Port: stackPort}
+	err := tc.ep.Bind(wildcard)
+	if err != nil {
+		tc.t.Fatalf("Bind failed: %s", err)
+	}
+
+	if srcPort := testDualWrite(tc); srcPort != stackPort {
+		tc.t.Fatalf("Bad port: got %v, want %v", srcPort, stackPort)
+	}
+}
+
+// TestDualWriteConnectedToV6 connects an endpoint created with the IPv6
+// network protocol number to a v6 address. It then checks that a write for
+// the unicastV6 flow succeeds, that a write to a V4-mapped destination fails
+// with ErrNetworkUnreachable, and that the endpoint's SendErrors.NoRoute
+// counter reads 1 afterwards.
+func TestDualWriteConnectedToV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+
+	// Connect to v6 address.
+	if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil {
+		c.t.Fatalf("Connect failed: %s", err)
+	}
+
+	testWrite(c, unicastV6)
+
+	// Write to V4 mapped address.
+	testFailingWrite(c, unicastV4in6, tcpip.ErrNetworkUnreachable)
+	const want = 1
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).SendErrors.NoRoute.Value(); got != want {
+		c.t.Fatalf("Endpoint stat not updated. got %d want %d", got, want)
+	}
+}
+
+// TestDualWriteConnectedToV4Mapped connects an endpoint created with the IPv6
+// network protocol number to a V4-mapped address. It then checks that a write
+// for the unicastV4in6 flow succeeds and that a write to a v6 destination
+// fails with ErrInvalidEndpointState.
+func TestDualWriteConnectedToV4Mapped(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+
+	// Connect to v4 mapped address.
+	if err := c.ep.Connect(tcpip.FullAddress{Addr: testV4MappedAddr, Port: testPort}); err != nil {
+		c.t.Fatalf("Connect failed: %s", err)
+	}
+
+	testWrite(c, unicastV4in6)
+
+	// Write to v6 address.
+	testFailingWrite(c, unicastV6, tcpip.ErrInvalidEndpointState)
+}
+
+// TestV4WriteOnV6Only creates an endpoint for the unicastV6Only flow and
+// checks that a write to a V4-mapped destination fails with ErrNoRoute.
+func TestV4WriteOnV6Only(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpointForFlow(unicastV6Only)
+
+	// The destination of the unicastV4in6 flow is a V4-mapped address.
+	wantErr := tcpip.ErrNoRoute
+	testFailingWrite(tc, unicastV4in6, wantErr)
+}
+
+// TestV6WriteOnBoundToV4Mapped binds an endpoint created with the IPv6
+// network protocol number to stackV4MappedAddr and checks that a write to a
+// v6 destination fails with ErrInvalidEndpointState.
+func TestV6WriteOnBoundToV4Mapped(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpoint(ipv6.ProtocolNumber)
+
+	mapped := tcpip.FullAddress{Addr: stackV4MappedAddr, Port: stackPort}
+	err := tc.ep.Bind(mapped)
+	if err != nil {
+		tc.t.Fatalf("Bind failed: %s", err)
+	}
+
+	wantErr := tcpip.ErrInvalidEndpointState
+	testFailingWrite(tc, unicastV6, wantErr)
+}
+
+// TestV6WriteOnConnected connects an endpoint created with the IPv6 network
+// protocol number to testV6Addr and then writes on the unicastV6 flow without
+// supplying a destination.
+func TestV6WriteOnConnected(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpoint(ipv6.ProtocolNumber)
+
+	remote := tcpip.FullAddress{Addr: testV6Addr, Port: testPort}
+	err := tc.ep.Connect(remote)
+	if err != nil {
+		tc.t.Fatalf("Connect failed: %s", err)
+	}
+
+	testWriteWithoutDestination(tc, unicastV6)
+}
+
+// TestV4WriteOnConnected connects an endpoint created with the IPv6 network
+// protocol number to testV4MappedAddr and then writes on the unicastV4 flow
+// without supplying a destination.
+func TestV4WriteOnConnected(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpoint(ipv6.ProtocolNumber)
+
+	remote := tcpip.FullAddress{Addr: testV4MappedAddr, Port: testPort}
+	err := tc.ep.Connect(remote)
+	if err != nil {
+		tc.t.Fatalf("Connect failed: %s", err)
+	}
+
+	testWriteWithoutDestination(tc, unicastV4)
+}
+
+// TestWriteOnBoundToV4Multicast checks that we can send packets out of a socket
+// that is bound to a V4 multicast address.
+func TestWriteOnBoundToV4Multicast(t *testing.T) {
+	flows := []testFlow{unicastV4, multicastV4, broadcast}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// Every flow binds to the same V4 multicast address; only the
+			// flow passed to testWrite varies.
+			bindAddr := tcpip.FullAddress{Addr: multicastAddr, Port: stackPort}
+			err := tc.ep.Bind(bindAddr)
+			if err != nil {
+				tc.t.Fatal("Bind failed:", err)
+			}
+
+			testWrite(tc, flow)
+		})
+	}
+}
+
+// TestWriteOnBoundToV4MappedMulticast checks that we can send packets out of a
+// socket that is bound to a V4-mapped multicast address.
+func TestWriteOnBoundToV4MappedMulticast(t *testing.T) {
+	flows := []testFlow{unicastV4in6, multicastV4in6, broadcastIn6}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// Every flow binds to the same V4-mapped multicast address; only
+			// the flow passed to testWrite varies.
+			bindAddr := tcpip.FullAddress{Addr: multicastV4MappedAddr, Port: stackPort}
+			err := tc.ep.Bind(bindAddr)
+			if err != nil {
+				tc.t.Fatalf("Bind failed: %s", err)
+			}
+
+			testWrite(tc, flow)
+		})
+	}
+}
+
+// TestWriteOnBoundToV6Multicast checks that we can send packets out of a
+// socket that is bound to a V6 multicast address.
+func TestWriteOnBoundToV6Multicast(t *testing.T) {
+	flows := []testFlow{unicastV6, multicastV6}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// Every flow binds to the same V6 multicast address; only the
+			// flow passed to testWrite varies.
+			bindAddr := tcpip.FullAddress{Addr: multicastV6Addr, Port: stackPort}
+			err := tc.ep.Bind(bindAddr)
+			if err != nil {
+				tc.t.Fatalf("Bind failed: %s", err)
+			}
+
+			testWrite(tc, flow)
+		})
+	}
+}
+
+// TestWriteOnBoundToV6OnlyMulticast checks that we can send packets out of a
+// V6-only socket that is bound to a V6 multicast address.
+func TestWriteOnBoundToV6OnlyMulticast(t *testing.T) {
+	flows := []testFlow{unicastV6Only, multicastV6Only}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// Every flow binds to the same V6 multicast address; only the
+			// flow passed to testWrite varies.
+			bindAddr := tcpip.FullAddress{Addr: multicastV6Addr, Port: stackPort}
+			err := tc.ep.Bind(bindAddr)
+			if err != nil {
+				tc.t.Fatalf("Bind failed: %s", err)
+			}
+
+			testWrite(tc, flow)
+		})
+	}
+}
+
+// TestWriteOnBoundToBroadcast checks that we can send packets out of a
+// socket that is bound to the broadcast address.
+func TestWriteOnBoundToBroadcast(t *testing.T) {
+	flows := []testFlow{unicastV4, multicastV4, broadcast}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// Every flow binds to the same V4 broadcast address; only the
+			// flow passed to testWrite varies.
+			bindAddr := tcpip.FullAddress{Addr: broadcastAddr, Port: stackPort}
+			err := tc.ep.Bind(bindAddr)
+			if err != nil {
+				tc.t.Fatal("Bind failed:", err)
+			}
+
+			testWrite(tc, flow)
+		})
+	}
+}
+
+// TestWriteOnBoundToV4MappedBroadcast checks that we can send packets out of a
+// socket that is bound to the V4-mapped broadcast address.
+func TestWriteOnBoundToV4MappedBroadcast(t *testing.T) {
+	flows := []testFlow{unicastV4in6, multicastV4in6, broadcastIn6}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// Every flow binds to the same V4-mapped broadcast address; only
+			// the flow passed to testWrite varies.
+			bindAddr := tcpip.FullAddress{Addr: broadcastV4MappedAddr, Port: stackPort}
+			err := tc.ep.Bind(bindAddr)
+			if err != nil {
+				tc.t.Fatalf("Bind failed: %s", err)
+			}
+
+			testWrite(tc, flow)
+		})
+	}
+}
+
+// TestReadIncrementsPacketsReceived binds an endpoint to the wildcard
+// address, performs one testRead for the unicastV4 flow, and checks that the
+// stack's UDP PacketsReceived counter equals 1.
+func TestReadIncrementsPacketsReceived(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	// The endpoint is created with the IPv6 network protocol number.
+	tc.createEndpoint(ipv6.ProtocolNumber)
+
+	// Only the port is set, so the bind address is left unspecified.
+	wildcard := tcpip.FullAddress{Port: stackPort}
+	err := tc.ep.Bind(wildcard)
+	if err != nil {
+		tc.t.Fatalf("Bind failed: %s", err)
+	}
+
+	testRead(tc, unicastV4)
+
+	var want uint64 = 1
+	got := tc.s.Stats().UDP.PacketsReceived.Value()
+	if got != want {
+		tc.t.Fatalf("Read did not increment PacketsReceived: got %v, want %v", got, want)
+	}
+}
+
+// TestWriteIncrementsPacketsSent runs testDualWrite (two writes) on an
+// endpoint created with the IPv6 network protocol number and checks that the
+// stack's UDP PacketsSent counter equals 2.
+func TestWriteIncrementsPacketsSent(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpoint(ipv6.ProtocolNumber)
+	testDualWrite(tc)
+
+	var want uint64 = 2
+	got := tc.s.Stats().UDP.PacketsSent.Value()
+	if got != want {
+		tc.t.Fatalf("Write did not increment PacketsSent: got %v, want %v", got, want)
+	}
+}
+
+// TestNoChecksum toggles NoChecksumOption on an endpoint and verifies the
+// written packets with the NoChecksum checker, once for the unicastV4 flow
+// and once for the unicastV6 flow.
+func TestNoChecksum(t *testing.T) {
+	flows := []testFlow{unicastV4, unicastV6}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			// Turn checksum generation off. The option is effective on IPv4
+			// only, so a missing checksum is expected only for a V4 flow.
+			offErr := tc.ep.SetSockOptBool(tcpip.NoChecksumOption, true)
+			if offErr != nil {
+				t.Fatalf("SetSockOptBool failed: %s", offErr)
+			}
+			testWrite(tc, flow, checker.UDP(checker.NoChecksum(flow.isV4())))
+
+			// Turn checksum generation back on.
+			onErr := tc.ep.SetSockOptBool(tcpip.NoChecksumOption, false)
+			if onErr != nil {
+				t.Fatalf("SetSockOptBool failed: %s", onErr)
+			}
+			testWrite(tc, flow, checker.UDP(checker.NoChecksum(false)))
+		})
+	}
+}
+
+// TestTTL sets MulticastTTLOption on an endpoint and verifies the TTL of a
+// written packet for each flow. Multicast flows are expected to carry the
+// configured multicast TTL; all other flows are expected to carry the
+// DefaultTTL reported by a throwaway network endpoint of the matching
+// protocol.
+func TestTTL(t *testing.T) {
+	flows := []testFlow{unicastV4, unicastV4in6, unicastV6, unicastV6Only, multicastV4, multicastV4in6, multicastV6, broadcast, broadcastIn6}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			const multicastTTL = 42
+			if err := tc.ep.SetSockOptInt(tcpip.MulticastTTLOption, multicastTTL); err != nil {
+				tc.t.Fatalf("SetSockOptInt failed: %s", err)
+			}
+
+			var wantTTL uint8 = multicastTTL
+			if !flow.isMulticast() {
+				var proto stack.NetworkProtocol
+				switch {
+				case flow.isV4():
+					proto = ipv4.NewProtocol()
+				default:
+					proto = ipv6.NewProtocol()
+				}
+				// A separate stack is built only to host the throwaway
+				// endpoint that reports the protocol's default TTL.
+				scratch := stack.New(stack.Options{
+					NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+					TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+				})
+				netEP, err := proto.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil, scratch)
+				if err != nil {
+					t.Fatal(err)
+				}
+				wantTTL = netEP.DefaultTTL()
+				netEP.Close()
+			}
+
+			testWrite(tc, flow, checker.TTL(wantTTL))
+		})
+	}
+}
+
+func TestSetTTL(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, unicastV4in6, unicastV6, unicastV6Only, broadcast, broadcastIn6} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			for _, wantTTL := range []uint8{1, 2, 50, 64, 128, 254, 255} {
+				t.Run(fmt.Sprintf("TTL:%d", wantTTL), func(t *testing.T) {
+					c := newDualTestContext(t, defaultMTU)
+					defer c.cleanup()
+
+					c.createEndpointForFlow(flow)
+
+					if err := c.ep.SetSockOptInt(tcpip.TTLOption, int(wantTTL)); err != nil {
+						c.t.Fatalf("SetSockOptInt(TTLOption, %d) failed: %s", wantTTL, err)
+					}
+
+					var p stack.NetworkProtocol
+					if flow.isV4() {
+						p = ipv4.NewProtocol()
+					} else {
+						p = ipv6.NewProtocol()
+					}
+					ep, err := p.NewEndpoint(0, tcpip.AddressWithPrefix{}, nil, nil, nil, stack.New(stack.Options{
+						NetworkProtocols:   []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol()},
+						TransportProtocols: []stack.TransportProtocol{udp.NewProtocol()},
+					}))
+					if err != nil {
+						t.Fatal(err)
+					}
+					ep.Close()
+
+					testWrite(c, flow, checker.TTL(wantTTL))
+				})
+			}
+		})
+	}
+}
+
+// TestSetTOS verifies IPv4TOSOption on an endpoint for each V4 flow: the
+// option must read back as 0 before it is set, read back as testTOS after it
+// is set, and a written packet must pass the TOS checker for that value.
+func TestSetTOS(t *testing.T) {
+	for _, flow := range []testFlow{unicastV4, multicastV4, broadcast} {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			c := newDualTestContext(t, defaultMTU)
+			defer c.cleanup()
+
+			c.createEndpointForFlow(flow)
+
+			const tos = testTOS
+			v, err := c.ep.GetSockOptInt(tcpip.IPv4TOSOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv4TOSOption) failed: %s", err)
+			}
+			// Test for expected default value.
+			if v != 0 {
+				c.t.Errorf("got GetSockOptInt(IPv4TOSOption) = 0x%x, want = 0x%x", v, 0)
+			}
+
+			if err := c.ep.SetSockOptInt(tcpip.IPv4TOSOption, tos); err != nil {
+				c.t.Errorf("SetSockOptInt(IPv4TOSOption, 0x%x) failed: %s", tos, err)
+			}
+
+			v, err = c.ep.GetSockOptInt(tcpip.IPv4TOSOption)
+			if err != nil {
+				c.t.Errorf("GetSockOptInt(IPv4TOSOption) failed: %s", err)
+			}
+
+			if v != tos {
+				c.t.Errorf("got GetSockOptInt(IPv4TOSOption) = 0x%x, want = 0x%x", v, tos)
+			}
+
+			testWrite(c, flow, checker.TOS(tos, 0))
+		})
+	}
+}
+
+// TestSetTClass verifies IPv6TrafficClassOption on an endpoint for each flow
+// in the list: the option must read back as 0 before it is set, read back as
+// testTOS after it is set, and a written packet must pass the TOS checker for
+// that value.
+func TestSetTClass(t *testing.T) {
+	flows := []testFlow{unicastV4in6, unicastV6, unicastV6Only, multicastV4in6, multicastV6, broadcastIn6}
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			tc := newDualTestContext(t, defaultMTU)
+			defer tc.cleanup()
+
+			tc.createEndpointForFlow(flow)
+
+			const tClass = testTOS
+
+			// Before the option is set it must report the default of 0.
+			initial, err := tc.ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+			if err != nil {
+				tc.t.Errorf("GetSockOptInt(IPv6TrafficClassOption) failed: %s", err)
+			}
+			if initial != 0 {
+				tc.t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = 0x%x, want = 0x%x", initial, 0)
+			}
+
+			if err := tc.ep.SetSockOptInt(tcpip.IPv6TrafficClassOption, tClass); err != nil {
+				tc.t.Errorf("SetSockOptInt(IPv6TrafficClassOption, 0x%x) failed: %s", tClass, err)
+			}
+
+			// After the set, the option must report the new value.
+			current, err := tc.ep.GetSockOptInt(tcpip.IPv6TrafficClassOption)
+			if err != nil {
+				tc.t.Errorf("GetSockOptInt(IPv6TrafficClassOption) failed: %s", err)
+			}
+			if current != tClass {
+				tc.t.Errorf("got GetSockOptInt(IPv6TrafficClassOption) = 0x%x, want = 0x%x", current, tClass)
+			}
+
+			// The header getter for TClass is called TOS, so use that checker.
+			testWrite(tc, flow, checker.TOS(tClass, 0))
+		})
+	}
+}
+
+func TestReceiveTosTClass(t *testing.T) {
+	testCases := []struct {
+		name             string
+		getReceiveOption tcpip.SockOptBool
+		tests            []testFlow
+	}{
+		{"ReceiveTosOption", tcpip.ReceiveTOSOption, []testFlow{unicastV4, broadcast}},
+		{"ReceiveTClassOption", tcpip.ReceiveTClassOption, []testFlow{unicastV4in6, unicastV6, unicastV6Only, broadcastIn6}},
+	}
+	for _, testCase := range testCases {
+		for _, flow := range testCase.tests {
+			t.Run(fmt.Sprintf("%s:flow:%s", testCase.name, flow), func(t *testing.T) {
+				c := newDualTestContext(t, defaultMTU)
+				defer c.cleanup()
+
+				c.createEndpointForFlow(flow)
+				option := testCase.getReceiveOption
+				name := testCase.name
+
+				// Verify that setting and reading the option works.
+				v, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
+				}
+				// Test for expected default value.
+				if v != false {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, v, false)
+				}
+
+				want := true
+				if err := c.ep.SetSockOptBool(option, want); err != nil {
+					c.t.Fatalf("SetSockOptBool(%s, %t) failed: %s", name, want, err)
+				}
+
+				got, err := c.ep.GetSockOptBool(option)
+				if err != nil {
+					c.t.Errorf("GetSockOptBool(%s) failed: %s", name, err)
+				}
+
+				if got != want {
+					c.t.Errorf("got GetSockOptBool(%s) = %t, want = %t", name, got, want)
+				}
+
+				// Verify that the correct received TOS or TClass is handed through as
+				// ancillary data to the ControlMessages struct.
+				if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+					c.t.Fatalf("Bind failed: %s", err)
+				}
+				switch option {
+				case tcpip.ReceiveTClassOption:
+					testRead(c, flow, checker.ReceiveTClass(testTOS))
+				case tcpip.ReceiveTOSOption:
+					testRead(c, flow, checker.ReceiveTOS(testTOS))
+				default:
+					t.Fatalf("unknown test variant: %s", name)
+				}
+			})
+		}
+	}
+}
+
+// TestMulticastInterfaceOption sets MulticastInterfaceOption on an endpoint
+// and reads it back. It covers every combination of multicast flow, endpoint
+// state (connected to the multicast address or untouched), and the way the
+// option is populated (local address, NIC ID, or both). The value read back
+// is expected to have NIC 1 and the interface address that was set.
+func TestMulticastInterfaceOption(t *testing.T) {
+	flows := []testFlow{multicastV4, multicastV4in6, multicastV6, multicastV6Only}
+	bindings := []string{"bound", "unbound"}
+	variants := []string{"use local-addr", "use NICID", "use local-addr and NIC"}
+
+	for _, flow := range flows {
+		t.Run(fmt.Sprintf("flow:%s", flow), func(t *testing.T) {
+			for _, binding := range bindings {
+				t.Run(binding, func(t *testing.T) {
+					for _, variant := range variants {
+						t.Run(variant, func(t *testing.T) {
+							tuple := flow.header4Tuple(outgoing)
+							group := tuple.dstAddr.Addr
+							ifAddr := tuple.srcAddr.Addr
+
+							// Populate the option according to the variant.
+							var toSet tcpip.MulticastInterfaceOption
+							switch variant {
+							case "use local-addr":
+								toSet.InterfaceAddr = ifAddr
+							case "use NICID":
+								toSet.NIC = 1
+							case "use local-addr and NIC":
+								toSet.InterfaceAddr = ifAddr
+								toSet.NIC = 1
+							default:
+								t.Fatal("unknown test variant")
+							}
+
+							tc := newDualTestContext(t, defaultMTU)
+							defer tc.cleanup()
+
+							tc.createEndpoint(flow.sockProto())
+
+							if binding == "bound" {
+								// Bind the socket by connecting to the multicast address.
+								// This may have an influence on how the multicast interface
+								// is set.
+								remote := tcpip.FullAddress{
+									Addr: flow.mapAddrIfApplicable(group),
+									Port: stackPort,
+								}
+								if err := tc.ep.Connect(remote); err != nil {
+									tc.t.Fatalf("Connect failed: %s", err)
+								}
+							}
+
+							if err := tc.ep.SetSockOpt(toSet); err != nil {
+								tc.t.Fatalf("SetSockOpt failed: %s", err)
+							}
+
+							// Verify multicast interface addr and NIC were set correctly.
+							// Note that NIC must be 1 since this is our outgoing interface.
+							want := tcpip.MulticastInterfaceOption{NIC: 1, InterfaceAddr: toSet.InterfaceAddr}
+							var got tcpip.MulticastInterfaceOption
+							if err := tc.ep.GetSockOpt(&got); err != nil {
+								tc.t.Fatalf("GetSockOpt failed: %s", err)
+							}
+							if got != want {
+								tc.t.Errorf("got GetSockOpt() = %#v, want = %#v", got, want)
+							}
+						})
+					}
+				})
+			}
+		})
+	}
+}
+
+// TestV4UnknownDestination verifies that we generate an ICMPv4 Destination
+// Unreachable message when a udp datagram is received on ports for which there
+// is no bound udp socket.
+//
+// A single test context is shared by all subtests and no endpoint is created
+// on it. For flows where no ICMP reply is required, the test instead checks
+// that nothing is written to the link endpoint within one second.
+func TestV4UnknownDestination(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	testCases := []struct {
+		flow         testFlow
+		icmpRequired bool
+		// largePayload if true, will result in a payload large enough
+		// so that the final generated IPv4 packet is larger than
+		// header.IPv4MinimumProcessableDatagramSize.
+		largePayload bool
+	}{
+		{unicastV4, true, false},
+		{unicastV4, true, true},
+		{multicastV4, false, false},
+		{multicastV4, false, true},
+		{broadcast, false, false},
+		{broadcast, false, true},
+	}
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) {
+			payload := newPayload()
+			if tc.largePayload {
+				payload = newMinPayload(576)
+			}
+			c.injectPacket(tc.flow, payload)
+			if !tc.icmpRequired {
+				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+				defer cancel()
+				if p, ok := c.linkEP.ReadContext(ctx); ok {
+					t.Fatalf("unexpected packet received: %+v", p)
+				}
+				return
+			}
+
+			// ICMP required.
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			defer cancel()
+			p, ok := c.linkEP.ReadContext(ctx)
+			if !ok {
+				// Fatalf stops the subtest, so nothing below runs without a packet.
+				t.Fatalf("packet wasn't written out")
+			}
+
+			// Flatten header and data into one contiguous buffer for parsing.
+			var pkt []byte
+			pkt = append(pkt, p.Pkt.Header.View()...)
+			pkt = append(pkt, p.Pkt.Data.ToView()...)
+			if got, want := len(pkt), header.IPv4MinimumProcessableDatagramSize; got > want {
+				t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
+			}
+
+			hdr := header.IPv4(pkt)
+			checker.IPv4(t, hdr, checker.ICMPv4(
+				checker.ICMPv4Type(header.ICMPv4DstUnreachable),
+				checker.ICMPv4Code(header.ICMPv4PortUnreachable)))
+
+			// The ICMP payload is parsed as the original IPv4 packet.
+			icmpPkt := header.ICMPv4(hdr.Payload())
+			payloadIPHeader := header.IPv4(icmpPkt.Payload())
+			wantLen := len(payload)
+			if tc.largePayload {
+				wantLen = header.IPv4MinimumProcessableDatagramSize - header.IPv4MinimumSize*2 - header.ICMPv4MinimumSize - header.UDPMinimumSize
+			}
+
+			// In case of large payloads the IP packet may be truncated. Update
+			// the length field before retrieving the udp datagram payload.
+			payloadIPHeader.SetTotalLength(uint16(wantLen + header.UDPMinimumSize + header.IPv4MinimumSize))
+
+			origDgram := header.UDP(payloadIPHeader.Payload())
+			if got, want := len(origDgram.Payload()), wantLen; got != want {
+				t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
+			}
+			if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
+				t.Fatalf("unexpected payload got: %d, want: %d", got, want)
+			}
+		})
+	}
+}
+
+// TestV6UnknownDestination verifies that we generate an ICMPv6 Destination
+// Unreachable message when a udp datagram is received on ports for which there
+// is no bound udp socket.
+//
+// A single test context is shared by all subtests and no endpoint is created
+// on it. For flows where no ICMP reply is required, the test instead checks
+// that nothing is written to the link endpoint within one second.
+func TestV6UnknownDestination(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	testCases := []struct {
+		flow         testFlow
+		icmpRequired bool
+		// largePayload if true will result in a payload large enough to
+		// create an IPv6 packet > header.IPv6MinimumMTU bytes.
+		largePayload bool
+	}{
+		{unicastV6, true, false},
+		{unicastV6, true, true},
+		{multicastV6, false, false},
+		{multicastV6, false, true},
+	}
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("flow:%s icmpRequired:%t largePayload:%t", tc.flow, tc.icmpRequired, tc.largePayload), func(t *testing.T) {
+			payload := newPayload()
+			if tc.largePayload {
+				payload = newMinPayload(1280)
+			}
+			c.injectPacket(tc.flow, payload)
+			if !tc.icmpRequired {
+				ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+				defer cancel()
+				if p, ok := c.linkEP.ReadContext(ctx); ok {
+					t.Fatalf("unexpected packet received: %+v", p)
+				}
+				return
+			}
+
+			// ICMP required.
+			ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+			defer cancel()
+			p, ok := c.linkEP.ReadContext(ctx)
+			if !ok {
+				// Fatalf stops the subtest, so nothing below runs without a packet.
+				t.Fatalf("packet wasn't written out")
+			}
+
+			// Flatten header and data into one contiguous buffer for parsing.
+			var pkt []byte
+			pkt = append(pkt, p.Pkt.Header.View()...)
+			pkt = append(pkt, p.Pkt.Data.ToView()...)
+			if got, want := len(pkt), header.IPv6MinimumMTU; got > want {
+				t.Fatalf("got an ICMP packet of size: %d, want: sz <= %d", got, want)
+			}
+
+			hdr := header.IPv6(pkt)
+			checker.IPv6(t, hdr, checker.ICMPv6(
+				checker.ICMPv6Type(header.ICMPv6DstUnreachable),
+				checker.ICMPv6Code(header.ICMPv6PortUnreachable)))
+
+			// The ICMP payload is parsed as the original IPv6 packet.
+			icmpPkt := header.ICMPv6(hdr.Payload())
+			payloadIPHeader := header.IPv6(icmpPkt.Payload())
+			wantLen := len(payload)
+			if tc.largePayload {
+				wantLen = header.IPv6MinimumMTU - header.IPv6MinimumSize*2 - header.ICMPv6MinimumSize - header.UDPMinimumSize
+			}
+			// In case of large payloads the IP packet may be truncated. Update
+			// the length field before retrieving the udp datagram payload.
+			payloadIPHeader.SetPayloadLength(uint16(wantLen + header.UDPMinimumSize))
+
+			origDgram := header.UDP(payloadIPHeader.Payload())
+			if got, want := len(origDgram.Payload()), wantLen; got != want {
+				t.Fatalf("unexpected payload length got: %d, want: %d", got, want)
+			}
+			if got, want := origDgram.Payload(), payload[:wantLen]; !bytes.Equal(got, want) {
+				t.Fatalf("unexpected payload got: %v, want: %v", got, want)
+			}
+		})
+	}
+}
+
+// TestIncrementMalformedPacketsReceived verifies that both the stack-wide and
+// the per-endpoint malformed-packet counters read 1 after a V6 UDP packet
+// whose UDP length field has been bumped by one is injected.
+func TestIncrementMalformedPacketsReceived(t *testing.T) {
+	tc := newDualTestContext(t, defaultMTU)
+	defer tc.cleanup()
+
+	tc.createEndpoint(ipv6.ProtocolNumber)
+	// Only the port is set, so the bind address is left unspecified.
+	wildcard := tcpip.FullAddress{Port: stackPort}
+	if err := tc.ep.Bind(wildcard); err != nil {
+		tc.t.Fatalf("Bind failed: %s", err)
+	}
+
+	tuple := unicastV6.header4Tuple(incoming)
+	pkt := tc.buildV6Packet(newPayload(), &tuple)
+
+	// Corrupt the packet: the UDP header sits right after the fixed IPv6
+	// header, and its length field is made one larger than it was.
+	udpHdr := header.UDP(pkt[header.IPv6MinimumSize:])
+	udpHdr.SetLength(udpHdr.Length() + 1)
+
+	tc.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: pkt.ToVectorisedView(),
+	})
+
+	const want = 1
+	stackGot := tc.s.Stats().UDP.MalformedPacketsReceived.Value()
+	if stackGot != want {
+		t.Errorf("got stats.UDP.MalformedPacketsReceived.Value() = %d, want = %d", stackGot, want)
+	}
+	epGot := tc.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.MalformedPacketsReceived.Value()
+	if epGot != want {
+		t.Errorf("got EP Stats.ReceiveErrors.MalformedPacketsReceived stats = %d, want = %d", epGot, want)
+	}
+}
+
+// TestShortHeader verifies that when a packet with a too-short UDP header is
+// received, the stack-wide MalformedRcvdPackets stat is incremented once.
+func TestShortHeader(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	h := unicastV6.header4Tuple(incoming)
+
+	// Allocate a buffer for an IPv6 header plus a UDP header that is one byte short.
+	const udpSize = header.UDPMinimumSize - 1
+	buf := buffer.NewView(header.IPv6MinimumSize + udpSize)
+	// Initialize the IP header; PayloadLength advertises the truncated UDP size.
+	ip := header.IPv6(buf)
+	ip.Encode(&header.IPv6Fields{
+		TrafficClass:  testTOS,
+		PayloadLength: uint16(udpSize),
+		NextHeader:    uint8(udp.ProtocolNumber),
+		HopLimit:      65,
+		SrcAddr:       h.srcAddr.Addr,
+		DstAddr:       h.dstAddr.Addr,
+	})
+
+	// Build a full-size UDP header in its own view; it is truncated when copied below.
+	udpHdr := header.UDP(buffer.NewView(header.UDPMinimumSize))
+	udpHdr.Encode(&header.UDPFields{
+		SrcPort: h.srcAddr.Port,
+		DstPort: h.dstAddr.Port,
+		Length:  header.UDPMinimumSize,
+	})
+	// Checksum the full-size UDP header, seeded with the pseudo-header checksum.
+	xsum := header.PseudoHeaderChecksum(udp.ProtocolNumber, h.srcAddr.Addr, h.dstAddr.Addr, uint16(len(udpHdr)))
+	udpHdr.SetChecksum(^udpHdr.CalculateChecksum(xsum))
+	// copy stops at the end of buf, so the last byte of the UDP header is dropped.
+	copy(buf[header.IPv6MinimumSize:], udpHdr)
+
+	// Inject the truncated packet as inbound IPv6 traffic.
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: buf.ToVectorisedView(),
+	})
+
+	if got, want := c.s.Stats().MalformedRcvdPackets.Value(), uint64(1); got != want {
+		t.Errorf("got c.s.Stats().MalformedRcvdPackets.Value() = %d, want = %d", got, want)
+	}
+}
+
+// TestIncrementChecksumErrorsV4 checks that an IPv4 datagram with a wrong UDP
+// checksum bumps both the stack-wide and the endpoint checksum error counters.
+func TestIncrementChecksumErrorsV4(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv4.ProtocolNumber)
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	data := newPayload()
+	tuple := unicastV4.header4Tuple(incoming)
+	pkt := c.buildV4Packet(data, &tuple)
+
+	// Keep bumping the UDP checksum until it is non-zero: a zero value
+	// would disable checksum validation instead of failing it.
+	for udpHdr := header.UDP(pkt[header.IPv4MinimumSize:]); ; {
+		udpHdr.SetChecksum(udpHdr.Checksum() + 1)
+		if udpHdr.Checksum() != 0 {
+			break
+		}
+	}
+
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: pkt.ToVectorisedView(),
+	})
+
+	const want uint64 = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestIncrementChecksumErrorsV6 checks that an IPv6 datagram with a wrong UDP
+// checksum bumps both the stack-wide and the endpoint checksum error counters.
+func TestIncrementChecksumErrorsV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	data := newPayload()
+	tuple := unicastV6.header4Tuple(incoming)
+	pkt := c.buildV6Packet(data, &tuple)
+
+	// Bump the UDP checksum by one; no zero guard here (see TestChecksumZeroV6).
+	udpHdr := header.UDP(pkt[header.IPv6MinimumSize:])
+	udpHdr.SetChecksum(udpHdr.Checksum() + 1)
+
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: pkt.ToVectorisedView(),
+	})
+
+	const want uint64 = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestPayloadModifiedV4 checks that altering the payload of an IPv4 datagram
+// after it was built bumps the global and endpoint checksum error stats.
+func TestPayloadModifiedV4(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv4.ProtocolNumber)
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	data := newPayload()
+	tuple := unicastV4.header4Tuple(incoming)
+	pkt := c.buildV4Packet(data, &tuple)
+	// Change the final byte so the checksum stored in the UDP header no longer matches.
+	pkt[len(pkt)-1]++
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: pkt.ToVectorisedView(),
+	})
+
+	const want uint64 = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestPayloadModifiedV6 checks that altering the payload of an IPv6 datagram
+// after it was built bumps the global and endpoint checksum error stats.
+func TestPayloadModifiedV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	data := newPayload()
+	tuple := unicastV6.header4Tuple(incoming)
+	pkt := c.buildV6Packet(data, &tuple)
+	// Change the final byte so the checksum stored in the UDP header no longer matches.
+	pkt[len(pkt)-1]++
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: pkt.ToVectorisedView(),
+	})
+
+	const want uint64 = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestChecksumZeroV4 checks that a zero UDP checksum on IPv4 leaves the global
+// and endpoint checksum error stats untouched (UDP checksum is optional on IPv4).
+func TestChecksumZeroV4(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv4.ProtocolNumber)
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	data := newPayload()
+	tuple := unicastV4.header4Tuple(incoming)
+	pkt := c.buildV4Packet(data, &tuple)
+	// Zero out the checksum field of the UDP header.
+	udpHdr := header.UDP(pkt[header.IPv4MinimumSize:])
+	udpHdr.SetChecksum(0)
+	c.linkEP.InjectInbound(ipv4.ProtocolNumber, &stack.PacketBuffer{
+		Data: pkt.ToVectorisedView(),
+	})
+
+	const want uint64 = 0
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestChecksumZeroV6 checks that a zero UDP checksum on IPv6 bumps the global
+// and endpoint checksum error stats (UDP checksum is *not* optional on IPv6).
+func TestChecksumZeroV6(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	data := newPayload()
+	tuple := unicastV6.header4Tuple(incoming)
+	pkt := c.buildV6Packet(data, &tuple)
+	// Zero out the checksum field of the UDP header.
+	udpHdr := header.UDP(pkt[header.IPv6MinimumSize:])
+	udpHdr.SetChecksum(0)
+	c.linkEP.InjectInbound(ipv6.ProtocolNumber, &stack.PacketBuffer{
+		Data: pkt.ToVectorisedView(),
+	})
+
+	const want uint64 = 1
+	if got := c.s.Stats().UDP.ChecksumErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ChecksumErrors.Value() = %d, want = %d", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ChecksumErrors.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ChecksumErrors stats = %d, want = %d", got, want)
+	}
+}
+
+// TestShutdownRead shuts down the read side of a bound, connected endpoint and
+// verifies that the receive error stats increment on packet receive.
+func TestShutdownRead(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+
+	// Bind to the wildcard address (only the port is set).
+	if err := c.ep.Bind(tcpip.FullAddress{Port: stackPort}); err != nil {
+		c.t.Fatalf("Bind failed: %s", err)
+	}
+
+	if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil {
+		c.t.Fatalf("Connect failed: %s", err)
+	}
+
+	if err := c.ep.Shutdown(tcpip.ShutdownRead); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	testFailingRead(c, unicastV6, true /* expectReadError */)
+
+	var want uint64 = 1 // Exactly one receive error is expected in each counter below.
+	if got := c.s.Stats().UDP.ReceiveBufferErrors.Value(); got != want {
+		t.Errorf("got stats.UDP.ReceiveBufferErrors.Value() = %v, want = %v", got, want)
+	}
+	if got := c.ep.Stats().(*tcpip.TransportEndpointStats).ReceiveErrors.ClosedReceiver.Value(); got != want {
+		t.Errorf("got EP Stats.ReceiveErrors.ClosedReceiver stats = %v, want = %v", got, want)
+	}
+}
+
+// TestShutdownWrite shuts down the write side of a connected endpoint and runs
+// testFailingWrite with tcpip.ErrClosedForSend as the expected write error.
+func TestShutdownWrite(t *testing.T) {
+	c := newDualTestContext(t, defaultMTU)
+	defer c.cleanup()
+
+	c.createEndpoint(ipv6.ProtocolNumber)
+
+	if err := c.ep.Connect(tcpip.FullAddress{Addr: testV6Addr, Port: testPort}); err != nil {
+		c.t.Fatalf("Connect failed: %s", err)
+	}
+
+	if err := c.ep.Shutdown(tcpip.ShutdownWrite); err != nil {
+		t.Fatalf("Shutdown failed: %s", err)
+	}
+
+	testFailingWrite(c, unicastV6, tcpip.ErrClosedForSend)
+}
+
+func (c *testContext) checkEndpointWriteStats(incr uint64, want tcpip.TransportEndpointStats, err *tcpip.Error) {
+	got := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone() // Cloned copy of the endpoint's current stats.
+	switch err {
+	case nil: // A write without error is expected to add incr to PacketsSent.
+		want.PacketsSent.IncrementBy(incr)
+	case tcpip.ErrMessageTooLong, tcpip.ErrInvalidOptionValue:
+		want.WriteErrors.InvalidArgs.IncrementBy(incr)
+	case tcpip.ErrClosedForSend:
+		want.WriteErrors.WriteClosed.IncrementBy(incr)
+	case tcpip.ErrInvalidEndpointState:
+		want.WriteErrors.InvalidEndpointState.IncrementBy(incr)
+	case tcpip.ErrNoLinkAddress:
+		want.SendErrors.NoLinkAddr.IncrementBy(incr)
+	case tcpip.ErrNoRoute, tcpip.ErrBroadcastDisabled, tcpip.ErrNetworkUnreachable:
+		want.SendErrors.NoRoute.IncrementBy(incr)
+	default: // Every error not listed above is expected under SendToNetworkFailed.
+		want.SendErrors.SendToNetworkFailed.IncrementBy(incr)
+	}
+	if got != want { // Whole-struct comparison against the adjusted expectation.
+		c.t.Errorf("Endpoint stats not matching for error %s got %+v want %+v", err, got, want)
+	}
+}
+
+func (c *testContext) checkEndpointReadStats(incr uint64, want tcpip.TransportEndpointStats, err *tcpip.Error) {
+	got := c.ep.Stats().(*tcpip.TransportEndpointStats).Clone() // Cloned copy of the endpoint's current stats.
+	switch err {
+	case nil, tcpip.ErrWouldBlock: // No counter in want is adjusted for these outcomes.
+	case tcpip.ErrClosedForReceive:
+		want.ReadErrors.ReadClosed.IncrementBy(incr)
+	default: // Any other error has no expected counter here and is reported.
+		c.t.Errorf("Endpoint error missing stats update err %v", err)
+	}
+	if got != want { // Whole-struct comparison against the adjusted expectation.
+		c.t.Errorf("Endpoint stats not matching for error %s got %+v want %+v", err, got, want)
+	}
+}