8 files changed, 3257 insertions, 0 deletions
diff --git a/pkg/tcpip/stack/linkaddrcache.go b/pkg/tcpip/stack/linkaddrcache.go
new file mode 100644
index 000000000..b952ad20f
--- /dev/null
+++ b/pkg/tcpip/stack/linkaddrcache.go
@@ -0,0 +1,306 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/sleep"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+)
+
+// linkAddrCacheSize is the fixed number of slots in a linkAddrCache. It sizes
+// both the ring buffer of entries and the initial capacity of the lookup map.
+const linkAddrCacheSize = 512 // max cache entries
+
+// linkAddrCache is a fixed-sized cache mapping IP addresses to link addresses.
+//
+// The entries are stored in a ring buffer, oldest entry replaced first.
+//
+// This struct is safe for concurrent use.
+type linkAddrCache struct {
+	// ageLimit is how long a cache entry is valid for, counted from the
+	// moment the entry is created.
+	ageLimit time.Duration
+
+	// resolutionTimeout is the amount of time to wait for a link request to
+	// resolve an address before the request is retried or given up on.
+	resolutionTimeout time.Duration
+
+	// resolutionAttempts is the number of times an address is attempted to be
+	// resolved before failing.
+	resolutionAttempts int
+
+	// In this file every access to the fields below is made with mu held.
+	// cache indexes the occupied slots of entries by the address they
+	// describe; next is the ring-buffer slot that will be taken over by the
+	// next new entry.
+	mu      sync.Mutex
+	cache   map[tcpip.FullAddress]*linkAddrEntry
+	next    int // array index of next available entry
+	entries [linkAddrCacheSize]linkAddrEntry
+}
+
+// entryState is the state of a single entry in the cache. The transitions
+// allowed between states are enforced by linkAddrEntry.changeState.
+type entryState int
+
+const (
+	// incomplete means that there is an outstanding request to resolve the
+	// address. This is the initial state (it is the zero value).
+	incomplete entryState = iota
+	// ready means that the address has been resolved and can be used.
+	ready
+	// failed means that address resolution timed out and the address
+	// could not be resolved.
+	failed
+	// expired means that the cache entry has expired and the address must be
+	// resolved again. No transition out of this state is allowed.
+	expired
+)
+
+// String implements fmt.Stringer. Known states are rendered by name; any
+// other value is rendered as "unknown(<number>)".
+func (s entryState) String() string {
+	names := [...]string{
+		incomplete: "incomplete",
+		ready:      "ready",
+		failed:     "failed",
+		expired:    "expired",
+	}
+	if s >= 0 && int(s) < len(names) {
+		return names[s]
+	}
+	return fmt.Sprintf("unknown(%d)", s)
+}
+
+// A linkAddrEntry is an entry in the linkAddrCache.
+// This struct is thread-compatible.
+type linkAddrEntry struct {
+	// addr is the key the entry was created for and linkAddr the link
+	// address recorded for it. expiration is the instant after which state()
+	// moves the entry to 'expired'. s is the raw state; read it through
+	// state() so that expiry is taken into account.
+	addr       tcpip.FullAddress
+	linkAddr   tcpip.LinkAddress
+	expiration time.Time
+	s          entryState
+
+	// wakers is a set of waiters for address resolution result. Anytime
+	// state transitions out of 'incomplete' these waiters are notified.
+	// The set is then dropped (set to nil).
+	wakers map[*sleep.Waker]struct{}
+
+	// done is closed when the entry transitions out of 'incomplete'.
+	done chan struct{}
+}
+
+// state returns the entry's current state. An entry whose expiration time has
+// passed is first moved to 'expired' through changeState, so that anyone still
+// waiting on it is notified.
+func (e *linkAddrEntry) state() entryState {
+	if e.s == expired || !time.Now().After(e.expiration) {
+		return e.s
+	}
+	e.changeState(expired)
+	return e.s
+}
+
+// changeState moves the entry to state ns. Setting the state the entry is
+// already in is a no-op. It panics on a transition the state machine does not
+// allow: 'incomplete' may move to any state, 'ready' and 'failed' may only
+// move to 'expired', and 'expired' is terminal.
+func (e *linkAddrEntry) changeState(ns entryState) {
+	if e.s == ns {
+		return
+	}
+
+	// Validate state transition.
+	switch e.s {
+	case incomplete:
+		// All transitions are valid.
+	case ready, failed:
+		if ns != expired {
+			panic(fmt.Sprintf("invalid state transition from %s to %s", e.s, ns))
+		}
+	case expired:
+		// Terminal state.
+		panic(fmt.Sprintf("invalid state transition from %s to %s", e.s, ns))
+	default:
+		panic(fmt.Sprintf("invalid state: %s", e.s))
+	}
+
+	// Notify whoever is waiting on address resolution when transitioning
+	// out of 'incomplete'.
+	if e.s == incomplete {
+		for w := range e.wakers {
+			w.Assert()
+		}
+		e.wakers = nil
+		// done is nil for zero-valued ring-buffer slots that were never
+		// filled in; those have nothing to close.
+		if e.done != nil {
+			close(e.done)
+		}
+	}
+	e.s = ns
+}
+
+// maybeAddWaker adds w to the set of wakers that are asserted when the entry
+// leaves 'incomplete'. A nil w is ignored.
+func (e *linkAddrEntry) maybeAddWaker(w *sleep.Waker) {
+	if w == nil {
+		return
+	}
+	e.wakers[w] = struct{}{}
+}
+
+// removeWaker drops w from the entry's set of wakers, if present. Calling it
+// after the set has been cleared is harmless: delete on a nil map is a no-op.
+func (e *linkAddrEntry) removeWaker(w *sleep.Waker) {
+	delete(e.wakers, w)
+}
+
+// add records that k resolves to v and marks the resulting entry 'ready'.
+//
+// An entry that is still being resolved is completed in place, which wakes
+// its waiters. An entry that is not expired and already maps to v is left
+// alone. In every other case a fresh entry replaces whatever was cached.
+func (c *linkAddrCache) add(k tcpip.FullAddress, v tcpip.LinkAddress) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	entry, found := c.cache[k]
+	if !found {
+		c.makeAndAddEntry(k, v).changeState(ready)
+		return
+	}
+
+	switch s := entry.state(); {
+	case s != expired && entry.linkAddr == v:
+		// Repeated call with the mapping we already hold; nothing to do.
+		return
+	case s == incomplete:
+		// Resolution was pending on this very entry: fill it in.
+		entry.linkAddr = v
+	default:
+		// Stale or different mapping: supersede it with a new entry.
+		entry = c.makeAndAddEntry(k, v)
+	}
+
+	entry.changeState(ready)
+}
+
+// makeAndAddEntry is a helper function to create and add a new
+// entry to the cache map and evict older entry as needed.
+//
+// The new entry takes over the ring-buffer slot at c.next, starts in the
+// 'incomplete' state (the zero value of its state field) and expires
+// c.ageLimit from now. The callers in this file hold c.mu.
+func (c *linkAddrCache) makeAndAddEntry(k tcpip.FullAddress, v tcpip.LinkAddress) *linkAddrEntry {
+	// Take over the next entry.
+	entry := &c.entries[c.next]
+	// Only unmap the old address if the map still points at this slot; the
+	// address may since have been re-added in a different slot.
+	if c.cache[entry.addr] == entry {
+		delete(c.cache, entry.addr)
+	}
+
+	// Mark the soon-to-be-replaced entry as expired, just in case there is
+	// someone waiting for address resolution on it.
+	entry.changeState(expired)
+
+	*entry = linkAddrEntry{
+		addr:       k,
+		linkAddr:   v,
+		expiration: time.Now().Add(c.ageLimit),
+		wakers:     make(map[*sleep.Waker]struct{}),
+		done:       make(chan struct{}),
+	}
+
+	c.cache[k] = entry
+	c.next = (c.next + 1) % len(c.entries)
+	return entry
+}
+
+// get reports any known link address for k.
+//
+// If linkRes is non-nil and knows a static mapping for k.Addr, that mapping is
+// returned without consulting the cache. Otherwise the result depends on the
+// cached entry for k:
+//
+//   - 'ready': the cached link address is returned.
+//   - 'failed': tcpip.ErrNoLinkAddress is returned.
+//   - 'incomplete': waker (if non-nil) is registered and the entry's done
+//     channel is returned together with tcpip.ErrWouldBlock.
+//   - missing or 'expired': if linkRes is nil, tcpip.ErrNoLinkAddress is
+//     returned; otherwise a new 'incomplete' entry is created, resolution is
+//     started in a new goroutine, and the new entry's done channel is
+//     returned together with tcpip.ErrWouldBlock.
+//
+// The returned channel is non-nil only alongside tcpip.ErrWouldBlock.
+func (c *linkAddrCache) get(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
+	if linkRes != nil {
+		if addr, ok := linkRes.ResolveStaticAddress(k.Addr); ok {
+			return addr, nil, nil
+		}
+	}
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if entry, ok := c.cache[k]; ok {
+		switch s := entry.state(); s {
+		case expired:
+			// Fall out of the switch and resolve the address again below.
+		case ready:
+			return entry.linkAddr, nil, nil
+		case failed:
+			return "", nil, tcpip.ErrNoLinkAddress
+		case incomplete:
+			// Address resolution is still in progress.
+			entry.maybeAddWaker(waker)
+			return "", entry.done, tcpip.ErrWouldBlock
+		default:
+			panic(fmt.Sprintf("invalid cache entry state: %s", s))
+		}
+	}
+
+	if linkRes == nil {
+		return "", nil, tcpip.ErrNoLinkAddress
+	}
+
+	// Add 'incomplete' entry in the cache to mark that resolution is in progress.
+	e := c.makeAndAddEntry(k, "")
+	e.maybeAddWaker(waker)
+
+	go c.startAddressResolution(k, linkRes, localAddr, linkEP, e.done) // S/R-SAFE: link non-savable; wakers dropped synchronously.
+
+	return "", e.done, tcpip.ErrWouldBlock
+}
+
+// removeWaker unregisters a waker that was handed to get() for address k. It
+// does nothing if k has no entry in the cache.
+func (c *linkAddrCache) removeWaker(k tcpip.FullAddress, waker *sleep.Waker) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	entry, ok := c.cache[k]
+	if !ok {
+		return
+	}
+	entry.removeWaker(waker)
+}
+
+// startAddressResolution repeatedly asks linkRes to resolve k.Addr until the
+// request no longer needs retrying. It is meant to run in its own goroutine.
+//
+// After every request it waits for c.resolutionTimeout and then lets
+// checkLinkRequest decide whether to stop (the entry was resolved, failed,
+// expired, was evicted, or the attempts are used up) or to send another
+// request. It returns right away when done is closed, which happens when the
+// entry it was started for leaves the 'incomplete' state.
+func (c *linkAddrCache) startAddressResolution(k tcpip.FullAddress, linkRes LinkAddressResolver, localAddr tcpip.Address, linkEP LinkEndpoint, done <-chan struct{}) {
+	for i := 0; ; i++ {
+		// Send link request, then wait for the timeout limit and check
+		// whether the request succeeded.
+		linkRes.LinkAddressRequest(k.Addr, localAddr, linkEP)
+
+		// Use an explicit timer rather than time.After so that it can be
+		// released as soon as done fires instead of lingering until the
+		// timeout elapses.
+		timer := time.NewTimer(c.resolutionTimeout)
+		select {
+		case <-timer.C:
+			if stop := c.checkLinkRequest(k, i); stop {
+				return
+			}
+		case <-done:
+			timer.Stop()
+			return
+		}
+	}
+}
+
+// checkLinkRequest is called after resolution attempt number attempt
+// (zero-based) for k has timed out. It reports whether the resolution loop
+// can stop: true when the entry is gone from the cache, is no longer
+// 'incomplete', or has just been marked 'failed' because the configured
+// number of attempts is used up; false when another request should be sent.
+func (c *linkAddrCache) checkLinkRequest(k tcpip.FullAddress, attempt int) bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	entry, ok := c.cache[k]
+	if !ok {
+		// Entry was evicted from the cache.
+		return true
+	}
+
+	s := entry.state()
+	if s == ready || s == failed || s == expired {
+		// Resolved by the resolver, failed, or aged out: nothing left to do.
+		return true
+	}
+	if s != incomplete {
+		panic(fmt.Sprintf("invalid cache entry state: %s", s))
+	}
+
+	if attempt+1 < c.resolutionAttempts {
+		// No response yet, need to send another ARP request.
+		return false
+	}
+
+	// Max number of retries reached, mark entry as failed.
+	entry.changeState(failed)
+	return true
+}
+
+// newLinkAddrCache returns an empty cache whose entries live for ageLimit and
+// whose resolutions are retried up to resolutionAttempts times, waiting
+// resolutionTimeout after each request.
+func newLinkAddrCache(ageLimit, resolutionTimeout time.Duration, resolutionAttempts int) *linkAddrCache {
+	c := &linkAddrCache{
+		cache: make(map[tcpip.FullAddress]*linkAddrEntry, linkAddrCacheSize),
+	}
+	c.ageLimit = ageLimit
+	c.resolutionTimeout = resolutionTimeout
+	c.resolutionAttempts = resolutionAttempts
+	return c
+}
diff --git a/pkg/tcpip/stack/nic.go b/pkg/tcpip/stack/nic.go
new file mode 100644
index 000000000..50d35de88
--- /dev/null
+++ b/pkg/tcpip/stack/nic.go
@@ -0,0 +1,728 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"strings"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.googlesource.com/gvisor/pkg/ilist"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+)
+
+// NIC represents a "network interface card" to which the networking stack is
+// attached.
+type NIC struct {
+	// The fields in this group are set by newNIC and not written again in
+	// this file.
+	stack    *Stack
+	id       tcpip.NICID
+	name     string
+	linkEP   LinkEndpoint
+	loopback bool
+
+	// demux is this NIC's own transport demuxer. When delivering transport
+	// packets it is tried before the stack-wide demuxer.
+	demux *transportDemuxer
+
+	// In this file the fields below are read and written with mu held.
+	// primary holds, per network protocol, the list of endpoints that may
+	// act as primary endpoint. endpoints maps every address the NIC has an
+	// endpoint for to that endpoint. mcastJoins counts joinGroup calls per
+	// multicast address.
+	mu          sync.RWMutex
+	spoofing    bool
+	promiscuous bool
+	primary     map[tcpip.NetworkProtocolNumber]*ilist.List
+	endpoints   map[NetworkEndpointID]*referencedNetworkEndpoint
+	subnets     []tcpip.Subnet
+	mcastJoins  map[NetworkEndpointID]int32
+
+	// stats holds the NIC's packet and byte counters for each direction.
+	stats NICStats
+}
+
+// NICStats includes transmitted and received stats.
+type NICStats struct {
+	// Tx counts what the NIC sent out.
+	Tx DirectionStats
+	// Rx counts what the NIC received.
+	Rx DirectionStats
+}
+
+// DirectionStats includes packet and byte counts for one traffic direction.
+// newNIC allocates both counters, so they are non-nil on NICs it creates.
+type DirectionStats struct {
+	Packets *tcpip.StatCounter
+	Bytes   *tcpip.StatCounter
+}
+
+// PrimaryEndpointBehavior is an enumeration of an endpoint's primacy behavior,
+// i.e. whether and where a newly added endpoint is placed in the NIC's list of
+// primary endpoints for its protocol.
+type PrimaryEndpointBehavior int
+
+const (
+	// CanBePrimaryEndpoint indicates the endpoint can be used as a primary
+	// endpoint for new connections with no local address. This is the
+	// default when calling NIC.AddAddress. The endpoint is appended to the
+	// end of the primary list.
+	CanBePrimaryEndpoint PrimaryEndpointBehavior = iota
+
+	// FirstPrimaryEndpoint indicates the endpoint should be the first
+	// primary endpoint considered. If there are multiple endpoints with
+	// this behavior, the most recently-added one will be first.
+	FirstPrimaryEndpoint
+
+	// NeverPrimaryEndpoint indicates the endpoint should never be a
+	// primary endpoint. It is not added to the primary list at all.
+	NeverPrimaryEndpoint
+)
+
+// newNIC builds a NIC with the given identity on top of link endpoint ep. The
+// NIC starts with no addresses, its own transport demuxer and zeroed Tx/Rx
+// counters. It is not attached to ep yet; see attachLinkEndpoint.
+func newNIC(stack *Stack, id tcpip.NICID, name string, ep LinkEndpoint, loopback bool) *NIC {
+	// newCounters returns a fresh, independent pair of counters.
+	newCounters := func() DirectionStats {
+		return DirectionStats{
+			Packets: &tcpip.StatCounter{},
+			Bytes:   &tcpip.StatCounter{},
+		}
+	}
+
+	nic := &NIC{
+		stack:    stack,
+		id:       id,
+		name:     name,
+		linkEP:   ep,
+		loopback: loopback,
+	}
+	nic.demux = newTransportDemuxer(stack)
+	nic.primary = make(map[tcpip.NetworkProtocolNumber]*ilist.List)
+	nic.endpoints = make(map[NetworkEndpointID]*referencedNetworkEndpoint)
+	nic.mcastJoins = make(map[NetworkEndpointID]int32)
+	nic.stats.Tx = newCounters()
+	nic.stats.Rx = newCounters()
+	return nic
+}
+
+// attachLinkEndpoint attaches the NIC to the endpoint, which will enable it
+// to start delivering packets. The NIC passes itself to the link endpoint's
+// Attach method.
+func (n *NIC) attachLinkEndpoint() {
+	n.linkEP.Attach(n)
+}
+
+// setPromiscuousMode turns promiscuous mode on or off. While it is on, getRef
+// creates a temporary endpoint for any destination address that has none.
+func (n *NIC) setPromiscuousMode(enable bool) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.promiscuous = enable
+}
+
+// isPromiscuousMode reports whether promiscuous mode is currently enabled.
+func (n *NIC) isPromiscuousMode() bool {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+	return n.promiscuous
+}
+
+// setSpoofing turns address spoofing on or off. While it is on, findEndpoint
+// creates a temporary endpoint for a local address the NIC does not have.
+func (n *NIC) setSpoofing(enable bool) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.spoofing = enable
+}
+
+// getMainNICAddress returns the local address of the first primary endpoint
+// registered for protocol that still holds its insert reference (i.e. has not
+// been removed) and is still alive, together with a subnet of the NIC that
+// contains that address. The subnet is the zero value when none matches.
+//
+// tcpip.ErrNoLinkAddress is returned when there is no such endpoint.
+func (n *NIC) getMainNICAddress(protocol tcpip.NetworkProtocolNumber) (tcpip.Address, tcpip.Subnet, *tcpip.Error) {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	var r *referencedNetworkEndpoint
+
+	// Check for a primary endpoint.
+	if list, ok := n.primary[protocol]; ok {
+		for e := list.Front(); e != nil; e = e.Next() {
+			ref := e.(*referencedNetworkEndpoint)
+			if ref.holdsInsertRef && ref.tryIncRef() {
+				r = ref
+				break
+			}
+		}
+
+	}
+
+	if r == nil {
+		return "", tcpip.Subnet{}, tcpip.ErrNoLinkAddress
+	}
+
+	// Only the address is needed; give back the reference taken above.
+	address := r.ep.ID().LocalAddress
+	r.decRef()
+
+	// Find the least-constrained matching subnet for the address, if one
+	// exists, and return it.
+	var subnet tcpip.Subnet
+	for _, s := range n.subnets {
+		if s.Contains(address) && !subnet.Contains(s.ID()) {
+			subnet = s
+		}
+	}
+	return address, subnet, nil
+}
+
+// primaryEndpoint returns the first live primary endpoint of n for the given
+// network protocol, skipping endpoints bound to the IPv4 broadcast or "any"
+// address. The returned endpoint has had its reference count incremented.
+// nil is returned when no endpoint qualifies.
+func (n *NIC) primaryEndpoint(protocol tcpip.NetworkProtocolNumber) *referencedNetworkEndpoint {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	candidates := n.primary[protocol]
+	if candidates == nil {
+		return nil
+	}
+
+	for e := candidates.Front(); e != nil; e = e.Next() {
+		ref := e.(*referencedNetworkEndpoint)
+		// TODO(crawshaw): allow broadcast address when SO_BROADCAST is set.
+		addr := ref.ep.ID().LocalAddress
+		if addr == header.IPv4Broadcast || addr == header.IPv4Any {
+			continue
+		}
+		if ref.tryIncRef() {
+			return ref
+		}
+	}
+
+	return nil
+}
+
+// findEndpoint finds the endpoint, if any, with the given address.
+//
+// On success the returned endpoint carries a reference taken on behalf of the
+// caller. If no live endpoint exists and spoofing is disabled, nil is
+// returned. With spoofing enabled a temporary endpoint is created instead
+// (replacing a dead one if necessary), using peb to decide its place in the
+// primary list; nil is returned if that creation fails.
+func (n *NIC) findEndpoint(protocol tcpip.NetworkProtocolNumber, address tcpip.Address, peb PrimaryEndpointBehavior) *referencedNetworkEndpoint {
+	id := NetworkEndpointID{address}
+
+	// Fast path: look the endpoint up under the shared lock.
+	n.mu.RLock()
+	ref := n.endpoints[id]
+	if ref != nil && !ref.tryIncRef() {
+		ref = nil
+	}
+	spoofing := n.spoofing
+	n.mu.RUnlock()
+
+	if ref != nil || !spoofing {
+		return ref
+	}
+
+	// Try again with the lock in exclusive mode. If we still can't get the
+	// endpoint, create a new "temporary" endpoint. It will only exist while
+	// there's a route through it.
+	n.mu.Lock()
+	ref = n.endpoints[id]
+	if ref == nil || !ref.tryIncRef() {
+		ref, _ = n.addAddressLocked(protocol, address, peb, true)
+		if ref != nil {
+			// Hand the initial reference to the caller so that the
+			// endpoint goes away once the caller releases it.
+			ref.holdsInsertRef = false
+		}
+	}
+	n.mu.Unlock()
+	return ref
+}
+
+// addAddressLocked creates a network endpoint for addr and registers it with
+// the NIC. The callers in this file hold n.mu in exclusive mode.
+//
+// If an endpoint with the same ID is already registered, the call fails with
+// tcpip.ErrDuplicateAddress unless replace is true, in which case the old
+// endpoint is removed first. peb selects whether and where the new endpoint
+// is put in the protocol's primary list. tcpip.ErrUnknownProtocol is returned
+// when the stack has no such network protocol.
+//
+// The returned reference starts with a count of one and holdsInsertRef set.
+func (n *NIC) addAddressLocked(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior, replace bool) (*referencedNetworkEndpoint, *tcpip.Error) {
+	netProto, ok := n.stack.networkProtocols[protocol]
+	if !ok {
+		return nil, tcpip.ErrUnknownProtocol
+	}
+
+	// Create the new network endpoint.
+	ep, err := netProto.NewEndpoint(n.id, addr, n.stack, n, n.linkEP)
+	if err != nil {
+		return nil, err
+	}
+
+	id := *ep.ID()
+	if ref, ok := n.endpoints[id]; ok {
+		if !replace {
+			// The endpoint created above will never be installed, so
+			// release it instead of dropping it unclosed.
+			ep.Close()
+			return nil, tcpip.ErrDuplicateAddress
+		}
+
+		n.removeEndpointLocked(ref)
+	}
+
+	ref := &referencedNetworkEndpoint{
+		refs:           1,
+		ep:             ep,
+		nic:            n,
+		protocol:       protocol,
+		holdsInsertRef: true,
+	}
+
+	// Set up cache if link address resolution exists for this protocol.
+	if n.linkEP.Capabilities()&CapabilityResolutionRequired != 0 {
+		if _, ok := n.stack.linkAddrResolvers[protocol]; ok {
+			ref.linkCache = n.stack
+		}
+	}
+
+	n.endpoints[id] = ref
+
+	l, ok := n.primary[protocol]
+	if !ok {
+		l = &ilist.List{}
+		n.primary[protocol] = l
+	}
+
+	// NeverPrimaryEndpoint deliberately matches no case: such endpoints
+	// stay out of the primary list.
+	switch peb {
+	case CanBePrimaryEndpoint:
+		l.PushBack(ref)
+	case FirstPrimaryEndpoint:
+		l.PushFront(ref)
+	}
+
+	return ref, nil
+}
+
+// AddAddress adds a new address to n, so that it starts accepting packets
+// targeted at the given address (and network protocol). The new endpoint is
+// added with the CanBePrimaryEndpoint behavior.
+func (n *NIC) AddAddress(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	return n.AddAddressWithOptions(protocol, addr, CanBePrimaryEndpoint)
+}
+
+// AddAddressWithOptions is the same as AddAddress, but lets the caller choose
+// through peb whether, and where, the new endpoint can be primary. It fails
+// with tcpip.ErrDuplicateAddress if the address is already present.
+func (n *NIC) AddAddressWithOptions(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	_, err := n.addAddressLocked(protocol, addr, peb, false)
+	return err
+}
+
+// Addresses returns one (protocol, address) pair for every endpoint currently
+// registered with this NIC, in no particular order.
+func (n *NIC) Addresses() []tcpip.ProtocolAddress {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	result := make([]tcpip.ProtocolAddress, 0, len(n.endpoints))
+	for id, ref := range n.endpoints {
+		pa := tcpip.ProtocolAddress{
+			Protocol: ref.protocol,
+			Address:  id.LocalAddress,
+		}
+		result = append(result, pa)
+	}
+	return result
+}
+
+// AddSubnet adds a new subnet to n, so that it starts accepting packets
+// whose destination falls inside that subnet. The protocol argument is
+// currently not used.
+func (n *NIC) AddSubnet(protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	n.subnets = append(n.subnets, subnet)
+}
+
+// RemoveSubnet removes every occurrence of the given subnet from n. The
+// remaining subnets keep their relative order.
+func (n *NIC) RemoveSubnet(subnet tcpip.Subnet) {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	// Filter in place, reusing the slice's backing array.
+	kept := n.subnets[:0]
+	for _, s := range n.subnets {
+		if s == subnet {
+			continue
+		}
+		kept = append(kept, s)
+	}
+	n.subnets = kept
+}
+
+// ContainsSubnet reports whether subnet is one of the subnets returned by
+// Subnets, i.e. either an explicitly added subnet or the single-address
+// subnet of one of the NIC's endpoints.
+func (n *NIC) ContainsSubnet(subnet tcpip.Subnet) bool {
+	all := n.Subnets()
+	for i := range all {
+		if all[i] == subnet {
+			return true
+		}
+	}
+	return false
+}
+
+// Subnets returns the Subnets associated with this NIC: first one
+// single-address subnet (all-ones mask) per registered endpoint, then the
+// subnets added through AddSubnet.
+func (n *NIC) Subnets() []tcpip.Subnet {
+	n.mu.RLock()
+	defer n.mu.RUnlock()
+
+	result := make([]tcpip.Subnet, 0, len(n.subnets)+len(n.endpoints))
+	for id := range n.endpoints {
+		addr := id.LocalAddress
+		fullMask := tcpip.AddressMask(strings.Repeat("\xff", len(addr)))
+		sn, err := tcpip.NewSubnet(addr, fullMask)
+		if err != nil {
+			// This should never happen as the mask has been carefully crafted to
+			// match the address.
+			panic("Invalid endpoint subnet: " + err.Error())
+		}
+		result = append(result, sn)
+	}
+	result = append(result, n.subnets...)
+	return result
+}
+
+// removeEndpointLocked unregisters r from the NIC and closes its network
+// endpoint: r is deleted from n.endpoints and, if it is linked there, from the
+// primary list of its protocol. The callers in this file hold n.mu in
+// exclusive mode.
+//
+// It panics if r still holds its insert reference, since that means the
+// reference count reached zero without the address having been removed.
+func (n *NIC) removeEndpointLocked(r *referencedNetworkEndpoint) {
+	id := *r.ep.ID()
+
+	// Nothing to do if the reference has already been replaced with a
+	// different one.
+	if n.endpoints[id] != r {
+		return
+	}
+
+	if r.holdsInsertRef {
+		panic("Reference count dropped to zero before being removed")
+	}
+
+	delete(n.endpoints, id)
+	// An entry with no neighbours is still in the list when it is the
+	// list's only element, hence the extra comparison with Front().
+	wasInList := r.Next() != nil || r.Prev() != nil || r == n.primary[r.protocol].Front()
+	if wasInList {
+		n.primary[r.protocol].Remove(r)
+	}
+
+	r.ep.Close()
+}
+
+// removeEndpoint is removeEndpointLocked for callers that do not hold n.mu;
+// it takes the lock in exclusive mode around the call.
+func (n *NIC) removeEndpoint(r *referencedNetworkEndpoint) {
+	n.mu.Lock()
+	n.removeEndpointLocked(r)
+	n.mu.Unlock()
+}
+
+// removeAddressLocked gives up the insert reference of the endpoint bound to
+// addr. The endpoint itself is removed once its reference count drops to
+// zero, which may be immediately. tcpip.ErrBadLocalAddress is returned when
+// addr has no endpoint or its insert reference was already given up. The
+// callers in this file hold n.mu in exclusive mode.
+func (n *NIC) removeAddressLocked(addr tcpip.Address) *tcpip.Error {
+	id := NetworkEndpointID{addr}
+	if r := n.endpoints[id]; r != nil && r.holdsInsertRef {
+		r.holdsInsertRef = false
+		r.decRefLocked()
+		return nil
+	}
+	return tcpip.ErrBadLocalAddress
+}
+
+// RemoveAddress removes an address from n. It returns
+// tcpip.ErrBadLocalAddress if n has no such address or the address has
+// already been removed.
+func (n *NIC) RemoveAddress(addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	return n.removeAddressLocked(addr)
+}
+
+// joinGroup records one more join of the multicast group addr. The first join
+// creates an endpoint for the address (one that is never primary); later
+// joins only bump the per-address counter. If creating the endpoint fails,
+// the error is returned and the counter is left untouched.
+func (n *NIC) joinGroup(protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	id := NetworkEndpointID{addr}
+	count := n.mcastJoins[id]
+	if count == 0 {
+		_, err := n.addAddressLocked(protocol, addr, NeverPrimaryEndpoint, false)
+		if err != nil {
+			return err
+		}
+	}
+	n.mcastJoins[id] = count + 1
+	return nil
+}
+
+// leaveGroup decrements the count for the given multicast address, and when it
+// reaches zero removes the endpoint for this address.
+//
+// tcpip.ErrBadLocalAddress is returned if the group was not joined on this
+// NIC. If removing the endpoint fails, that error is returned and the count
+// is left unchanged.
+func (n *NIC) leaveGroup(addr tcpip.Address) *tcpip.Error {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+
+	id := NetworkEndpointID{addr}
+	switch joins := n.mcastJoins[id]; joins {
+	case 0:
+		// There are no joins with this address on this NIC.
+		return tcpip.ErrBadLocalAddress
+	case 1:
+		// This is the last one, clean up.
+		if err := n.removeAddressLocked(addr); err != nil {
+			return err
+		}
+		// Drop the key rather than storing a zero count, so the map does
+		// not keep an entry for every group that was ever joined.
+		delete(n.mcastJoins, id)
+	default:
+		n.mcastJoins[id] = joins - 1
+	}
+	return nil
+}
+
+// DeliverNetworkPacket finds the appropriate network protocol endpoint and
+// hands the packet over for further processing. This function is called when
+// the NIC receives a packet from the physical interface.
+// Note that the ownership of the slice backing vv is retained by the caller.
+// This rule applies only to the slice itself, not to the items of the slice;
+// the ownership of the items is not retained by the caller.
+//
+// The packet is handled in one of the following ways:
+//   - packets for an unknown protocol or shorter than the protocol's minimum
+//     size are counted and dropped;
+//   - packets sent to the IPv4 broadcast address are handed to every live
+//     IPv4 endpoint of the NIC;
+//   - packets for an address the NIC has (or accepts, see getRef) are handed
+//     to that endpoint;
+//   - otherwise, if forwarding is enabled on the stack, the packet is handed
+//     to the NIC a route to the destination goes through, which either
+//     delivers it locally or writes it out on its link endpoint;
+//   - anything else is counted as received for an invalid address.
+func (n *NIC) DeliverNetworkPacket(linkEP LinkEndpoint, remote, _ tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView) {
+	n.stats.Rx.Packets.Increment()
+	n.stats.Rx.Bytes.IncrementBy(uint64(vv.Size()))
+
+	netProto, ok := n.stack.networkProtocols[protocol]
+	if !ok {
+		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
+		return
+	}
+
+	if netProto.Number() == header.IPv4ProtocolNumber || netProto.Number() == header.IPv6ProtocolNumber {
+		n.stack.stats.IP.PacketsReceived.Increment()
+	}
+
+	if len(vv.First()) < netProto.MinimumPacketSize() {
+		n.stack.stats.MalformedRcvdPackets.Increment()
+		return
+	}
+
+	src, dst := netProto.ParseAddresses(vv.First())
+
+	// If the packet is destined to the IPv4 Broadcast address, then make a
+	// route to each IPv4 network endpoint and let each endpoint handle the
+	// packet.
+	if dst == header.IPv4Broadcast {
+		// n.endpoints is mutex protected, so take the references while
+		// holding the lock, but deliver only after releasing it: decRef
+		// may end up in removeEndpoint, which takes n.mu in exclusive
+		// mode and would deadlock against our own read lock.
+		n.mu.RLock()
+		refs := make([]*referencedNetworkEndpoint, 0, len(n.endpoints))
+		for _, ref := range n.endpoints {
+			if ref.protocol == header.IPv4ProtocolNumber && ref.tryIncRef() {
+				refs = append(refs, ref)
+			}
+		}
+		n.mu.RUnlock()
+
+		for _, ref := range refs {
+			r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* handleLocal */, false /* multicastLoop */)
+			r.RemoteLinkAddress = remote
+			ref.ep.HandlePacket(&r, vv)
+			ref.decRef()
+		}
+		return
+	}
+
+	if ref := n.getRef(protocol, dst); ref != nil {
+		r := makeRoute(protocol, dst, src, linkEP.LinkAddress(), ref, false /* handleLocal */, false /* multicastLoop */)
+		r.RemoteLinkAddress = remote
+		ref.ep.HandlePacket(&r, vv)
+		ref.decRef()
+		return
+	}
+
+	// This NIC doesn't care about the packet. Find a NIC that cares about the
+	// packet and forward it to the NIC.
+	//
+	// TODO: Should we be forwarding the packet even if promiscuous?
+	if n.stack.Forwarding() {
+		r, err := n.stack.FindRoute(0, "", dst, protocol, false /* multicastLoop */)
+		if err != nil {
+			n.stack.stats.IP.InvalidAddressesReceived.Increment()
+			return
+		}
+		defer r.Release()
+
+		r.LocalLinkAddress = n.linkEP.LinkAddress()
+		r.RemoteLinkAddress = remote
+
+		// Found a NIC: from here on work with the NIC the route goes
+		// through, not with the NIC that received the packet.
+		fwdNIC := r.ref.nic
+		fwdNIC.mu.RLock()
+		ref, ok := fwdNIC.endpoints[NetworkEndpointID{dst}]
+		fwdNIC.mu.RUnlock()
+		if ok && ref.tryIncRef() {
+			r.RemoteAddress = src
+			// TODO(b/123449044): Update the source NIC as well.
+			ref.ep.HandlePacket(&r, vv)
+			ref.decRef()
+		} else {
+			// fwdNIC doesn't have a destination endpoint.
+			// Send the packet out of fwdNIC.
+			hdr := buffer.NewPrependableFromView(vv.First())
+			vv.RemoveFirst()
+
+			// TODO(b/128629022): use route.WritePacket.
+			if err := fwdNIC.linkEP.WritePacket(&r, nil /* gso */, hdr, vv, protocol); err != nil {
+				r.Stats().IP.OutgoingPacketErrors.Increment()
+			} else {
+				fwdNIC.stats.Tx.Packets.Increment()
+				fwdNIC.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + vv.Size()))
+			}
+		}
+		return
+	}
+
+	n.stack.stats.IP.InvalidAddressesReceived.Increment()
+}
+
+// getRef returns a referenced endpoint that should receive a packet sent to
+// dst, or nil if the NIC does not accept such packets.
+//
+// A live endpoint registered for dst is returned directly. Otherwise, if the
+// NIC is in promiscuous mode or dst falls inside one of its subnets, a
+// temporary endpoint is created for dst (replacing a dead one if necessary).
+// In both cases the caller owns one reference and must release it with
+// decRef. nil is also returned when creating the temporary endpoint fails.
+func (n *NIC) getRef(protocol tcpip.NetworkProtocolNumber, dst tcpip.Address) *referencedNetworkEndpoint {
+	id := NetworkEndpointID{dst}
+
+	n.mu.RLock()
+	if ref, ok := n.endpoints[id]; ok && ref.tryIncRef() {
+		n.mu.RUnlock()
+		return ref
+	}
+
+	promiscuous := n.promiscuous
+	// Check if the packet is for a subnet this NIC cares about.
+	if !promiscuous {
+		for _, sn := range n.subnets {
+			if sn.Contains(dst) {
+				promiscuous = true
+				break
+			}
+		}
+	}
+	n.mu.RUnlock()
+	if !promiscuous {
+		return nil
+	}
+
+	// Try again with the lock in exclusive mode. If we still can't
+	// get the endpoint, create a new "temporary" one. It will only
+	// exist while there's a route through it.
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	if ref, ok := n.endpoints[id]; ok && ref.tryIncRef() {
+		return ref
+	}
+	ref, err := n.addAddressLocked(protocol, dst, CanBePrimaryEndpoint, true)
+	if err != nil {
+		return nil
+	}
+	// holdsInsertRef is protected by n.mu, so clear it before the lock is
+	// released. Otherwise a concurrent RemoveAddress could still see the
+	// insert reference and tear the endpoint down under the caller.
+	ref.holdsInsertRef = false
+	return ref
+}
+
+// DeliverTransportPacket delivers the packets to the appropriate transport
+// protocol endpoint.
+//
+// Raw endpoints are served first, without validating the payload. The packet
+// is then offered, in order, to the NIC's demuxer, the stack's demuxer, the
+// protocol's default handler (if any) and finally the protocol's handler for
+// packets with an unknown destination. Packets for an unknown protocol, too
+// short for the protocol, with unparsable ports, or rejected by that last
+// handler are only counted.
+func (n *NIC) DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) {
+	state, found := n.stack.transportProtocols[protocol]
+	if !found {
+		n.stack.stats.UnknownProtocolRcvdPackets.Increment()
+		return
+	}
+	transProto := state.proto
+
+	// Raw socket packets are delivered based solely on the transport
+	// protocol number. We do not inspect the payload to ensure it's
+	// validly formed.
+	if deliveredRaw := n.demux.deliverRawPacket(r, protocol, netHeader, vv); !deliveredRaw {
+		n.stack.demux.deliverRawPacket(r, protocol, netHeader, vv)
+	}
+
+	if len(vv.First()) < transProto.MinimumPacketSize() {
+		n.stack.stats.MalformedRcvdPackets.Increment()
+		return
+	}
+
+	srcPort, dstPort, err := transProto.ParsePorts(vv.First())
+	if err != nil {
+		n.stack.stats.MalformedRcvdPackets.Increment()
+		return
+	}
+
+	// For an incoming packet the destination is the local side.
+	id := TransportEndpointID{
+		LocalPort:     dstPort,
+		LocalAddress:  r.LocalAddress,
+		RemotePort:    srcPort,
+		RemoteAddress: r.RemoteAddress,
+	}
+	if n.demux.deliverPacket(r, protocol, netHeader, vv, id) || n.stack.demux.deliverPacket(r, protocol, netHeader, vv, id) {
+		return
+	}
+
+	// Try to deliver to per-stack default handler.
+	if handler := state.defaultHandler; handler != nil && handler(r, id, netHeader, vv) {
+		return
+	}
+
+	// We could not find an appropriate destination for this packet, so
+	// deliver it to the global handler.
+	if transProto.HandleUnknownDestinationPacket(r, id, vv) {
+		return
+	}
+	n.stack.stats.MalformedRcvdPackets.Increment()
+}
+
+// DeliverTransportControlPacket delivers control packets to the appropriate
+// transport protocol endpoint.
+func (n *NIC) DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView) {
+	state, ok := n.stack.transportProtocols[trans]
+	if !ok {
+		return
+	}
+
+	transProto := state.proto
+
+	// ICMPv4 only guarantees that 8 bytes of the transport protocol will
+	// be present in the payload. We know that the ports are within the
+	// first 8 bytes for all known transport protocols.
+	if len(vv.First()) < 8 {
+		return
+	}
+
+	srcPort, dstPort, err := transProto.ParsePorts(vv.First())
+	if err != nil {
+		return
+	}
+
+	id := TransportEndpointID{srcPort, local, dstPort, remote}
+	if n.demux.deliverControlPacket(net, trans, typ, extra, vv, id) {
+		return
+	}
+	if n.stack.demux.deliverControlPacket(net, trans, typ, extra, vv, id) {
+		return
+	}
+}
+
+// ID returns the identifier of n, i.e. the tcpip.NICID it was created with.
+func (n *NIC) ID() tcpip.NICID {
+	return n.id
+}
+
+// referencedNetworkEndpoint is a reference-counted network endpoint that
+// belongs to a NIC. It embeds ilist.Entry so that it can be linked into the
+// NIC's per-protocol list of primary endpoints. refs is the reference count;
+// the methods below only access it through sync/atomic. When the count drops
+// to zero the endpoint is removed from its NIC.
+type referencedNetworkEndpoint struct {
+	ilist.Entry
+	refs     int32
+	ep       NetworkEndpoint
+	nic      *NIC
+	protocol tcpip.NetworkProtocolNumber
+
+	// linkCache is set if link address resolution is enabled for this
+	// protocol. Set to nil otherwise.
+	linkCache LinkAddressCache
+
+	// holdsInsertRef is protected by the NIC's mutex. It indicates whether
+	// the reference count is biased by 1 due to the insertion of the
+	// endpoint. It is reset to false when RemoveAddress is called on the
+	// NIC.
+	holdsInsertRef bool
+}
+
+// decRef releases one reference. Releasing the last one removes the endpoint
+// from its NIC, which takes the NIC's mutex in exclusive mode; callers that
+// already hold that mutex must use decRefLocked instead.
+func (r *referencedNetworkEndpoint) decRef() {
+	if atomic.AddInt32(&r.refs, -1) != 0 {
+		return
+	}
+	r.nic.removeEndpoint(r)
+}
+
+// decRefLocked is decRef for callers that already hold the NIC.mu mutex: on
+// releasing the last reference it removes the endpoint without locking.
+func (r *referencedNetworkEndpoint) decRefLocked() {
+	if atomic.AddInt32(&r.refs, -1) != 0 {
+		return
+	}
+	r.nic.removeEndpointLocked(r)
+}
+
+// incRef increments the ref count. It must only be called when the caller is
+// known to be holding a reference to the endpoint, otherwise tryIncRef should
+// be used. Unlike tryIncRef it does not check that the count is non-zero.
+func (r *referencedNetworkEndpoint) incRef() {
+	atomic.AddInt32(&r.refs, 1)
+}
+
+// tryIncRef attempts to increment the ref count from n to n+1, but only if n is
+// not zero. That is, it increments the count and returns true if the endpoint
+// is still alive, and returns false without touching the count if the
+// endpoint has already been cleaned up.
+func (r *referencedNetworkEndpoint) tryIncRef() bool {
+	// Retry the compare-and-swap until it succeeds or the count is seen at
+	// zero.
+	for cur := atomic.LoadInt32(&r.refs); cur != 0; cur = atomic.LoadInt32(&r.refs) {
+		if atomic.CompareAndSwapInt32(&r.refs, cur, cur+1) {
+			return true
+		}
+	}
+	return false
+}
diff --git a/pkg/tcpip/stack/registration.go b/pkg/tcpip/stack/registration.go
new file mode 100644
index 000000000..c70533a35
--- /dev/null
+++ b/pkg/tcpip/stack/registration.go
@@ -0,0 +1,441 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"sync"
+
+	"gvisor.googlesource.com/gvisor/pkg/sleep"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+// NetworkEndpointID is the identifier of a network layer protocol endpoint.
+// Currently the local address alone is sufficient because all supported
+// protocols (i.e., IPv4 and IPv6) have different sizes for their addresses.
+type NetworkEndpointID struct {
+	LocalAddress tcpip.Address
+}
+
+// TransportEndpointID is the identifier of a transport layer protocol endpoint.
+//
+// +stateify savable
+type TransportEndpointID struct {
+	// LocalPort is the local port associated with the endpoint.
+	LocalPort uint16
+
+	// LocalAddress is the local [network layer] address associated with
+	// the endpoint.
+	LocalAddress tcpip.Address
+
+	// RemotePort is the remote port associated with the endpoint.
+	RemotePort uint16
+
+	// RemoteAddress is the remote [network layer] address associated with
+	// the endpoint.
+	RemoteAddress tcpip.Address
+}
+
+// ControlType is the type of a network control message.
+type ControlType int
+
+// The following are the allowed values for ControlType.
+const (
+	ControlPacketTooBig ControlType = iota
+	ControlPortUnreachable
+	ControlUnknown
+)
+
+// TransportEndpoint is the interface that needs to be implemented by transport
+// protocol (e.g., tcp, udp) endpoints that can handle packets.
+type TransportEndpoint interface {
+	// HandlePacket is called by the stack when new packets arrive at
+	// this transport endpoint.
+	HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView)
+
+	// HandleControlPacket is called by the stack when new control (e.g.,
+	// ICMP) packets arrive at this transport endpoint.
+	HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView)
+}
+
+// RawTransportEndpoint is the interface that needs to be implemented by raw
+// transport protocol endpoints. RawTransportEndpoints receive the entire
+// packet - including the link, network, and transport headers - as delivered
+// to netstack.
+type RawTransportEndpoint interface {
+	// HandlePacket is called by the stack when new packets arrive at
+	// this transport endpoint. The packet contains all data from the link
+	// layer up.
+	HandlePacket(r *Route, netHeader buffer.View, packet buffer.VectorisedView)
+}
+
+// TransportProtocol is the interface that needs to be implemented by transport
+// protocols (e.g., tcp, udp) that want to be part of the networking stack.
+type TransportProtocol interface {
+	// Number returns the transport protocol number.
+	Number() tcpip.TransportProtocolNumber
+
+	// NewEndpoint creates a new endpoint of the transport protocol.
+	NewEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+
+	// NewRawEndpoint creates a new raw endpoint of the transport protocol.
+	NewRawEndpoint(stack *Stack, netProto tcpip.NetworkProtocolNumber, waitQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error)
+
+	// MinimumPacketSize returns the minimum valid packet size of this
+	// transport protocol. The stack automatically drops any packets smaller
+	// than this targeted at this protocol.
+	MinimumPacketSize() int
+
+	// ParsePorts returns the source and destination ports stored in a
+	// packet of this protocol.
+	ParsePorts(v buffer.View) (src, dst uint16, err *tcpip.Error)
+
+	// HandleUnknownDestinationPacket handles packets targeted at this
+	// protocol that don't match any existing endpoint. For example, a
+	// packet targeted at a port that has no listeners.
+	//
+	// The return value indicates whether the packet was well-formed (for
+	// stats purposes only).
+	HandleUnknownDestinationPacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) bool
+
+	// SetOption allows enabling/disabling protocol specific features.
+	// SetOption returns an error if the option is not supported or the
+	// provided option value is invalid.
+	SetOption(option interface{}) *tcpip.Error
+
+	// Option allows retrieving protocol specific option values.
+	// Option returns an error if the option is not supported or the
+	// provided option value is invalid.
+	Option(option interface{}) *tcpip.Error
+}
+
+// TransportDispatcher contains the methods used by the network stack to deliver
+// packets to the appropriate transport endpoint after they have been handled
+// by the network layer.
+type TransportDispatcher interface {
+	// DeliverTransportPacket delivers packets to the appropriate
+	// transport protocol endpoint. It also takes the network layer
+	// header for the endpoint to inspect or pass up the stack.
+	DeliverTransportPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView)
+
+	// DeliverTransportControlPacket delivers control packets to the
+	// appropriate transport protocol endpoint.
+	DeliverTransportControlPacket(local, remote tcpip.Address, net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView)
+}
+
+// PacketLooping is a bit set specifying where an outbound packet is sent.
+type PacketLooping byte
+
+const (
+	// PacketOut indicates that the packet should be passed to the link
+	// endpoint.
+	PacketOut PacketLooping = 1 << iota
+
+	// PacketLoop indicates that the packet should be handled locally.
+	PacketLoop
+)
+
+// NetworkEndpoint is the interface that needs to be implemented by endpoints
+// of network layer protocols (e.g., ipv4, ipv6).
+type NetworkEndpoint interface {
+	// DefaultTTL is the default time-to-live value (or hop limit, in ipv6)
+	// for this endpoint.
+	DefaultTTL() uint8
+
+	// MTU is the maximum transmission unit for this endpoint. This is
+	// generally calculated as the MTU of the underlying data link endpoint
+	// minus the network endpoint max header length.
+	MTU() uint32
+
+	// Capabilities returns the set of capabilities supported by the
+	// underlying link-layer endpoint.
+	Capabilities() LinkEndpointCapabilities
+
+	// MaxHeaderLength returns the maximum size the network (and lower
+	// level layers combined) headers can have. Higher levels use this
+	// information to reserve space in the front of the packets they're
+	// building.
+	MaxHeaderLength() uint16
+
+	// WritePacket writes a packet to the given destination address and
+	// protocol.
+	WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8, loop PacketLooping) *tcpip.Error
+
+	// ID returns the network protocol endpoint ID.
+	ID() *NetworkEndpointID
+
+	// NICID returns the id of the NIC this endpoint belongs to.
+	NICID() tcpip.NICID
+
+	// HandlePacket is called by the link layer when new packets arrive at
+	// this network endpoint.
+	HandlePacket(r *Route, vv buffer.VectorisedView)
+
+	// Close is called when the endpoint is removed from a stack.
+	Close()
+}
+
+// NetworkProtocol is the interface that needs to be implemented by network
+// protocols (e.g., ipv4, ipv6) that want to be part of the networking stack.
+type NetworkProtocol interface {
+	// Number returns the network protocol number.
+	Number() tcpip.NetworkProtocolNumber
+
+	// MinimumPacketSize returns the minimum valid packet size of this
+	// network protocol. The stack automatically drops any packets smaller
+	// than this targeted at this protocol.
+	MinimumPacketSize() int
+
+	// ParseAddresses returns the source and destination addresses stored
+	// in a packet of this protocol.
+	ParseAddresses(v buffer.View) (src, dst tcpip.Address)
+
+	// NewEndpoint creates a new endpoint of this protocol.
+	NewEndpoint(nicid tcpip.NICID, addr tcpip.Address, linkAddrCache LinkAddressCache, dispatcher TransportDispatcher, sender LinkEndpoint) (NetworkEndpoint, *tcpip.Error)
+
+	// SetOption allows enabling/disabling protocol specific features.
+	// SetOption returns an error if the option is not supported or the
+	// provided option value is invalid.
+	SetOption(option interface{}) *tcpip.Error
+
+	// Option allows retrieving protocol specific option values.
+	// Option returns an error if the option is not supported or the
+	// provided option value is invalid.
+	Option(option interface{}) *tcpip.Error
+}
+
+// NetworkDispatcher contains the methods used by the network stack to deliver
+// packets to the appropriate network endpoint after they have been handled by
+// the data link layer.
+type NetworkDispatcher interface {
+	// DeliverNetworkPacket finds the appropriate network protocol
+	// endpoint and hands the packet over for further processing.
+	DeliverNetworkPacket(linkEP LinkEndpoint, remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+}
+
+// LinkEndpointCapabilities is the type associated with the capabilities
+// supported by a link-layer endpoint. It is a set of bitfields.
+type LinkEndpointCapabilities uint
+
+// Link endpoint capabilities. iota is 1 at the first flag, so bit 0 is unused.
+const (
+	CapabilityNone LinkEndpointCapabilities = 0
+	// CapabilityTXChecksumOffload indicates that the link endpoint supports
+	// checksum computation for outgoing packets and the stack can skip
+	// computing checksums when sending packets.
+	CapabilityTXChecksumOffload LinkEndpointCapabilities = 1 << iota
+	// CapabilityRXChecksumOffload indicates that the link endpoint supports
+	// checksum verification on received packets and that it's safe for the
+	// stack to skip checksum verification.
+	CapabilityRXChecksumOffload
+	CapabilityResolutionRequired
+	CapabilitySaveRestore
+	CapabilityDisconnectOk
+	CapabilityLoopback
+	CapabilityGSO
+)
+
+// LinkEndpoint is the interface implemented by data link layer protocols (e.g.,
+// ethernet, loopback, raw) and used by network layer protocols to send packets
+// out through the implementer's data link endpoint.
+type LinkEndpoint interface {
+	// MTU is the maximum transmission unit for this endpoint. This is
+	// usually dictated by the backing physical network; when such a
+	// physical network doesn't exist, the limit is generally 64k, which
+	// includes the maximum size of an IP packet.
+	MTU() uint32
+
+	// Capabilities returns the set of capabilities supported by the
+	// endpoint.
+	Capabilities() LinkEndpointCapabilities
+
+	// MaxHeaderLength returns the maximum size the data link (and
+	// lower level layers combined) headers can have. Higher levels use this
+	// information to reserve space in the front of the packets they're
+	// building.
+	MaxHeaderLength() uint16
+
+	// LinkAddress returns the link address (typically a MAC) of the
+	// link endpoint.
+	LinkAddress() tcpip.LinkAddress
+
+	// WritePacket writes a packet with the given protocol through the given
+	// route.
+	//
+	// To participate in transparent bridging, a LinkEndpoint implementation
+	// should call eth.Encode with header.EthernetFields.SrcAddr set to
+	// r.LocalLinkAddress if it is provided.
+	WritePacket(r *Route, gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.NetworkProtocolNumber) *tcpip.Error
+
+	// Attach attaches the data link layer endpoint to the network-layer
+	// dispatcher of the stack.
+	Attach(dispatcher NetworkDispatcher)
+
+	// IsAttached reports whether a NetworkDispatcher is attached to the
+	// endpoint.
+	IsAttached() bool
+}
+
+// InjectableLinkEndpoint is a LinkEndpoint where inbound packets are
+// delivered via the Inject method.
+type InjectableLinkEndpoint interface {
+	LinkEndpoint
+
+	// Inject injects an inbound packet of the given network protocol.
+	Inject(protocol tcpip.NetworkProtocolNumber, vv buffer.VectorisedView)
+
+	// WriteRawPacket writes a fully formed outbound packet directly to the link.
+	//
+	// dest is used by endpoints with multiple raw destinations.
+	WriteRawPacket(dest tcpip.Address, packet []byte) *tcpip.Error
+}
+
+// A LinkAddressResolver is an extension to a NetworkProtocol that
+// can resolve link addresses.
+type LinkAddressResolver interface {
+	// LinkAddressRequest sends a request for the LinkAddress of addr.
+	// The request is sent on linkEP with localAddr as the source.
+	//
+	// A valid response will cause the discovery protocol's network
+	// endpoint to call AddLinkAddress.
+	LinkAddressRequest(addr, localAddr tcpip.Address, linkEP LinkEndpoint) *tcpip.Error
+
+	// ResolveStaticAddress attempts to resolve address without sending
+	// requests. It either resolves the name immediately or returns the
+	// empty LinkAddress.
+	//
+	// It can be used to resolve broadcast addresses for example.
+	ResolveStaticAddress(addr tcpip.Address) (tcpip.LinkAddress, bool)
+
+	// LinkAddressProtocol returns the network protocol of the
+	// addresses that this resolver can resolve.
+	LinkAddressProtocol() tcpip.NetworkProtocolNumber
+}
+
+// A LinkAddressCache caches link addresses.
+type LinkAddressCache interface {
+	// CheckLocalAddress checks whether the given local address exists and
+	// reports a tcpip.NICID. TODO(review): document the not-found value.
+	CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID
+
+	// AddLinkAddress adds a link address to the cache.
+	AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress)
+
+	// GetLinkAddress looks up the cache to translate address to link address
+	// (e.g. IP -> MAC). If the LinkEndpoint requests address resolution and
+	// there is a LinkAddressResolver registered with the network protocol,
+	// the cache attempts to resolve the address and returns ErrWouldBlock.
+	// Waker is notified when address resolution is complete (success or not).
+	// If address resolution is required, ErrNoLinkAddress and a notification
+	// channel are returned for the top level caller to block; the channel is
+	// closed once address resolution is complete (success or not).
+	// NOTE(review): ErrWouldBlock vs. ErrNoLinkAddress above -- confirm which.
+	GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, w *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error)
+
+	// RemoveWaker removes a waker that has been added in GetLinkAddress().
+	RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker)
+}
+
+// TransportProtocolFactory instantiates a TransportProtocol for the stack.
+type TransportProtocolFactory func() TransportProtocol
+
+// NetworkProtocolFactory instantiates a NetworkProtocol for the stack.
+type NetworkProtocolFactory func() NetworkProtocol
+
+var (
+	// Protocol factories by name; written without locking by Register*Factory.
+	transportProtocols = make(map[string]TransportProtocolFactory)
+	networkProtocols   = make(map[string]NetworkProtocolFactory)
+
+	// linkEPMu protects nextLinkEndpointID and linkEndpoints.
+	linkEPMu           sync.RWMutex
+	nextLinkEndpointID tcpip.LinkEndpointID = 1
+	linkEndpoints                           = make(map[tcpip.LinkEndpointID]LinkEndpoint)
+)
+
+// RegisterTransportProtocolFactory registers a new transport protocol factory
+// under name so that it becomes available to users of the stack. It is meant
+// to be called by init() functions of the protocols (no locking is done).
+func RegisterTransportProtocolFactory(name string, p TransportProtocolFactory) {
+	transportProtocols[name] = p
+}
+
+// RegisterNetworkProtocolFactory registers a new network protocol factory
+// under name so that it becomes available to users of the stack. It is meant
+// to be called by init() functions of the protocols (no locking is done).
+func RegisterNetworkProtocolFactory(name string, p NetworkProtocolFactory) {
+	networkProtocols[name] = p
+}
+
+// RegisterLinkEndpoint registers a link-layer protocol endpoint and returns
+// the ID under which it can later be looked up.
+func RegisterLinkEndpoint(linkEP LinkEndpoint) tcpip.LinkEndpointID {
+	linkEPMu.Lock()
+	defer linkEPMu.Unlock()
+
+	// Hand out the current ID and advance the counter for the next caller.
+	id := nextLinkEndpointID
+	linkEndpoints[id] = linkEP
+	nextLinkEndpointID++
+
+	return id
+}
+
+// FindLinkEndpoint looks up the link endpoint registered under the given ID.
+func FindLinkEndpoint(id tcpip.LinkEndpointID) LinkEndpoint {
+	linkEPMu.RLock()
+	ep := linkEndpoints[id]
+	linkEPMu.RUnlock()
+	return ep
+}
+
+// GSOType identifies the kind of GSO segments.
+//
+// +stateify savable
+type GSOType int
+
+// Supported GSOType values.
+const (
+	GSONone GSOType = iota
+	GSOTCPv4
+	GSOTCPv6
+)
+
+// GSO contains generic segmentation offload properties.
+//
+// +stateify savable
+type GSO struct {
+	// Type is one of GSONone, GSOTCPv4, etc.
+	Type GSOType
+	// NeedsCsum is set if the checksum offload is enabled.
+	NeedsCsum bool
+	// CsumOffset is the offset after which to place the checksum.
+	CsumOffset uint16
+
+	// MSS is the maximum segment size.
+	MSS uint16
+	// L3HdrLen is the L3 (IP) header length.
+	L3HdrLen uint16
+
+	// MaxSize is the maximum GSO packet size.
+	MaxSize uint32
+}
+
+// GSOEndpoint is implemented by endpoints that expose GSO properties.
+type GSOEndpoint interface {
+	// GSOMaxSize returns the maximum GSO packet size.
+	GSOMaxSize() uint32
+}
diff --git a/pkg/tcpip/stack/route.go b/pkg/tcpip/stack/route.go
new file mode 100644
index 000000000..3d4c282a9
--- /dev/null
+++ b/pkg/tcpip/stack/route.go
@@ -0,0 +1,189 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/sleep"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+)
+
+// Route represents a route through the networking stack to a given destination.
+type Route struct {
+	// RemoteAddress is the final destination of the route.
+	RemoteAddress tcpip.Address
+
+	// RemoteLinkAddress is the link-layer (MAC) address of the
+	// final destination of the route.
+	RemoteLinkAddress tcpip.LinkAddress
+
+	// LocalAddress is the local address where the route starts.
+	LocalAddress tcpip.Address
+
+	// LocalLinkAddress is the link-layer (MAC) address of the
+	// point where the route starts.
+	LocalLinkAddress tcpip.LinkAddress
+
+	// NextHop is the next node in the path to the destination.
+	NextHop tcpip.Address
+
+	// NetProto is the network-layer protocol.
+	NetProto tcpip.NetworkProtocolNumber
+
+	// ref is a reference to the network endpoint through which the route
+	// starts.
+	ref *referencedNetworkEndpoint
+
+	// loop controls where WritePacket should send packets.
+	loop PacketLooping
+}
+
+// makeRoute builds a new Route from its arguments and decides where packets
+// written to it go. It takes ownership of the provided endpoint reference.
+func makeRoute(netProto tcpip.NetworkProtocolNumber, localAddr, remoteAddr tcpip.Address, localLinkAddr tcpip.LinkAddress, ref *referencedNetworkEndpoint, handleLocal, multicastLoop bool) Route {
+	looping := PacketOut
+	switch {
+	case handleLocal && localAddr != "" && remoteAddr == localAddr:
+		looping = PacketLoop
+	case multicastLoop && (header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)):
+		looping |= PacketLoop
+	}
+
+	return Route{
+		RemoteAddress:    remoteAddr,
+		LocalAddress:     localAddr,
+		LocalLinkAddress: localLinkAddr,
+		NetProto:         netProto,
+		ref:              ref,
+		loop:             looping,
+	}
+}
+
+// NICID returns the NIC id reported by the route's network endpoint.
+func (r *Route) NICID() tcpip.NICID {
+	return r.ref.ep.NICID()
+}
+
+// MaxHeaderLength returns the MaxHeaderLength of the route's network endpoint.
+func (r *Route) MaxHeaderLength() uint16 {
+	return r.ref.ep.MaxHeaderLength()
+}
+
+// Stats returns a mutable copy of the current stats of the route's stack.
+func (r *Route) Stats() tcpip.Stats {
+	return r.ref.nic.stack.Stats()
+}
+
+// PseudoHeaderChecksum calls header.PseudoHeaderChecksum with the route's
+// local and remote addresses.
+func (r *Route) PseudoHeaderChecksum(protocol tcpip.TransportProtocolNumber, totalLen uint16) uint16 {
+	return header.PseudoHeaderChecksum(protocol, r.LocalAddress, r.RemoteAddress, totalLen)
+}
+
+// Capabilities returns the route's link-layer capabilities (via r.ref.ep).
+func (r *Route) Capabilities() LinkEndpointCapabilities {
+	return r.ref.ep.Capabilities()
+}
+
+// GSOMaxSize returns the max GSO packet size, or 0 if ep is not a GSOEndpoint.
+func (r *Route) GSOMaxSize() uint32 {
+	if gso, ok := r.ref.ep.(GSOEndpoint); ok {
+		return gso.GSOMaxSize()
+	}
+	return 0
+}
+
+// Resolve attempts to resolve the link address if necessary. It returns
+// ErrWouldBlock if resolution requires blocking, e.g. waiting for an ARP
+// reply. Waker is notified when resolution is complete (success or not).
+//
+// If address resolution is required, ErrNoLinkAddress and a notification
+// channel are returned for the top level caller to block; the channel is
+// closed once address resolution is complete (success or not).
+// NOTE(review): ErrWouldBlock vs. ErrNoLinkAddress above -- confirm which.
+func (r *Route) Resolve(waker *sleep.Waker) (<-chan struct{}, *tcpip.Error) {
+	if !r.IsResolutionRequired() {
+		// Nothing to do: no cache (it resolves on miss) or link address is known.
+		return nil, nil
+	}
+
+	hop := r.NextHop
+	if hop == "" {
+		hop = r.RemoteAddress
+		if hop == r.LocalAddress {
+			// Sending to ourselves: the local link address is already known.
+			r.RemoteLinkAddress = r.LocalLinkAddress
+			return nil, nil
+		}
+	}
+	resolved, done, err := r.ref.linkCache.GetLinkAddress(r.ref.nic.ID(), hop, r.LocalAddress, r.NetProto, waker)
+	if err != nil {
+		return done, err
+	}
+	r.RemoteLinkAddress = resolved
+	return nil, nil
+}
+
+// RemoveWaker unregisters a waker previously handed to Resolve().
+func (r *Route) RemoveWaker(waker *sleep.Waker) {
+	hop := r.RemoteAddress
+	if r.NextHop != "" {
+		hop = r.NextHop
+	}
+	r.ref.linkCache.RemoveWaker(r.ref.nic.ID(), hop, waker)
+}
+
+// IsResolutionRequired returns true if Resolve() must be called to resolve
+// the link address before this route can be written to.
+func (r *Route) IsResolutionRequired() bool {
+	return r.ref.linkCache != nil && r.RemoteLinkAddress == ""
+}
+
+// WritePacket writes the packet through the route and updates the counters.
+func (r *Route) WritePacket(gso *GSO, hdr buffer.Prependable, payload buffer.VectorisedView, protocol tcpip.TransportProtocolNumber, ttl uint8) *tcpip.Error {
+	err := r.ref.ep.WritePacket(r, gso, hdr, payload, protocol, ttl, r.loop)
+	if err != nil {
+		r.Stats().IP.OutgoingPacketErrors.Increment()
+		return err
+	}
+	r.ref.nic.stats.Tx.Packets.Increment()
+	r.ref.nic.stats.Tx.Bytes.IncrementBy(uint64(hdr.UsedLength() + payload.Size()))
+	return nil
+}
+
+// DefaultTTL returns the default TTL of the route's network endpoint.
+func (r *Route) DefaultTTL() uint8 {
+	return r.ref.ep.DefaultTTL()
+}
+
+// MTU returns the MTU of the route's network endpoint.
+func (r *Route) MTU() uint32 {
+	return r.ref.ep.MTU()
+}
+
+// Release drops the route's endpoint reference, if it still holds one.
+func (r *Route) Release() {
+	if held := r.ref; held != nil {
+		held.decRef()
+		r.ref = nil
+	}
+}
+
+// Clone clones a route such that the original one can be released and the new
+// one will remain valid. It takes an extra reference on the network endpoint.
+func (r *Route) Clone() Route {
+	r.ref.incRef()
+	return *r
+}
diff --git a/pkg/tcpip/stack/stack.go b/pkg/tcpip/stack/stack.go
new file mode 100644
index 000000000..9d8e8cda5
--- /dev/null
+++ b/pkg/tcpip/stack/stack.go
@@ -0,0 +1,1095 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package stack provides the glue between networking protocols and the
+// consumers of the networking stack.
+//
+// For consumers, the only function of interest is New(), everything else is
+// provided by the tcpip/public package.
+//
+// For protocol implementers, RegisterTransportProtocolFactory() and
+// RegisterNetworkProtocolFactory() are used to register protocol factories with
+// the stack, which will then be used to instantiate protocol objects when
+// consumers interact with the stack.
+package stack
+
+import (
+	"sync"
+	"time"
+
+	"gvisor.googlesource.com/gvisor/pkg/sleep"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/ports"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/seqnum"
+	"gvisor.googlesource.com/gvisor/pkg/waiter"
+)
+
+const (
+	// ageLimit (1 minute) matches the cache stale time used in Linux.
+	ageLimit = 1 * time.Minute
+	// resolutionTimeout (1 second) matches the ARP timeout used in Linux.
+	resolutionTimeout = 1 * time.Second
+	// resolutionAttempts matches the number of ARP retries used in Linux.
+	resolutionAttempts = 3
+)
+
+// transportProtocolState holds a transport protocol and its default handler.
+type transportProtocolState struct {
+	proto          TransportProtocol
+	defaultHandler func(r *Route, id TransportEndpointID, netHeader buffer.View, vv buffer.VectorisedView) bool
+}
+
+// TCPProbeFunc is the function type of probes passed to stack.AddTCPProbe.
+type TCPProbeFunc func(s TCPEndpointState)
+
+// TCPCubicState holds a copy of the internal CUBIC state that is handed to a
+// TCPProbeFunc when it is invoked.
+type TCPCubicState struct {
+	WLastMax                float64
+	WMax                    float64
+	T                       time.Time
+	TimeSinceLastCongestion time.Duration
+	C                       float64
+	K                       float64
+	Beta                    float64
+	WC                      float64
+	WEst                    float64
+}
+
+// TCPEndpointID is the unique 4 tuple that identifies a given endpoint.
+type TCPEndpointID struct {
+	// LocalPort is the local port associated with the endpoint.
+	LocalPort uint16
+
+	// LocalAddress is the local [network layer] address associated with
+	// the endpoint.
+	LocalAddress tcpip.Address
+
+	// RemotePort is the remote port associated with the endpoint.
+	RemotePort uint16
+
+	// RemoteAddress is the remote [network layer] address associated with
+	// the endpoint.
+	RemoteAddress tcpip.Address
+}
+
+// TCPFastRecoveryState holds a copy of the internal fast recovery state of a
+// TCP endpoint.
+type TCPFastRecoveryState struct {
+	// Active, if true, indicates the endpoint is in fast recovery.
+	Active bool
+
+	// First is the first unacknowledged sequence number being recovered.
+	First seqnum.Value
+
+	// Last is the 'recover' sequence number that indicates the point at
+	// which we should exit recovery barring any timeouts etc.
+	Last seqnum.Value
+
+	// MaxCwnd is the maximum value we are permitted to grow the congestion
+	// window during recovery. This is set at the time we enter recovery.
+	MaxCwnd int
+
+	// HighRxt is the highest sequence number which has been retransmitted
+	// during the current loss recovery phase.
+	// See: RFC 6675 Section 2 for details.
+	HighRxt seqnum.Value
+
+	// RescueRxt is the highest sequence number which has been
+	// optimistically retransmitted to prevent stalling of the ACK clock
+	// when there is loss at the end of the window and no new data is
+	// available for transmission.
+	// See: RFC 6675 Section 2 for details.
+	RescueRxt seqnum.Value
+}
+
+// TCPReceiverState holds a copy of the internal state of the receiver for
+// a given TCP endpoint.
+type TCPReceiverState struct {
+	// RcvNxt is the TCP variable RCV.NXT.
+	RcvNxt seqnum.Value
+
+	// RcvAcc is the TCP variable RCV.ACC.
+	RcvAcc seqnum.Value
+
+	// RcvWndScale is the window scaling to use for inbound segments.
+	RcvWndScale uint8
+
+	// PendingBufUsed is the number of bytes pending in the receive
+	// queue.
+	PendingBufUsed seqnum.Size
+
+	// PendingBufSize is the size of the socket receive buffer.
+	PendingBufSize seqnum.Size
+}
+
+// TCPSenderState holds a copy of the internal state of the sender for
+// a given TCP endpoint.
+type TCPSenderState struct {
+	// LastSendTime is the time at which we sent the last segment.
+	LastSendTime time.Time
+
+	// DupAckCount is the number of duplicate ACKs received.
+	DupAckCount int
+
+	// SndCwnd is the size of the sending congestion window in packets.
+	SndCwnd int
+
+	// Ssthresh is the slow start threshold in packets.
+	Ssthresh int
+
+	// SndCAAckCount is the number of packets consumed in congestion
+	// avoidance mode.
+	SndCAAckCount int
+
+	// Outstanding is the number of packets in flight.
+	Outstanding int
+
+	// SndWnd is the send window size in bytes.
+	SndWnd seqnum.Size
+
+	// SndUna is the next unacknowledged sequence number.
+	SndUna seqnum.Value
+
+	// SndNxt is the sequence number of the next segment to be sent.
+	SndNxt seqnum.Value
+
+	// RTTMeasureSeqNum is the sequence number being used for the latest RTT
+	// measurement.
+	RTTMeasureSeqNum seqnum.Value
+
+	// RTTMeasureTime is the time when the RTTMeasureSeqNum was sent.
+	RTTMeasureTime time.Time
+
+	// Closed indicates that the caller has closed the endpoint for sending.
+	Closed bool
+
+	// SRTT is the smoothed round-trip time as defined in section 2 of
+	// RFC 6298.
+	SRTT time.Duration
+
+	// RTO is the retransmit timeout as defined in section 2 of RFC 6298.
+	RTO time.Duration
+
+	// RTTVar is the round-trip time variation as defined in section 2 of
+	// RFC 6298.
+	RTTVar time.Duration
+
+	// SRTTInited, if true, indicates that a valid RTT measurement has been
+	// completed.
+	SRTTInited bool
+
+	// MaxPayloadSize is the maximum size of the payload of a given segment.
+	// It is initialized on demand.
+	MaxPayloadSize int
+
+	// SndWndScale is the number of bits to shift left when reading the send
+	// window size from a segment.
+	SndWndScale uint8
+
+	// MaxSentAck is the highest acknowledgement number sent till now.
+	MaxSentAck seqnum.Value
+
+	// FastRecovery holds the fast recovery state for the endpoint.
+	FastRecovery TCPFastRecoveryState
+
+	// Cubic holds the state related to CUBIC congestion control.
+	Cubic TCPCubicState
+}
+
+// TCPSACKInfo holds TCP SACK related information for a given TCP endpoint.
+type TCPSACKInfo struct {
+	// Blocks is the list of SACK blocks that identify the out of order
+	// segments held by a given TCP endpoint.
+	Blocks []header.SACKBlock
+
+	// ReceivedBlocks are the SACK blocks received by this endpoint
+	// from the peer endpoint.
+	ReceivedBlocks []header.SACKBlock
+
+	// MaxSACKED is the highest sequence number that has been SACKED
+	// by the peer.
+	MaxSACKED seqnum.Value
+}
+
+// TCPEndpointState is a copy of the internal state of a TCP endpoint.
+type TCPEndpointState struct {
+	// ID is a copy of the endpoint's TransportEndpointID, as a TCPEndpointID.
+	ID TCPEndpointID
+
+	// SegTime denotes the absolute time when this segment was received.
+	SegTime time.Time
+
+	// RcvBufSize is the size of the receive socket buffer for the endpoint.
+	RcvBufSize int
+
+	// RcvBufUsed is the number of bytes actually held in the receive socket
+	// buffer for the endpoint.
+	RcvBufUsed int
+
+	// RcvClosed, if true, indicates the endpoint has been closed for reading.
+	RcvClosed bool
+
+	// SendTSOk is used to indicate when the TS Option has been negotiated.
+	// When SendTSOk is true every non-RST segment should carry a TS as per
+	// RFC7323#section-1.1.
+	SendTSOk bool
+
+	// RecentTS is the timestamp that should be sent in the TSEcr field of
+	// the timestamp for future segments sent by the endpoint. This field is
+	// updated if required when a new segment is received by this endpoint.
+	RecentTS uint32
+
+	// TSOffset is a randomized offset added to the value of the TSVal field
+	// in the timestamp option.
+	TSOffset uint32
+
+	// SACKPermitted is set to true if the peer sends the TCPSACKPermitted
+	// option in the SYN/SYN-ACK.
+	SACKPermitted bool
+
+	// SACK holds TCP SACK related information for this endpoint.
+	SACK TCPSACKInfo
+
+	// SndBufSize is the size of the socket send buffer.
+	SndBufSize int
+
+	// SndBufUsed is the number of bytes held in the socket send buffer.
+	SndBufUsed int
+
+	// SndClosed indicates that the endpoint has been closed for sends.
+	SndClosed bool
+
+	// SndBufInQueue is the number of bytes in the send queue.
+	SndBufInQueue seqnum.Size
+
+	// PacketTooBigCount is used to notify the main protocol routine how
+	// many times a "packet too big" control packet is received.
+	PacketTooBigCount int
+
+	// SndMTU is the smallest MTU seen in the control packets received.
+	SndMTU int
+
+	// Receiver holds variables related to the TCP receiver for the endpoint.
+	Receiver TCPReceiverState
+
+	// Sender holds state related to the TCP Sender for the endpoint.
+	Sender TCPSenderState
+}
+
+// Stack is a networking stack, with all supported protocols, NICs, and route
+// table.
+type Stack struct {
+	transportProtocols map[tcpip.TransportProtocolNumber]*transportProtocolState
+	networkProtocols   map[tcpip.NetworkProtocolNumber]NetworkProtocol
+	linkAddrResolvers  map[tcpip.NetworkProtocolNumber]LinkAddressResolver
+
+	demux *transportDemuxer
+
+	stats tcpip.Stats
+
+	linkAddrCache *linkAddrCache
+
+	// raw indicates whether raw sockets may be created. It is set during
+	// Stack creation and is immutable.
+	raw bool
+
+	mu         sync.RWMutex
+	nics       map[tcpip.NICID]*NIC
+	forwarding bool
+
+	// routeTable is the route table passed in by the user via
+	// SetRouteTable(); it is used by FindRoute() to build a route for a
+	// specific destination.
+	routeTable []tcpip.Route
+
+	*ports.PortManager
+
+	// If not nil, then any new endpoints will have this probe function
+	// invoked every time they receive a TCP segment.
+	tcpProbeFunc TCPProbeFunc
+
+	// clock is used to generate user-visible times.
+	clock tcpip.Clock
+
+	// handleLocal allows non-loopback interfaces to loop packets.
+	handleLocal bool
+}
+
+// Options contains optional Stack configuration.
+type Options struct {
+	// Clock is an optional clock source used for timestamping packets.
+	//
+	// If no Clock is specified, New falls back to a tcpip.StdClock.
+	Clock tcpip.Clock
+
+	// Stats are optional statistic counters.
+	Stats tcpip.Stats
+
+	// HandleLocal indicates whether packets destined to their source
+	// should be handled by the stack internally (true) or outside the
+	// stack (false).
+	HandleLocal bool
+
+	// Raw indicates whether raw sockets may be created.
+	Raw bool
+}
+
+// New allocates a new networking stack with only the requested networking and
+// transport protocols configured with default options. Protocol names that
+// have no registered factory are silently ignored.
+//
+// Protocol options can be changed by calling the stack's
+// SetNetworkProtocolOption/SetTransportProtocolOption methods. Refer to the
+// individual protocol implementations for the options they support.
+func New(network []string, transport []string, opts Options) *Stack {
+	clock := opts.Clock
+	if clock == nil {
+		clock = &tcpip.StdClock{}
+	}
+
+	s := &Stack{
+		transportProtocols: make(map[tcpip.TransportProtocolNumber]*transportProtocolState),
+		networkProtocols:   make(map[tcpip.NetworkProtocolNumber]NetworkProtocol),
+		linkAddrResolvers:  make(map[tcpip.NetworkProtocolNumber]LinkAddressResolver),
+		nics:               make(map[tcpip.NICID]*NIC),
+		linkAddrCache:      newLinkAddrCache(ageLimit, resolutionTimeout, resolutionAttempts),
+		PortManager:        ports.NewPortManager(),
+		clock:              clock,
+		stats:              opts.Stats.FillIn(),
+		handleLocal:        opts.HandleLocal,
+		raw:                opts.Raw,
+	}
+
+	// Instantiate the requested network protocols; unknown names are skipped.
+	for _, name := range network {
+		netProtoFactory, ok := networkProtocols[name]
+		if !ok {
+			continue
+		}
+		netProto := netProtoFactory()
+		s.networkProtocols[netProto.Number()] = netProto
+		if r, ok := netProto.(LinkAddressResolver); ok {
+			s.linkAddrResolvers[r.LinkAddressProtocol()] = r
+		}
+	}
+
+	// Instantiate the requested transport protocols; unknown names are skipped.
+	for _, name := range transport {
+		transProtoFactory, ok := transportProtocols[name]
+		if !ok {
+			continue
+		}
+		transProto := transProtoFactory()
+		s.transportProtocols[transProto.Number()] = &transportProtocolState{
+			proto: transProto,
+		}
+	}
+
+	// Create the global transport demuxer.
+	s.demux = newTransportDemuxer(s)
+
+	return s
+}
+
+// SetNetworkProtocolOption configures an option on a single network
+// protocol. It returns tcpip.ErrUnknownProtocol if the protocol is not
+// registered with the stack; otherwise it returns whatever the protocol's
+// SetOption reports (e.g. for an unsupported option or an incorrect
+// value).
+func (s *Stack) SetNetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+	if proto, found := s.networkProtocols[network]; found {
+		return proto.SetOption(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// NetworkProtocolOption retrieves the value of an option from a single
+// network protocol. It returns tcpip.ErrUnknownProtocol if the protocol is
+// not registered with the stack; otherwise it returns the result of the
+// protocol's Option method. For example:
+//
+//	var v ipv4.MyOption
+//	err := s.NetworkProtocolOption(tcpip.IPv4ProtocolNumber, &v)
+//	if err != nil {
+//		...
+//	}
+func (s *Stack) NetworkProtocolOption(network tcpip.NetworkProtocolNumber, option interface{}) *tcpip.Error {
+	if proto, found := s.networkProtocols[network]; found {
+		return proto.Option(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// SetTransportProtocolOption configures an option on a single transport
+// protocol. It returns tcpip.ErrUnknownProtocol if the protocol is not
+// registered with the stack; otherwise it returns whatever the protocol's
+// SetOption reports (e.g. for an unsupported option or an incorrect
+// value).
+func (s *Stack) SetTransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+	if state, found := s.transportProtocols[transport]; found {
+		return state.proto.SetOption(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// TransportProtocolOption retrieves the value of an option from a single
+// transport protocol. It returns tcpip.ErrUnknownProtocol if the protocol is
+// not registered with the stack. For example:
+//
+//	var v tcp.SACKEnabled
+//	if err := s.TransportProtocolOption(tcpip.TCPProtocolNumber, &v); err != nil {
+//		...
+//	}
+func (s *Stack) TransportProtocolOption(transport tcpip.TransportProtocolNumber, option interface{}) *tcpip.Error {
+	if state, found := s.transportProtocols[transport]; found {
+		return state.proto.Option(option)
+	}
+	return tcpip.ErrUnknownProtocol
+}
+
+// SetTransportProtocolHandler installs h as the per-stack default handler for
+// transport protocol p. It is a no-op if p is not registered with the
+// stack.
+//
+// It must be called only during initialization of the stack. Changing it as the
+// stack is operating is not supported.
+func (s *Stack) SetTransportProtocolHandler(p tcpip.TransportProtocolNumber, h func(*Route, TransportEndpointID, buffer.View, buffer.VectorisedView) bool) {
+	if protoState := s.transportProtocols[p]; protoState != nil {
+		protoState.defaultHandler = h
+	}
+}
+
+// NowNanoseconds implements tcpip.Clock.NowNanoseconds using the stack's clock.
+func (s *Stack) NowNanoseconds() int64 {
+	return s.clock.NowNanoseconds()
+}
+
+// Stats returns a mutable copy of the stack's current tcpip.Stats.
+//
+// This is not generally exported via the public interface, but is available
+// internally.
+func (s *Stack) Stats() tcpip.Stats {
+	return s.stats
+}
+
+// SetForwarding turns packet forwarding between NICs on or off.
+func (s *Stack) SetForwarding(enable bool) {
+	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.forwarding = enable
+}
+
+// Forwarding reports whether packet forwarding between NICs is enabled.
+func (s *Stack) Forwarding() bool {
+	// TODO(igudger, bgeffon): Expose via /proc/sys/net/ipv4/ip_forward.
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return s.forwarding
+}
+
+// SetRouteTable installs table as the stack's route table, which specifies
+// which NIC to use for given destination address ranges. The slice is stored
+// as-is, without copying.
+func (s *Stack) SetRouteTable(table []tcpip.Route) {
+	s.mu.Lock()
+	s.routeTable = table
+	s.mu.Unlock()
+}
+
+// GetRouteTable returns a copy of the route table which is currently in use.
+func (s *Stack) GetRouteTable() []tcpip.Route {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return append([]tcpip.Route(nil), s.routeTable...)
+}
+
+// NewEndpoint creates a new transport layer endpoint of the given protocol.
+// It returns tcpip.ErrUnknownProtocol if the transport protocol is not
+// registered with the stack.
+func (s *Stack) NewEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	if state, found := s.transportProtocols[transport]; found {
+		return state.proto.NewEndpoint(s, network, waiterQueue)
+	}
+	return nil, tcpip.ErrUnknownProtocol
+}
+
+// NewRawEndpoint creates a new raw transport layer endpoint of the given
+// protocol. Raw endpoints receive all traffic for a given protocol regardless
+// of address. It returns tcpip.ErrNotPermitted if the stack was created
+// without raw socket support, and tcpip.ErrUnknownProtocol if the transport
+// protocol is not registered.
+func (s *Stack) NewRawEndpoint(transport tcpip.TransportProtocolNumber, network tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) (tcpip.Endpoint, *tcpip.Error) {
+	state, found := s.transportProtocols[transport]
+	switch {
+	case !s.raw:
+		return nil, tcpip.ErrNotPermitted
+	case !found:
+		return nil, tcpip.ErrUnknownProtocol
+	}
+	return state.proto.NewRawEndpoint(s, network, waiterQueue)
+}
+
+// createNIC creates a NIC with the provided id and link-layer endpoint, and
+// optionally enables it.
+func (s *Stack) createNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID, enabled, loopback bool) *tcpip.Error {
+	ep := FindLinkEndpoint(linkEP)
+	if ep == nil {
+		return tcpip.ErrBadLinkEndpoint
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// Make sure id is unique.
+	if _, ok := s.nics[id]; ok {
+		return tcpip.ErrDuplicateNICID
+	}
+
+	n := newNIC(s, id, name, ep, loopback)
+
+	s.nics[id] = n
+	if enabled {
+		n.attachLinkEndpoint()
+	}
+
+	return nil
+}
+
+// CreateNIC creates and enables an unnamed NIC with the given id and endpoint.
+func (s *Stack) CreateNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error {
+	return s.createNIC(id, "", linkEP, true, false)
+}
+
+// CreateNamedNIC creates and enables a NIC with the provided id, link-layer
+// endpoint and human-readable name.
+func (s *Stack) CreateNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error {
+	return s.createNIC(id, name, linkEP, true, false)
+}
+
+// CreateNamedLoopbackNIC creates and enables a loopback NIC with the provided
+// id, link-layer endpoint and human-readable name.
+func (s *Stack) CreateNamedLoopbackNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error {
+	return s.createNIC(id, name, linkEP, true, true)
+}
+
+// CreateDisabledNIC creates a NIC with the provided id and link-layer endpoint,
+// but leaves it disabled. Stack.EnableNIC must be called before the link-layer
+// endpoint starts delivering packets to it.
+func (s *Stack) CreateDisabledNIC(id tcpip.NICID, linkEP tcpip.LinkEndpointID) *tcpip.Error {
+	return s.createNIC(id, "", linkEP, false, false)
+}
+
+// CreateDisabledNamedNIC is a combination of CreateNamedNIC and
+// CreateDisabledNIC: it creates a named NIC but leaves it disabled.
+func (s *Stack) CreateDisabledNamedNIC(id tcpip.NICID, name string, linkEP tcpip.LinkEndpointID) *tcpip.Error {
+	return s.createNIC(id, name, linkEP, false, false)
+}
+
+// EnableNIC enables the given NIC so that the link-layer endpoint can start
+// delivering packets to it.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) EnableNIC(id tcpip.NICID) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[id]; target != nil {
+		target.attachLinkEndpoint()
+		return nil
+	}
+
+	return tcpip.ErrUnknownNICID
+}
+
+// CheckNIC reports whether a NIC with the given id exists and is attached.
+func (s *Stack) CheckNIC(id tcpip.NICID) bool {
+	s.mu.RLock()
+	target, found := s.nics[id]
+	s.mu.RUnlock()
+	if !found {
+		return false
+	}
+	return target.linkEP.IsAttached()
+}
+
+// NICSubnets returns, for every NIC in the stack, the subnets associated
+// with that NIC, keyed by NIC id.
+func (s *Stack) NICSubnets() map[tcpip.NICID][]tcpip.Subnet {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	subnets := make(map[tcpip.NICID][]tcpip.Subnet)
+	for nicID, n := range s.nics {
+		subnets[nicID] = append([]tcpip.Subnet(nil), n.Subnets()...)
+	}
+	return subnets
+}
+
+// NICInfo captures the name, addresses, flags, MTU and stats of a NIC.
+type NICInfo struct {
+	Name              string
+	LinkAddress       tcpip.LinkAddress
+	ProtocolAddresses []tcpip.ProtocolAddress
+
+	// Flags indicates the state of the NIC.
+	Flags NICStateFlags
+
+	// MTU is the maximum transmission unit.
+	MTU uint32
+
+	Stats NICStats
+}
+
+// NICInfo returns a map from each NIC's id to the NICInfo built for that NIC.
+func (s *Stack) NICInfo() map[tcpip.NICID]NICInfo {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	nics := make(map[tcpip.NICID]NICInfo)
+	for id, nic := range s.nics {
+		flags := NICStateFlags{
+			Up:          true, // Netstack interfaces are always up.
+			Running:     nic.linkEP.IsAttached(),
+			Promiscuous: nic.isPromiscuousMode(),
+			Loopback:    nic.linkEP.Capabilities()&CapabilityLoopback != 0,
+		}
+		nics[id] = NICInfo{
+			Name:              nic.name,
+			LinkAddress:       nic.linkEP.LinkAddress(),
+			ProtocolAddresses: nic.Addresses(),
+			Flags:             flags,
+			MTU:               nic.linkEP.MTU(),
+			Stats:             nic.stats,
+		}
+	}
+	return nics
+}
+
+// NICStateFlags holds information about the state of a NIC.
+type NICStateFlags struct {
+	// Up indicates whether the interface is running.
+	Up bool
+
+	// Running indicates whether resources are allocated.
+	Running bool
+
+	// Promiscuous indicates whether the interface is in promiscuous mode.
+	Promiscuous bool
+
+	// Loopback indicates whether the interface is a loopback.
+	Loopback bool
+}
+
+// AddAddress adds addr to the specified NIC with CanBePrimaryEndpoint behavior.
+func (s *Stack) AddAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) *tcpip.Error {
+	return s.AddAddressWithOptions(id, protocol, addr, CanBePrimaryEndpoint)
+}
+
+// AddAddressWithOptions is the same as AddAddress, but allows you to specify
+// whether the new endpoint can be primary or not. It returns
+// tcpip.ErrUnknownNICID if the NIC does not exist.
+func (s *Stack) AddAddressWithOptions(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address, peb PrimaryEndpointBehavior) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[id]; target != nil {
+		return target.AddAddressWithOptions(protocol, addr, peb)
+	}
+
+	return tcpip.ErrUnknownNICID
+}
+
+// AddSubnet associates a subnet range with the specified NIC.
+func (s *Stack) AddSubnet(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber, subnet tcpip.Subnet) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	target, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	target.AddSubnet(protocol, subnet)
+	return nil
+}
+
+// RemoveSubnet drops the subnet range from the specified NIC.
+func (s *Stack) RemoveSubnet(id tcpip.NICID, subnet tcpip.Subnet) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	target, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	target.RemoveSubnet(subnet)
+	return nil
+}
+
+// ContainsSubnet reports whether the specified NIC contains the specified
+// subnet. It returns tcpip.ErrUnknownNICID if the NIC does not exist.
+func (s *Stack) ContainsSubnet(id tcpip.NICID, subnet tcpip.Subnet) (bool, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	target, found := s.nics[id]
+	if !found {
+		return false, tcpip.ErrUnknownNICID
+	}
+	return target.ContainsSubnet(subnet), nil
+}
+
+// RemoveAddress removes an existing network-layer address from the specified
+// NIC. It returns tcpip.ErrUnknownNICID if the NIC does not exist.
+func (s *Stack) RemoveAddress(id tcpip.NICID, addr tcpip.Address) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	target, found := s.nics[id]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	return target.RemoveAddress(addr)
+}
+
+// GetMainNICAddress returns the first primary address (and the subnet that
+// contains it) for the given NIC and protocol. Returns an arbitrary endpoint's
+// address if no primary addresses exist. Returns an error if the NIC doesn't
+// exist or has no endpoints.
+func (s *Stack) GetMainNICAddress(id tcpip.NICID, protocol tcpip.NetworkProtocolNumber) (tcpip.Address, tcpip.Subnet, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	target, found := s.nics[id]
+	if !found {
+		return "", tcpip.Subnet{}, tcpip.ErrUnknownNICID
+	}
+	return target.getMainNICAddress(protocol)
+}
+
+func (s *Stack) getRefEP(nic *NIC, localAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber) (ref *referencedNetworkEndpoint) {
+	if len(localAddr) != 0 {
+		return nic.findEndpoint(netProto, localAddr, CanBePrimaryEndpoint)
+	}
+	return nic.primaryEndpoint(netProto)
+}
+
+// FindRoute creates a route to the given destination address, leaving through
+// the given NIC and local address (if provided).
+func (s *Stack) FindRoute(id tcpip.NICID, localAddr, remoteAddr tcpip.Address, netProto tcpip.NetworkProtocolNumber, multicastLoop bool) (Route, *tcpip.Error) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	isBroadcast := remoteAddr == header.IPv4Broadcast
+	isMulticast := header.IsV4MulticastAddress(remoteAddr) || header.IsV6MulticastAddress(remoteAddr)
+	needRoute := !(isBroadcast || isMulticast || header.IsV6LinkLocalAddress(remoteAddr))
+	if id != 0 && !needRoute {
+		if nic, ok := s.nics[id]; ok {
+			if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
+				return makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback), nil
+			}
+		}
+	} else {
+		for _, route := range s.routeTable {
+			if (id != 0 && id != route.NIC) || (len(remoteAddr) != 0 && !route.Match(remoteAddr)) {
+				continue
+			}
+			if nic, ok := s.nics[route.NIC]; ok {
+				if ref := s.getRefEP(nic, localAddr, netProto); ref != nil {
+					if len(remoteAddr) == 0 {
+						// If no remote address was provided, the route
+						// targets the endpoint's own local address.
+						remoteAddr = ref.ep.ID().LocalAddress
+					}
+
+					r := makeRoute(netProto, ref.ep.ID().LocalAddress, remoteAddr, nic.linkEP.LinkAddress(), ref, s.handleLocal && !nic.loopback, multicastLoop && !nic.loopback)
+					if needRoute {
+						r.NextHop = route.Gateway
+					}
+					return r, nil
+				}
+			}
+		}
+	}
+
+	if !needRoute {
+		return Route{}, tcpip.ErrNetworkUnreachable
+	}
+
+	return Route{}, tcpip.ErrNoRoute
+}
+
+// CheckNetworkProtocol reports whether the given network protocol is enabled
+// in the stack.
+func (s *Stack) CheckNetworkProtocol(protocol tcpip.NetworkProtocolNumber) bool {
+	_, ok := s.networkProtocols[protocol]
+	return ok
+}
+
+// CheckLocalAddress determines if the given local address exists, and if it
+// does, returns the id of the NIC it's bound to. Returns 0 if the address
+// does not exist.
+func (s *Stack) CheckLocalAddress(nicid tcpip.NICID, protocol tcpip.NetworkProtocolNumber, addr tcpip.Address) tcpip.NICID {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	// If a NIC is specified, look for the address on that NIC only.
+	if nicid != 0 {
+		nic := s.nics[nicid]
+		if nic == nil {
+			return 0
+		}
+
+		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
+		if ref == nil {
+			return 0
+		}
+
+		ref.decRef()
+
+		return nic.id
+	}
+
+	// Otherwise go through all the NICs and return the first match.
+	for _, nic := range s.nics {
+		ref := nic.findEndpoint(protocol, addr, CanBePrimaryEndpoint)
+		if ref != nil {
+			ref.decRef()
+			return nic.id
+		}
+	}
+
+	return 0
+}
+
+// SetPromiscuousMode enables or disables promiscuous mode in the given NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) SetPromiscuousMode(nicID tcpip.NICID, enable bool) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[nicID]; target != nil {
+		target.setPromiscuousMode(enable)
+		return nil
+	}
+
+	return tcpip.ErrUnknownNICID
+}
+
+// SetSpoofing enables or disables address spoofing in the given NIC, allowing
+// endpoints to bind to any address in the NIC.
+//
+// It returns tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) SetSpoofing(nicID tcpip.NICID, enable bool) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[nicID]; target != nil {
+		target.setSpoofing(enable)
+		return nil
+	}
+
+	return tcpip.ErrUnknownNICID
+}
+
+// AddLinkAddress records linkAddr as the link address of addr on the given
+// NIC in the stack's link address cache.
+func (s *Stack) AddLinkAddress(nicid tcpip.NICID, addr tcpip.Address, linkAddr tcpip.LinkAddress) {
+	s.linkAddrCache.add(tcpip.FullAddress{NIC: nicid, Addr: addr}, linkAddr)
+	// TODO: provide a way for a transport endpoint to receive a signal
+	// that AddLinkAddress for a particular address has been called.
+}
+
+// GetLinkAddress implements LinkAddressCache.GetLinkAddress. It returns
+// tcpip.ErrUnknownNICID if no NIC with the given id exists.
+func (s *Stack) GetLinkAddress(nicid tcpip.NICID, addr, localAddr tcpip.Address, protocol tcpip.NetworkProtocolNumber, waker *sleep.Waker) (tcpip.LinkAddress, <-chan struct{}, *tcpip.Error) {
+	s.mu.RLock()
+	target := s.nics[nicid]
+	s.mu.RUnlock()
+	if target == nil {
+		return "", nil, tcpip.ErrUnknownNICID
+	}
+
+	key := tcpip.FullAddress{NIC: nicid, Addr: addr}
+	resolver := s.linkAddrResolvers[protocol]
+	return s.linkAddrCache.get(key, resolver, localAddr, target.linkEP, waker)
+}
+
+// RemoveWaker implements LinkAddressCache.RemoveWaker.
+func (s *Stack) RemoveWaker(nicid tcpip.NICID, addr tcpip.Address, waker *sleep.Waker) {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	// GetLinkAddress only hands wakers to the cache for NICs that exist.
+	if nic := s.nics[nicid]; nic != nil {
+		fullAddr := tcpip.FullAddress{NIC: nicid, Addr: addr}
+		s.linkAddrCache.removeWaker(fullAddr, waker)
+	}
+}
+
+// RegisterTransportEndpoint registers the given endpoint with the stack
+// transport dispatcher. Received packets that match the provided id will be
+// delivered to the given endpoint; specifying a nic is optional, but
+// nic-specific IDs have precedence over global ones. It returns
+// tcpip.ErrUnknownNICID if a nic is specified but does not exist.
+func (s *Stack) RegisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error {
+	if nicID == 0 {
+		return s.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort)
+	}
+
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[nicID]; target != nil {
+		return target.demux.registerEndpoint(netProtos, protocol, id, ep, reusePort)
+	}
+
+	return tcpip.ErrUnknownNICID
+}
+
+// UnregisterTransportEndpoint removes the endpoint with the given id from the
+// stack transport dispatcher, or from the given NIC's dispatcher when nicID
+// is non-zero. Unknown NICs are ignored.
+func (s *Stack) UnregisterTransportEndpoint(nicID tcpip.NICID, netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) {
+	if nicID == 0 {
+		s.demux.unregisterEndpoint(netProtos, protocol, id, ep)
+		return
+	}
+
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[nicID]; target != nil {
+		target.demux.unregisterEndpoint(netProtos, protocol, id, ep)
+	}
+}
+
+// RegisterRawTransportEndpoint registers the given endpoint with the stack
+// transport dispatcher. Received packets that match the provided transport
+// protocol will be delivered to the given endpoint. It returns
+// tcpip.ErrUnknownNICID if a nic is specified but does not exist.
+func (s *Stack) RegisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
+	if nicID == 0 {
+		return s.demux.registerRawEndpoint(netProto, transProto, ep)
+	}
+
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[nicID]; target != nil {
+		return target.demux.registerRawEndpoint(netProto, transProto, ep)
+	}
+
+	return tcpip.ErrUnknownNICID
+}
+
+// UnregisterRawTransportEndpoint removes the endpoint for the transport
+// protocol from the stack transport dispatcher, or from the given NIC's
+// dispatcher when nicID is non-zero. Unknown NICs are ignored.
+func (s *Stack) UnregisterRawTransportEndpoint(nicID tcpip.NICID, netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) {
+	if nicID == 0 {
+		s.demux.unregisterRawEndpoint(netProto, transProto, ep)
+		return
+	}
+
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+
+	if target := s.nics[nicID]; target != nil {
+		target.demux.unregisterRawEndpoint(netProto, transProto, ep)
+	}
+}
+
+// NetworkProtocolInstance returns the protocol instance in the stack for the
+// specified network protocol, or nil if the protocol is not registered.
+//
+// This method is public for protocol implementers and tests to use.
+func (s *Stack) NetworkProtocolInstance(num tcpip.NetworkProtocolNumber) NetworkProtocol {
+	// A missing key yields the zero value of the interface type, i.e. nil.
+	proto := s.networkProtocols[num]
+	return proto
+}
+
+// TransportProtocolInstance returns the stack's instance of the specified
+// transport protocol, or nil. It is public for protocol implementers and tests.
+func (s *Stack) TransportProtocolInstance(num tcpip.TransportProtocolNumber) TransportProtocol {
+	state, found := s.transportProtocols[num]
+	if !found {
+		return nil
+	}
+	return state.proto
+}
+
+// AddTCPProbe installs a probe function that will be invoked on every segment
+// received by a given TCP endpoint. The probe function is passed a copy of the
+// TCP endpoint state before and after processing of the segment.
+//
+// NOTE: The probe is attached only to endpoints created after this call;
+// endpoints created earlier will not call the probe function.
+//
+// Further, installing two different probes back to back can result in some
+// endpoints calling the first one and some the second one. There is no
+// guarantee provided on which probe will be invoked. Ideally this should only
+// be called once per stack.
+func (s *Stack) AddTCPProbe(probe TCPProbeFunc) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.tcpProbeFunc = probe
+}
+
+// GetTCPProbe returns the TCPProbeFunc if installed with AddTCPProbe, nil
+// otherwise. It only reads stack state, so it takes the read lock rather
+// than the exclusive lock.
+func (s *Stack) GetTCPProbe() TCPProbeFunc {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	return s.tcpProbeFunc
+}
+
+// RemoveTCPProbe uninstalls the TCP probe, if any.
+//
+// NOTE: This only ensures that endpoints created after this call do not
+// have a probe attached. Endpoints already created will continue to invoke
+// TCP probe.
+func (s *Stack) RemoveTCPProbe() {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.tcpProbeFunc = nil
+}
+
+// JoinGroup subscribes the given NIC to the given multicast group.
+func (s *Stack) JoinGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error {
+	// TODO: notify network of subscription via igmp protocol.
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	target, found := s.nics[nicID]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	return target.joinGroup(protocol, multicastAddr)
+}
+
+// LeaveGroup unsubscribes the given NIC from the given multicast group.
+func (s *Stack) LeaveGroup(protocol tcpip.NetworkProtocolNumber, nicID tcpip.NICID, multicastAddr tcpip.Address) *tcpip.Error {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	target, found := s.nics[nicID]
+	if !found {
+		return tcpip.ErrUnknownNICID
+	}
+	return target.leaveGroup(multicastAddr)
+}
diff --git a/pkg/tcpip/stack/stack_global_state.go b/pkg/tcpip/stack/stack_global_state.go
new file mode 100644
index 000000000..dfec4258a
--- /dev/null
+++ b/pkg/tcpip/stack/stack_global_state.go
@@ -0,0 +1,19 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+// StackFromEnv is the global stack created during a restore run.
+// FIXME(b/36201077)
+var StackFromEnv *Stack
diff --git a/pkg/tcpip/stack/stack_state_autogen.go b/pkg/tcpip/stack/stack_state_autogen.go
new file mode 100755
index 000000000..bb05ff7c1
--- /dev/null
+++ b/pkg/tcpip/stack/stack_state_autogen.go
@@ -0,0 +1,59 @@
+// automatically generated by stateify.
+
+package stack
+
+import (
+	"gvisor.googlesource.com/gvisor/pkg/state"
+)
+
+func (x *TransportEndpointID) beforeSave() {}
+func (x *TransportEndpointID) save(m state.Map) {
+	x.beforeSave()
+	m.Save("LocalPort", &x.LocalPort)
+	m.Save("LocalAddress", &x.LocalAddress)
+	m.Save("RemotePort", &x.RemotePort)
+	m.Save("RemoteAddress", &x.RemoteAddress)
+}
+
+func (x *TransportEndpointID) afterLoad() {}
+func (x *TransportEndpointID) load(m state.Map) {
+	m.Load("LocalPort", &x.LocalPort)
+	m.Load("LocalAddress", &x.LocalAddress)
+	m.Load("RemotePort", &x.RemotePort)
+	m.Load("RemoteAddress", &x.RemoteAddress)
+}
+
+func (x *GSOType) save(m state.Map) {
+	m.SaveValue("", (int)(*x))
+}
+
+func (x *GSOType) load(m state.Map) {
+	m.LoadValue("", new(int), func(y interface{}) { *x = (GSOType)(y.(int)) })
+}
+
+func (x *GSO) beforeSave() {}
+func (x *GSO) save(m state.Map) {
+	x.beforeSave()
+	m.Save("Type", &x.Type)
+	m.Save("NeedsCsum", &x.NeedsCsum)
+	m.Save("CsumOffset", &x.CsumOffset)
+	m.Save("MSS", &x.MSS)
+	m.Save("L3HdrLen", &x.L3HdrLen)
+	m.Save("MaxSize", &x.MaxSize)
+}
+
+func (x *GSO) afterLoad() {}
+func (x *GSO) load(m state.Map) {
+	m.Load("Type", &x.Type)
+	m.Load("NeedsCsum", &x.NeedsCsum)
+	m.Load("CsumOffset", &x.CsumOffset)
+	m.Load("MSS", &x.MSS)
+	m.Load("L3HdrLen", &x.L3HdrLen)
+	m.Load("MaxSize", &x.MaxSize)
+}
+
+func init() {
+	state.Register("stack.TransportEndpointID", (*TransportEndpointID)(nil), state.Fns{Save: (*TransportEndpointID).save, Load: (*TransportEndpointID).load})
+	state.Register("stack.GSOType", (*GSOType)(nil), state.Fns{Save: (*GSOType).save, Load: (*GSOType).load})
+	state.Register("stack.GSO", (*GSO)(nil), state.Fns{Save: (*GSO).save, Load: (*GSO).load})
+}
diff --git a/pkg/tcpip/stack/transport_demuxer.go b/pkg/tcpip/stack/transport_demuxer.go
new file mode 100644
index 000000000..605bfadeb
--- /dev/null
+++ b/pkg/tcpip/stack/transport_demuxer.go
@@ -0,0 +1,420 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package stack
+
+import (
+	"fmt"
+	"math/rand"
+	"sync"
+
+	"gvisor.googlesource.com/gvisor/pkg/tcpip"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/buffer"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/hash/jenkins"
+	"gvisor.googlesource.com/gvisor/pkg/tcpip/header"
+)
+
+type protocolIDs struct {
+	network   tcpip.NetworkProtocolNumber   // network protocol half of the pair
+	transport tcpip.TransportProtocolNumber // transport protocol half of the pair
+}
+
+// transportEndpoints manages all endpoints of a given protocol. It has its own
+// mutex so as to reduce interference between protocols.
+type transportEndpoints struct {
+	// mu protects all fields of the transportEndpoints.
+	mu        sync.RWMutex
+	endpoints map[TransportEndpointID]TransportEndpoint // a value is either a single endpoint or a *multiPortEndpoint
+	// rawEndpoints contains endpoints for raw sockets, which receive all
+	// traffic of a given protocol regardless of port.
+	rawEndpoints []RawTransportEndpoint
+}
+
+// unregisterEndpoint unregisters the endpoint with the given id such that it
+// won't receive any more packets. An id that is shared through a
+// multiPortEndpoint stays registered until its last endpoint is removed.
+func (eps *transportEndpoints) unregisterEndpoint(id TransportEndpointID, ep TransportEndpoint) {
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+	registered, found := eps.endpoints[id]
+	if !found {
+		return
+	}
+	// A multiPortEndpoint reports whether ep was the last endpoint it held.
+	if shared, isShared := registered.(*multiPortEndpoint); isShared && !shared.unregisterEndpoint(ep) {
+		return
+	}
+	delete(eps.endpoints, id)
+}
+
+// transportDemuxer demultiplexes packets targeted at a transport endpoint
+// (i.e., after they've been parsed by the network layer). It does two levels
+// of demultiplexing: first based on the network and transport protocols, then
+// based on endpoint IDs. It should only be instantiated via
+// newTransportDemuxer.
+type transportDemuxer struct {
+	// protocol is filled in by newTransportDemuxer and is immutable after that.
+	protocol map[protocolIDs]*transportEndpoints
+}
+
+// newTransportDemuxer returns a demuxer holding an empty endpoint table for
+// every pairing of stack's network protocols with its transport protocols.
+func newTransportDemuxer(stack *Stack) *transportDemuxer {
+	table := make(map[protocolIDs]*transportEndpoints)
+	for netProto := range stack.networkProtocols {
+		for transProto := range stack.transportProtocols {
+			key := protocolIDs{network: netProto, transport: transProto}
+			table[key] = &transportEndpoints{
+				endpoints: make(map[TransportEndpointID]TransportEndpoint),
+			}
+		}
+	}
+	return &transportDemuxer{protocol: table}
+}
+
+// registerEndpoint registers ep under id for every protocol in netProtos,
+// rolling back the registrations already made if one of them fails.
+func (d *transportDemuxer) registerEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error {
+	for done, netProto := range netProtos {
+		err := d.singleRegisterEndpoint(netProto, protocol, id, ep, reusePort)
+		if err != nil {
+			d.unregisterEndpoint(netProtos[:done], protocol, id, ep)
+			return err
+		}
+	}
+	return nil
+}
+
+// multiPortEndpoint is a container for TransportEndpoints which are bound to
+// the same pair of address and port.
+type multiPortEndpoint struct {
+	mu           sync.RWMutex
+	endpointsArr []TransportEndpoint       // the contained endpoints; selectEndpoint indexes into it
+	endpointsMap map[TransportEndpoint]int // position of each endpoint within endpointsArr
+	// seed is a random secret for a jenkins hash.
+	seed uint32
+}
+
+// reciprocalScale maps val onto [0, n) by taking the high 32 bits of val*n;
+// it is a faster alternative to val % n, as described at
+// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+func reciprocalScale(val, n uint32) uint32 {
+	product := uint64(val) * uint64(n)
+	return uint32(product >> 32)
+}
+
+// selectEndpoint calculates a hash of destination and source addresses and
+// ports then uses it to select a socket. In this case, all packets from one
+// address will be sent to same endpoint.
+func (ep *multiPortEndpoint) selectEndpoint(id TransportEndpointID) TransportEndpoint {
+	ep.mu.RLock()
+	defer ep.mu.RUnlock()
+
+	payload := []byte{
+		byte(id.LocalPort),
+		byte(id.LocalPort >> 8),
+		byte(id.RemotePort),
+		byte(id.RemotePort >> 8),
+	}
+
+	h := jenkins.Sum32(ep.seed)
+	h.Write(payload)
+	h.Write([]byte(id.LocalAddress))
+	h.Write([]byte(id.RemoteAddress))
+	hash := h.Sum32()
+
+	idx := reciprocalScale(hash, uint32(len(ep.endpointsArr)))
+	return ep.endpointsArr[idx]
+}
+
+// HandlePacket is called by the stack when new packets arrive to this transport
+// endpoint; broadcast and multicast datagrams go to every endpoint held by ep.
+func (ep *multiPortEndpoint) HandlePacket(r *Route, id TransportEndpointID, vv buffer.VectorisedView) {
+	if id.LocalAddress == header.IPv4Broadcast || header.IsV4MulticastAddress(id.LocalAddress) || header.IsV6MulticastAddress(id.LocalAddress) {
+		// Snapshot endpointsArr under ep.mu so no lock is held during delivery.
+		ep.mu.RLock()
+		targets := append([]TransportEndpoint(nil), ep.endpointsArr...)
+		ep.mu.RUnlock()
+		for i, endpoint := range targets {
+			pkt := vv
+			if i < len(targets)-1 {
+				// HandlePacket modifies vv, so all but the last get their own copy.
+				pkt = buffer.NewViewFromBytes(vv.ToView()).ToVectorisedView()
+			}
+			endpoint.HandlePacket(r, id, pkt)
+		}
+	} else {
+		ep.selectEndpoint(id).HandlePacket(r, id, vv)
+	}
+}
+
+// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
+func (ep *multiPortEndpoint) HandleControlPacket(id TransportEndpointID, typ ControlType, extra uint32, vv buffer.VectorisedView) {
+	ep.selectEndpoint(id).HandleControlPacket(id, typ, extra, vv) // forwarded to the one endpoint that selectEndpoint picks for id
+}
+
+// singleRegisterEndpoint appends t to the endpoints managed by ep and records
+// its position so that unregisterEndpoint can find it without a search.
+func (ep *multiPortEndpoint) singleRegisterEndpoint(t TransportEndpoint) {
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	position := len(ep.endpointsArr)
+	ep.endpointsMap[t] = position
+	ep.endpointsArr = append(ep.endpointsArr, t)
+}
+
+// unregisterEndpoint removes t from ep. It returns true if t was the last
+// endpoint, meaning the multiPortEndpoint itself has to be unregistered.
+func (ep *multiPortEndpoint) unregisterEndpoint(t TransportEndpoint) bool {
+	ep.mu.Lock()
+	defer ep.mu.Unlock()
+
+	idx, ok := ep.endpointsMap[t]
+	if !ok {
+		return false
+	}
+	last := len(ep.endpointsArr) - 1
+	if last > 0 {
+		// Fill the vacated slot with the last endpoint, then shrink the array.
+		ep.endpointsArr[idx] = ep.endpointsArr[last]
+		ep.endpointsMap[ep.endpointsArr[idx]] = idx
+		ep.endpointsArr = ep.endpointsArr[:last]
+	}
+	// Delete t after the swap: if t was the tail, the swap just re-added it.
+	delete(ep.endpointsMap, t)
+	return last <= 0
+}
+
+func (d *transportDemuxer) singleRegisterEndpoint(netProto tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint, reusePort bool) *tcpip.Error {
+	if id.RemotePort != 0 {
+		reusePort = false
+	}
+
+	eps, ok := d.protocol[protocolIDs{netProto, protocol}]
+	if !ok {
+		return nil
+	}
+
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+
+	var multiPortEp *multiPortEndpoint
+	if _, ok := eps.endpoints[id]; ok {
+		if !reusePort {
+			return tcpip.ErrPortInUse
+		}
+		multiPortEp, ok = eps.endpoints[id].(*multiPortEndpoint)
+		if !ok {
+			return tcpip.ErrPortInUse
+		}
+	}
+
+	if reusePort {
+		if multiPortEp == nil {
+			multiPortEp = &multiPortEndpoint{}
+			multiPortEp.endpointsMap = make(map[TransportEndpoint]int)
+			multiPortEp.seed = rand.Uint32()
+			eps.endpoints[id] = multiPortEp
+		}
+
+		multiPortEp.singleRegisterEndpoint(ep)
+
+		return nil
+	}
+	eps.endpoints[id] = ep
+
+	return nil
+}
+
+// unregisterEndpoint removes ep's registration under id for each protocol in
+// netProtos; protocol pairs the demuxer does not know are skipped.
+func (d *transportDemuxer) unregisterEndpoint(netProtos []tcpip.NetworkProtocolNumber, protocol tcpip.TransportProtocolNumber, id TransportEndpointID, ep TransportEndpoint) {
+	for _, netProto := range netProtos {
+		if table, known := d.protocol[protocolIDs{network: netProto, transport: protocol}]; known {
+			table.unregisterEndpoint(id, ep)
+		}
+	}
+}
+
+var loopbackSubnet = func() tcpip.Subnet {
+	sn, err := tcpip.NewSubnet("\x7f\x00\x00\x00", "\xff\x00\x00\x00") // bytes 127.0.0.0 and 255.0.0.0, presumably address and mask
+	if err != nil {
+		panic(err)
+	}
+	return sn
+}()
+
+// deliverPacket looks up the transport endpoints that match id and hands vv
+// to each of them. It returns true if at least one endpoint was found, false
+// otherwise.
+func (d *transportDemuxer) deliverPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView, id TransportEndpointID) bool {
+	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
+	if !ok {
+		return false
+	}
+
+	// A broadcast sent by a sender bound to the loopback interface must not
+	// be delivered back to that sender.
+	fromLoopback := loopbackSubnet.Contains(r.RemoteAddress)
+	if fromLoopback && r.LocalAddress == header.IPv4Broadcast && id.LocalPort == id.RemotePort {
+		return false
+	}
+
+	// UDP broadcasts go to every endpoint on the destination port; anything
+	// else goes to at most one endpoint.
+	isUDP := protocol == header.UDPProtocolNumber
+	targets := make([]TransportEndpoint, 0, 1)
+	eps.mu.RLock()
+	switch {
+	case isUDP && id.LocalAddress == header.IPv4Broadcast:
+		for epID, endpoint := range eps.endpoints {
+			if epID.LocalPort == id.LocalPort {
+				targets = append(targets, endpoint)
+			}
+		}
+	default:
+		if match := d.findEndpointLocked(eps, vv, id); match != nil {
+			targets = append(targets, match)
+		}
+	}
+	eps.mu.RUnlock()
+
+	if len(targets) == 0 {
+		// An undeliverable UDP packet counts as an unknown-port error.
+		if isUDP {
+			r.Stats().UDP.UnknownPortErrors.Increment()
+		}
+		return false
+	}
+
+	for _, target := range targets {
+		target.HandlePacket(r, id, vv)
+	}
+	return true
+}
+
+// deliverRawPacket passes a copy of the packet to every raw endpoint that is
+// registered for the packet's protocols, and reports whether there was at
+// least one such endpoint.
+func (d *transportDemuxer) deliverRawPacket(r *Route, protocol tcpip.TransportProtocolNumber, netHeader buffer.View, vv buffer.VectorisedView) bool {
+	eps, ok := d.protocol[protocolIDs{r.NetProto, protocol}]
+	if !ok {
+		return false
+	}
+
+	// As in net/ipv4/ip_input.c:ip_local_deliver, raw endpoints are served
+	// first, and every one of them receives the packet.
+	eps.mu.RLock()
+	raws := eps.rawEndpoints
+	for i := range raws {
+		// Hand out a separate copy of the packet each time, for the sake of
+		// save/restore.
+		hdrCopy := buffer.NewViewFromBytes(netHeader)
+		raws[i].HandlePacket(r, hdrCopy, vv.ToView().ToVectorisedView())
+	}
+	eps.mu.RUnlock()
+
+	return len(raws) > 0
+}
+
+// deliverControlPacket forwards a control packet to the endpoint that matches
+// id for the given protocol pair. It returns true if such an endpoint exists,
+// false otherwise.
+func (d *transportDemuxer) deliverControlPacket(net tcpip.NetworkProtocolNumber, trans tcpip.TransportProtocolNumber, typ ControlType, extra uint32, vv buffer.VectorisedView, id TransportEndpointID) bool {
+	eps, known := d.protocol[protocolIDs{network: net, transport: trans}]
+	if !known {
+		return false
+	}
+
+	// The lock only covers the lookup; delivery happens without it.
+	eps.mu.RLock()
+	target := d.findEndpointLocked(eps, vv, id)
+	eps.mu.RUnlock()
+
+	if target == nil {
+		// Nothing is registered for this id.
+		return false
+	}
+
+	target.HandleControlPacket(id, typ, extra, vv)
+
+	return true
+}
+
+// findEndpointLocked returns the endpoint registered under the most specific
+// variant of id that has a registration, or nil if there is none. Callers in
+// this file hold eps.mu while calling it. vv is not used.
+func (d *transportDemuxer) findEndpointLocked(eps *transportEndpoints, vv buffer.VectorisedView, id TransportEndpointID) TransportEndpoint {
+	// The id minus the local address.
+	anyLocalAddr := id
+	anyLocalAddr.LocalAddress = ""
+	// The id minus the remote part.
+	anyRemote := id
+	anyRemote.RemoteAddress = ""
+	anyRemote.RemotePort = 0
+	// Only the local port.
+	portOnly := anyRemote
+	portOnly.LocalAddress = ""
+
+	// Probe from the most specific key to the least specific one.
+	candidates := [...]TransportEndpointID{
+		id,
+		anyLocalAddr,
+		anyRemote,
+		portOnly,
+	}
+	for _, candidate := range candidates {
+		if ep, ok := eps.endpoints[candidate]; ok {
+			return ep
+		}
+	}
+
+	return nil
+}
+
+// registerRawEndpoint adds ep to the raw endpoints of the given protocol pair
+// so that packets of those protocols are delivered to it. A single packet can
+// be sent to one or more raw endpoints along with a non-raw endpoint. Nothing
+// is registered, and nil is still returned, if the pair is not supported.
+func (d *transportDemuxer) registerRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) *tcpip.Error {
+	key := protocolIDs{network: netProto, transport: transProto}
+	eps, supported := d.protocol[key]
+	if !supported {
+		return nil
+	}
+
+	eps.mu.Lock()
+	eps.rawEndpoints = append(eps.rawEndpoints, ep)
+	eps.mu.Unlock()
+	return nil
+}
+
+// unregisterRawEndpoint removes ep from the raw endpoints of the given protocol
+// pair so it gets no more packets; it panics if the pair is unsupported.
+func (d *transportDemuxer) unregisterRawEndpoint(netProto tcpip.NetworkProtocolNumber, transProto tcpip.TransportProtocolNumber, ep RawTransportEndpoint) {
+	eps, ok := d.protocol[protocolIDs{netProto, transProto}]
+	if !ok {
+		panic(fmt.Errorf("tried to unregister endpoint with unsupported network and transport protocol pair: %d, %d", netProto, transProto))
+	}
+	eps.mu.Lock()
+	defer eps.mu.Unlock()
+	for i := range eps.rawEndpoints {
+		if eps.rawEndpoints[i] != ep {
+			continue
+		}
+		eps.rawEndpoints = append(eps.rawEndpoints[:i], eps.rawEndpoints[i+1:]...)
+		return
+	}
+}