Diffstat (limited to 'pkg/tcpip/link')
-rw-r--r--  pkg/tcpip/link/channel/BUILD | 15
-rw-r--r--  pkg/tcpip/link/channel/channel.go | 298
-rw-r--r--  pkg/tcpip/link/fdbased/BUILD | 40
-rw-r--r--  pkg/tcpip/link/fdbased/endpoint.go | 657
-rw-r--r--  pkg/tcpip/link/fdbased/endpoint_test.go | 502
-rw-r--r--  pkg/tcpip/link/fdbased/endpoint_unsafe.go | 23
-rw-r--r--  pkg/tcpip/link/fdbased/mmap.go | 199
-rw-r--r--  pkg/tcpip/link/fdbased/mmap_stub.go | 23
-rw-r--r--  pkg/tcpip/link/fdbased/mmap_unsafe.go | 84
-rw-r--r--  pkg/tcpip/link/fdbased/packet_dispatchers.go | 317
-rw-r--r--  pkg/tcpip/link/loopback/BUILD | 15
-rw-r--r--  pkg/tcpip/link/loopback/loopback.go | 115
-rw-r--r--  pkg/tcpip/link/muxed/BUILD | 28
-rw-r--r--  pkg/tcpip/link/muxed/injectable.go | 137
-rw-r--r--  pkg/tcpip/link/muxed/injectable_test.go | 98
-rw-r--r--  pkg/tcpip/link/nested/BUILD | 31
-rw-r--r--  pkg/tcpip/link/nested/nested.go | 131
-rw-r--r--  pkg/tcpip/link/nested/nested_test.go | 105
-rw-r--r--  pkg/tcpip/link/qdisc/fifo/BUILD | 19
-rw-r--r--  pkg/tcpip/link/qdisc/fifo/endpoint.go | 209
-rw-r--r--  pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go | 84
-rw-r--r--  pkg/tcpip/link/rawfile/BUILD | 20
-rw-r--r--  pkg/tcpip/link/rawfile/blockingpoll_amd64.s | 41
-rw-r--r--  pkg/tcpip/link/rawfile/blockingpoll_arm64.s | 42
-rw-r--r--  pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go | 31
-rw-r--r--  pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go | 66
-rw-r--r--  pkg/tcpip/link/rawfile/errors.go | 70
-rw-r--r--  pkg/tcpip/link/rawfile/rawfile_unsafe.go | 192
-rw-r--r--  pkg/tcpip/link/sharedmem/BUILD | 41
-rw-r--r--  pkg/tcpip/link/sharedmem/pipe/BUILD | 23
-rw-r--r--  pkg/tcpip/link/sharedmem/pipe/pipe.go | 78
-rw-r--r--  pkg/tcpip/link/sharedmem/pipe/pipe_test.go | 518
-rw-r--r--  pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go | 35
-rw-r--r--  pkg/tcpip/link/sharedmem/pipe/rx.go | 93
-rw-r--r--  pkg/tcpip/link/sharedmem/pipe/tx.go | 161
-rw-r--r--  pkg/tcpip/link/sharedmem/queue/BUILD | 27
-rw-r--r--  pkg/tcpip/link/sharedmem/queue/queue_test.go | 517
-rw-r--r--  pkg/tcpip/link/sharedmem/queue/rx.go | 221
-rw-r--r--  pkg/tcpip/link/sharedmem/queue/tx.go | 151
-rw-r--r--  pkg/tcpip/link/sharedmem/rx.go | 159
-rw-r--r--  pkg/tcpip/link/sharedmem/sharedmem.go | 289
-rw-r--r--  pkg/tcpip/link/sharedmem/sharedmem_test.go | 812
-rw-r--r--  pkg/tcpip/link/sharedmem/sharedmem_unsafe.go | 25
-rw-r--r--  pkg/tcpip/link/sharedmem/tx.go | 272
-rw-r--r--  pkg/tcpip/link/sniffer/BUILD | 20
-rw-r--r--  pkg/tcpip/link/sniffer/pcap.go | 66
-rw-r--r--  pkg/tcpip/link/sniffer/sniffer.go | 394
-rw-r--r--  pkg/tcpip/link/tun/BUILD | 25
-rw-r--r--  pkg/tcpip/link/tun/device.go | 358
-rw-r--r--  pkg/tcpip/link/tun/protocol.go | 56
-rw-r--r--  pkg/tcpip/link/tun/tun_unsafe.go | 63
-rw-r--r--  pkg/tcpip/link/waitable/BUILD | 30
-rw-r--r--  pkg/tcpip/link/waitable/waitable.go | 149
-rw-r--r--  pkg/tcpip/link/waitable/waitable_test.go | 173
54 files changed, 8348 insertions, 0 deletions
diff --git a/pkg/tcpip/link/channel/BUILD b/pkg/tcpip/link/channel/BUILD
new file mode 100644
index 000000000..b8b93e78e
--- /dev/null
+++ b/pkg/tcpip/link/channel/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "channel",
+ srcs = ["channel.go"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/sync",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/channel/channel.go b/pkg/tcpip/link/channel/channel.go
new file mode 100644
index 000000000..20b183da0
--- /dev/null
+++ b/pkg/tcpip/link/channel/channel.go
@@ -0,0 +1,298 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package channel provides the implementation of channel-based data-link layer
+// endpoints. Such endpoints allow injection of inbound packets and store
+// outbound packets in a channel.
+package channel
+
+import (
+ "context"
+
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// PacketInfo holds all the information about an outbound packet.
+type PacketInfo struct {
+ Pkt *stack.PacketBuffer
+ Proto tcpip.NetworkProtocolNumber
+ GSO *stack.GSO
+ Route stack.Route
+}
+
+// Notification is the interface for receiving notification from the packet
+// queue.
+type Notification interface {
+ // WriteNotify will be called when a write happens to the queue.
+ WriteNotify()
+}
+
+// NotificationHandle is an opaque handle to the registered notification target.
+// It can be used to unregister the notification when no longer interested.
+//
+// +stateify savable
+type NotificationHandle struct {
+ n Notification
+}
+
+type queue struct {
+ // c is the outbound packet channel.
+ c chan PacketInfo
+ // mu protects fields below.
+ mu sync.RWMutex
+ notify []*NotificationHandle
+}
+
+func (q *queue) Close() {
+ close(q.c)
+}
+
+func (q *queue) Read() (PacketInfo, bool) {
+ select {
+ case p := <-q.c:
+ return p, true
+ default:
+ return PacketInfo{}, false
+ }
+}
+
+func (q *queue) ReadContext(ctx context.Context) (PacketInfo, bool) {
+ select {
+ case pkt := <-q.c:
+ return pkt, true
+ case <-ctx.Done():
+ return PacketInfo{}, false
+ }
+}
+
+func (q *queue) Write(p PacketInfo) bool {
+ wrote := false
+ select {
+ case q.c <- p:
+ wrote = true
+ default:
+ }
+ q.mu.Lock()
+ notify := q.notify
+ q.mu.Unlock()
+
+ if wrote {
+ // Send notification outside of lock.
+ for _, h := range notify {
+ h.n.WriteNotify()
+ }
+ }
+ return wrote
+}
+
+func (q *queue) Num() int {
+ return len(q.c)
+}
+
+func (q *queue) AddNotify(notify Notification) *NotificationHandle {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+ h := &NotificationHandle{n: notify}
+ q.notify = append(q.notify, h)
+ return h
+}
+
+func (q *queue) RemoveNotify(handle *NotificationHandle) {
+ q.mu.Lock()
+ defer q.mu.Unlock()
+ // Make a copy, since we read the array outside of the lock when notifying.
+ notify := make([]*NotificationHandle, 0, len(q.notify))
+ for _, h := range q.notify {
+ if h != handle {
+ notify = append(notify, h)
+ }
+ }
+ q.notify = notify
+}
+
+// Endpoint is link layer endpoint that stores outbound packets in a channel
+// and allows injection of inbound packets.
+type Endpoint struct {
+ dispatcher stack.NetworkDispatcher
+ mtu uint32
+ linkAddr tcpip.LinkAddress
+ LinkEPCapabilities stack.LinkEndpointCapabilities
+
+ // Outbound packet queue.
+ q *queue
+}
+
+// New creates a new channel endpoint.
+func New(size int, mtu uint32, linkAddr tcpip.LinkAddress) *Endpoint {
+ return &Endpoint{
+ q: &queue{
+ c: make(chan PacketInfo, size),
+ },
+ mtu: mtu,
+ linkAddr: linkAddr,
+ }
+}
+
+// Close closes e. Further packet injections will panic. Reads continue to
+// succeed until all packets are read.
+func (e *Endpoint) Close() {
+ e.q.Close()
+}
+
+// Read does a non-blocking read of one packet from the outbound packet queue.
+func (e *Endpoint) Read() (PacketInfo, bool) {
+ return e.q.Read()
+}
+
+// ReadContext does a blocking read of one packet from the outbound packet
+// queue. It can be cancelled by ctx, in which case it returns false.
+func (e *Endpoint) ReadContext(ctx context.Context) (PacketInfo, bool) {
+ return e.q.ReadContext(ctx)
+}
+
+// Drain removes all outbound packets from the channel and counts them.
+func (e *Endpoint) Drain() int {
+ c := 0
+ for {
+ if _, ok := e.Read(); !ok {
+ return c
+ }
+ c++
+ }
+}
+
+// NumQueued returns the number of packets queued for outbound transmission.
+func (e *Endpoint) NumQueued() int {
+ return e.q.Num()
+}
+
+// InjectInbound injects an inbound packet.
+func (e *Endpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ e.InjectLinkAddr(protocol, "", pkt)
+}
+
+// InjectLinkAddr injects an inbound packet with a remote link address.
+func (e *Endpoint) InjectLinkAddr(protocol tcpip.NetworkProtocolNumber, remote tcpip.LinkAddress, pkt *stack.PacketBuffer) {
+ e.dispatcher.DeliverNetworkPacket(remote, "" /* local */, protocol, pkt)
+}
+
+// Attach saves the stack network-layer dispatcher for use later when packets
+// are injected.
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *Endpoint) IsAttached() bool {
+ return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
+// during construction.
+func (e *Endpoint) MTU() uint32 {
+ return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return e.LinkEPCapabilities
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (*Endpoint) GSOMaxSize() uint32 {
+ return 1 << 15
+}
+
+// MaxHeaderLength returns the maximum size of the link-layer header. Since
+// this endpoint has no link-layer header, it returns 0.
+func (*Endpoint) MaxHeaderLength() uint16 {
+ return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
+ return e.linkAddr
+}
+
+// WritePacket stores outbound packets into the channel.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ // Clone r then release its resource so we only get the relevant fields from
+ // stack.Route without holding a reference to a NIC's endpoint.
+ route := r.Clone()
+ route.Release()
+ p := PacketInfo{
+ Pkt: pkt,
+ Proto: protocol,
+ GSO: gso,
+ Route: route,
+ }
+
+ e.q.Write(p)
+
+ return nil
+}
+
+// WritePackets stores outbound packets into the channel.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ // Clone r then release its resource so we only get the relevant fields from
+ // stack.Route without holding a reference to a NIC's endpoint.
+ route := r.Clone()
+ route.Release()
+ n := 0
+ for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+ p := PacketInfo{
+ Pkt: pkt,
+ Proto: protocol,
+ GSO: gso,
+ Route: route,
+ }
+
+ if !e.q.Write(p) {
+ break
+ }
+ n++
+ }
+
+ return n, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ p := PacketInfo{
+ Pkt: &stack.PacketBuffer{Data: vv},
+ Proto: 0,
+ GSO: nil,
+ }
+
+ e.q.Write(p)
+
+ return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*Endpoint) Wait() {}
+
+// AddNotify adds a notification target for receiving events about outgoing
+// packets.
+func (e *Endpoint) AddNotify(notify Notification) *NotificationHandle {
+ return e.q.AddNotify(notify)
+}
+
+// RemoveNotify removes handle from the list of notification targets.
+func (e *Endpoint) RemoveNotify(handle *NotificationHandle) {
+ e.q.RemoveNotify(handle)
+}
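For orientation, here is a minimal sketch of driving this channel endpoint from
test code. It assumes the endpoint has already been attached to a stack (e.g.
via Stack.CreateNIC, elided below), and payload is a hypothetical []byte; this
is not part of the diff itself:

	package example

	import (
		"context"
		"time"

		"gvisor.dev/gvisor/pkg/tcpip/buffer"
		"gvisor.dev/gvisor/pkg/tcpip/header"
		"gvisor.dev/gvisor/pkg/tcpip/link/channel"
		"gvisor.dev/gvisor/pkg/tcpip/stack"
	)

	func driveEndpoint(payload []byte) {
		// Queue up to 16 outbound packets; MTU 1500; no link address.
		ep := channel.New(16, 1500, "")
		// ... attach ep to a stack via Stack.CreateNIC ...

		// Read one outbound packet, giving up after a second; ctx
		// cancellation unblocks ReadContext.
		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
		defer cancel()
		if pktInfo, ok := ep.ReadContext(ctx); ok {
			_ = pktInfo.Proto // network protocol of the outbound packet
		}

		// Deliver a packet to the stack as if it arrived on the link.
		ep.InjectInbound(header.IPv4ProtocolNumber, &stack.PacketBuffer{
			Data: buffer.View(payload).ToVectorisedView(),
		})
	}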
diff --git a/pkg/tcpip/link/fdbased/BUILD b/pkg/tcpip/link/fdbased/BUILD
new file mode 100644
index 000000000..aa6db9aea
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/BUILD
@@ -0,0 +1,40 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "fdbased",
+ srcs = [
+ "endpoint.go",
+ "endpoint_unsafe.go",
+ "mmap.go",
+ "mmap_stub.go",
+ "mmap_unsafe.go",
+ "packet_dispatchers.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/binary",
+ "//pkg/sync",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/rawfile",
+ "//pkg/tcpip/stack",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
+
+go_test(
+ name = "fdbased_test",
+ size = "small",
+ srcs = ["endpoint_test.go"],
+ library = ":fdbased",
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/rawfile",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/fdbased/endpoint.go b/pkg/tcpip/link/fdbased/endpoint.go
new file mode 100644
index 000000000..f34082e1a
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/endpoint.go
@@ -0,0 +1,657 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package fdbased provides the implementation of data-link layer endpoints
+// backed by boundary-preserving file descriptors (e.g., TUN devices,
+// seqpacket/datagram sockets).
+//
+// FD based endpoints can be used in the networking stack by calling New() to
+// create a new endpoint, and then passing it as an argument to
+// Stack.CreateNIC().
+//
+// FD based endpoints can use more than one file descriptor to read incoming
+// packets. If more than one FD is specified and the underlying FDs are
+// AF_PACKET sockets, then the endpoint will enable FANOUT mode on the sockets
+// so that the host kernel will consistently hash the packets to the sockets.
+// This ensures that packets for the same TCP stream are not reordered.
+//
+// Similarly, if more than one FD is specified where the underlying FDs are
+// not AF_PACKET sockets, then it's the caller's responsibility to ensure that
+// all inbound packets on the descriptors are consistently 5-tuple hashed to
+// one of the descriptors to prevent TCP reordering.
+//
+// Since netstack today does not compute 5-tuple hashes for outgoing packets,
+// we only use the first FD to write outbound packets. Once 5-tuple hashes for
+// all outbound packets are available, we will make use of all underlying FDs
+// to write outbound packets.
+package fdbased
+
+import (
+ "fmt"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/binary"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// linkDispatcher reads packets from the link FD and dispatches them to the
+// NetworkDispatcher.
+type linkDispatcher interface {
+ dispatch() (bool, *tcpip.Error)
+}
+
+// PacketDispatchMode are the various supported methods of receiving and
+// dispatching packets from the underlying FD.
+type PacketDispatchMode int
+
+const (
+ // Readv is the default dispatch mode and is the least performant of the
+ // dispatch options but the one that is supported by all underlying FD
+ // types.
+ Readv PacketDispatchMode = iota
+ // RecvMMsg enables use of the recvmmsg() syscall instead of readv() to
+ // read inbound packets. This reduces the number of syscalls needed to process
+ // packets.
+ //
+ // NOTE: recvmmsg() is only supported for sockets, so if the underlying
+ // FD is not a socket then the code will still fall back to the readv()
+ // path.
+ RecvMMsg
+ // PacketMMap enables use of PACKET_RX_RING to receive packets from the
+ // NIC. PacketMMap requires that the underlying FD be an AF_PACKET
+ // socket. The primary use-case for this is runsc, which uses an
+ // AF_PACKET FD to receive packets from the veth device.
+ PacketMMap
+)
+
+func (p PacketDispatchMode) String() string {
+ switch p {
+ case Readv:
+ return "Readv"
+ case RecvMMsg:
+ return "RecvMMsg"
+ case PacketMMap:
+ return "PacketMMap"
+ default:
+ return fmt.Sprintf("unknown packet dispatch mode '%d'", p)
+ }
+}
+
+type endpoint struct {
+ // fds is the set of file descriptors each identifying one inbound/outbound
+ // channel. The endpoint will dispatch from all inbound channels as well as
+ // hash outbound packets to specific channels based on the packet hash.
+ fds []int
+
+ // mtu (maximum transmission unit) is the maximum size of a packet.
+ mtu uint32
+
+ // hdrSize specifies the link-layer header size. If set to 0, no header
+ // is added/removed; otherwise an ethernet header is used.
+ hdrSize int
+
+ // addr is the address of the endpoint.
+ addr tcpip.LinkAddress
+
+ // caps holds the endpoint capabilities.
+ caps stack.LinkEndpointCapabilities
+
+ // closed is a function to be called when the FD's peer (if any) closes
+ // its end of the communication pipe.
+ closed func(*tcpip.Error)
+
+ inboundDispatchers []linkDispatcher
+ dispatcher stack.NetworkDispatcher
+
+ // packetDispatchMode controls the packet dispatcher used by this
+ // endpoint.
+ packetDispatchMode PacketDispatchMode
+
+ // gsoMaxSize is the maximum GSO packet size. It is zero if GSO is
+ // disabled.
+ gsoMaxSize uint32
+
+ // wg keeps track of running goroutines.
+ wg sync.WaitGroup
+}
+
+// Options specify the details about the fd-based endpoint to be created.
+type Options struct {
+ // FDs is a set of FDs used to read/write packets.
+ FDs []int
+
+ // MTU is the mtu to use for this endpoint.
+ MTU uint32
+
+ // EthernetHeader if true, indicates that the endpoint should read/write
+ // ethernet frames instead of IP packets.
+ EthernetHeader bool
+
+ // ClosedFunc is a function to be called when an endpoint's peer (if
+ // any) closes its end of the communication pipe.
+ ClosedFunc func(*tcpip.Error)
+
+ // Address is the link address for this endpoint. Only used if
+ // EthernetHeader is true.
+ Address tcpip.LinkAddress
+
+ // SaveRestore if true, indicates that this NIC capability set should
+ // include CapabilitySaveRestore.
+ SaveRestore bool
+
+ // DisconnectOk if true, indicates that this NIC capability set should
+ // include CapabilityDisconnectOk.
+ DisconnectOk bool
+
+ // GSOMaxSize is the maximum GSO packet size. It is zero if GSO is
+ // disabled.
+ GSOMaxSize uint32
+
+ // SoftwareGSOEnabled indicates whether software GSO is enabled or not.
+ SoftwareGSOEnabled bool
+
+ // PacketDispatchMode specifies the type of inbound dispatcher to be
+ // used for this endpoint.
+ PacketDispatchMode PacketDispatchMode
+
+ // TXChecksumOffload if true, indicates that this endpoint's capability
+ // set should include CapabilityTXChecksumOffload.
+ TXChecksumOffload bool
+
+ // RXChecksumOffload if true, indicates that this endpoint's capability
+ // set should include CapabilityRXChecksumOffload.
+ RXChecksumOffload bool
+}
+
+// fanoutID is used for AF_PACKET based endpoints to enable PACKET_FANOUT
+// support in the host kernel. This allows us to use multiple FDs to receive
+// from the same underlying NIC. The fanoutID needs to be the same for a given
+// set of FDs that point to the same NIC. Trying to set the PACKET_FANOUT
+// option for an FD with a fanoutID already in use by another FD for a
+// different NIC will return EINVAL.
+var fanoutID = 1
+
+// New creates a new fd-based endpoint.
+//
+// Makes fd non-blocking, but does not take ownership of fd, which must remain
+// open for the lifetime of the returned endpoint (until after the endpoint has
+// stopped being used and Wait returns).
+func New(opts *Options) (stack.LinkEndpoint, error) {
+ caps := stack.LinkEndpointCapabilities(0)
+ if opts.RXChecksumOffload {
+ caps |= stack.CapabilityRXChecksumOffload
+ }
+
+ if opts.TXChecksumOffload {
+ caps |= stack.CapabilityTXChecksumOffload
+ }
+
+ hdrSize := 0
+ if opts.EthernetHeader {
+ hdrSize = header.EthernetMinimumSize
+ caps |= stack.CapabilityResolutionRequired
+ }
+
+ if opts.SaveRestore {
+ caps |= stack.CapabilitySaveRestore
+ }
+
+ if opts.DisconnectOk {
+ caps |= stack.CapabilityDisconnectOk
+ }
+
+ if len(opts.FDs) == 0 {
+ return nil, fmt.Errorf("opts.FD is empty, at least one FD must be specified")
+ }
+
+ e := &endpoint{
+ fds: opts.FDs,
+ mtu: opts.MTU,
+ caps: caps,
+ closed: opts.ClosedFunc,
+ addr: opts.Address,
+ hdrSize: hdrSize,
+ packetDispatchMode: opts.PacketDispatchMode,
+ }
+
+ // Create per channel dispatchers.
+ for i := 0; i < len(e.fds); i++ {
+ fd := e.fds[i]
+ if err := syscall.SetNonblock(fd, true); err != nil {
+ return nil, fmt.Errorf("syscall.SetNonblock(%v) failed: %v", fd, err)
+ }
+
+ isSocket, err := isSocketFD(fd)
+ if err != nil {
+ return nil, err
+ }
+ if isSocket {
+ if opts.GSOMaxSize != 0 {
+ if opts.SoftwareGSOEnabled {
+ e.caps |= stack.CapabilitySoftwareGSO
+ } else {
+ e.caps |= stack.CapabilityHardwareGSO
+ }
+ e.gsoMaxSize = opts.GSOMaxSize
+ }
+ }
+ inboundDispatcher, err := createInboundDispatcher(e, fd, isSocket)
+ if err != nil {
+ return nil, fmt.Errorf("createInboundDispatcher(...) = %v", err)
+ }
+ e.inboundDispatchers = append(e.inboundDispatchers, inboundDispatcher)
+ }
+
+ // Increment fanoutID to ensure that we don't re-use the same fanoutID for
+ // the next endpoint.
+ fanoutID++
+
+ return e, nil
+}
+
+func createInboundDispatcher(e *endpoint, fd int, isSocket bool) (linkDispatcher, error) {
+ // By default use the readv() dispatcher as it works with all kinds of
+ // FDs (tap/tun/unix domain sockets and af_packet).
+ inboundDispatcher, err := newReadVDispatcher(fd, e)
+ if err != nil {
+ return nil, fmt.Errorf("newReadVDispatcher(%d, %+v) = %v", fd, e, err)
+ }
+
+ if isSocket {
+ sa, err := unix.Getsockname(fd)
+ if err != nil {
+ return nil, fmt.Errorf("unix.Getsockname(%d) = %v", fd, err)
+ }
+ switch sa.(type) {
+ case *unix.SockaddrLinklayer:
+ // Enable PACKET_FANOUT mode if the underlying socket is
+ // of type AF_PACKET.
+ const fanoutType = 0x8000 // PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_DEFRAG
+ fanoutArg := fanoutID | fanoutType<<16
+ if err := syscall.SetsockoptInt(fd, syscall.SOL_PACKET, unix.PACKET_FANOUT, fanoutArg); err != nil {
+ return nil, fmt.Errorf("failed to enable PACKET_FANOUT option: %v", err)
+ }
+ }
+
+ switch e.packetDispatchMode {
+ case PacketMMap:
+ inboundDispatcher, err = newPacketMMapDispatcher(fd, e)
+ if err != nil {
+ return nil, fmt.Errorf("newPacketMMapDispatcher(%d, %+v) = %v", fd, e, err)
+ }
+ case RecvMMsg:
+ // If the provided FD is a socket then we optimize
+ // packet reads by using recvmmsg() instead of readv() to
+ // read packets in a batch.
+ inboundDispatcher, err = newRecvMMsgDispatcher(fd, e)
+ if err != nil {
+ return nil, fmt.Errorf("newRecvMMsgDispatcher(%d, %+v) = %v", fd, e, err)
+ }
+ }
+ }
+ return inboundDispatcher, nil
+}
+
+func isSocketFD(fd int) (bool, error) {
+ var stat syscall.Stat_t
+ if err := syscall.Fstat(fd, &stat); err != nil {
+ return false, fmt.Errorf("syscall.Fstat(%v,...) failed: %v", fd, err)
+ }
+ return (stat.Mode & syscall.S_IFSOCK) == syscall.S_IFSOCK, nil
+}
+
+// Attach launches the goroutine that reads packets from the file descriptor and
+// dispatches them via the provided dispatcher.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.dispatcher = dispatcher
+ // Link endpoints are not savable. When transportation endpoints are
+ // saved, they stop sending outgoing packets and all incoming packets
+ // are rejected.
+ for i := range e.inboundDispatchers {
+ e.wg.Add(1)
+ go func(i int) { // S/R-SAFE: See above.
+ e.dispatchLoop(e.inboundDispatchers[i])
+ e.wg.Done()
+ }(i)
+ }
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *endpoint) IsAttached() bool {
+ return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
+// during construction.
+func (e *endpoint) MTU() uint32 {
+ return e.mtu
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return e.caps
+}
+
+// MaxHeaderLength returns the maximum size of the link-layer header.
+func (e *endpoint) MaxHeaderLength() uint16 {
+ return uint16(e.hdrSize)
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (e *endpoint) LinkAddress() tcpip.LinkAddress {
+ return e.addr
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It waits for the endpoint to stop
+// reading from its FD.
+func (e *endpoint) Wait() {
+ e.wg.Wait()
+}
+
+// virtioNetHdr is declared in linux/virtio_net.h.
+type virtioNetHdr struct {
+ flags uint8
+ gsoType uint8
+ hdrLen uint16
+ gsoSize uint16
+ csumStart uint16
+ csumOffset uint16
+}
+
+// These constants are declared in linux/virtio_net.h.
+const (
+ _VIRTIO_NET_HDR_F_NEEDS_CSUM = 1
+
+ _VIRTIO_NET_HDR_GSO_TCPV4 = 1
+ _VIRTIO_NET_HDR_GSO_TCPV6 = 4
+)
+
+// WritePacket writes outbound packets to the file descriptor. If it is not
+// currently writable, the packet is dropped.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ if e.hdrSize > 0 {
+ // Add ethernet header if needed.
+ eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
+ pkt.LinkHeader = buffer.View(eth)
+ ethHdr := &header.EthernetFields{
+ DstAddr: r.RemoteLinkAddress,
+ Type: protocol,
+ }
+
+ // Preserve the src address if it's set in the route.
+ if r.LocalLinkAddress != "" {
+ ethHdr.SrcAddr = r.LocalLinkAddress
+ } else {
+ ethHdr.SrcAddr = e.addr
+ }
+ eth.Encode(ethHdr)
+ }
+
+ fd := e.fds[pkt.Hash%uint32(len(e.fds))]
+ if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ vnetHdr := virtioNetHdr{}
+ if gso != nil {
+ vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
+ if gso.NeedsCsum {
+ vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
+ vnetHdr.csumStart = header.EthernetMinimumSize + gso.L3HdrLen
+ vnetHdr.csumOffset = gso.CsumOffset
+ }
+ if gso.Type != stack.GSONone && uint16(pkt.Data.Size()) > gso.MSS {
+ switch gso.Type {
+ case stack.GSOTCPv4:
+ vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
+ case stack.GSOTCPv6:
+ vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+ default:
+ panic(fmt.Sprintf("Unknown gso type: %v", gso.Type))
+ }
+ vnetHdr.gsoSize = gso.MSS
+ }
+ }
+
+ vnetHdrBuf := binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr)
+ return rawfile.NonBlockingWrite3(fd, vnetHdrBuf, pkt.Header.View(), pkt.Data.ToView())
+ }
+
+ if pkt.Data.Size() == 0 {
+ return rawfile.NonBlockingWrite(fd, pkt.Header.View())
+ }
+ if pkt.Header.UsedLength() == 0 {
+ return rawfile.NonBlockingWrite(fd, pkt.Data.ToView())
+ }
+
+ return rawfile.NonBlockingWrite3(fd, pkt.Header.View(), pkt.Data.ToView(), nil)
+}
+
+func (e *endpoint) sendBatch(batchFD int, batch []*stack.PacketBuffer) (int, *tcpip.Error) {
+ // Send a batch of packets through batchFD.
+ mmsgHdrs := make([]rawfile.MMsgHdr, 0, len(batch))
+ for _, pkt := range batch {
+ var ethHdrBuf []byte
+ iovLen := 0
+ if e.hdrSize > 0 {
+ // Add ethernet header if needed.
+ ethHdrBuf = make([]byte, header.EthernetMinimumSize)
+ eth := header.Ethernet(ethHdrBuf)
+ ethHdr := &header.EthernetFields{
+ DstAddr: pkt.EgressRoute.RemoteLinkAddress,
+ Type: pkt.NetworkProtocolNumber,
+ }
+
+ // Preserve the src address if it's set in the route.
+ if pkt.EgressRoute.LocalLinkAddress != "" {
+ ethHdr.SrcAddr = pkt.EgressRoute.LocalLinkAddress
+ } else {
+ ethHdr.SrcAddr = e.addr
+ }
+ eth.Encode(ethHdr)
+ iovLen++
+ }
+
+ vnetHdr := virtioNetHdr{}
+ var vnetHdrBuf []byte
+ if e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ if pkt.GSOOptions != nil {
+ vnetHdr.hdrLen = uint16(pkt.Header.UsedLength())
+ if pkt.GSOOptions.NeedsCsum {
+ vnetHdr.flags = _VIRTIO_NET_HDR_F_NEEDS_CSUM
+ vnetHdr.csumStart = header.EthernetMinimumSize + pkt.GSOOptions.L3HdrLen
+ vnetHdr.csumOffset = pkt.GSOOptions.CsumOffset
+ }
+ if pkt.GSOOptions.Type != stack.GSONone && uint16(pkt.Data.Size()) > pkt.GSOOptions.MSS {
+ switch pkt.GSOOptions.Type {
+ case stack.GSOTCPv4:
+ vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV4
+ case stack.GSOTCPv6:
+ vnetHdr.gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+ default:
+ panic(fmt.Sprintf("Unknown gso type: %v", pkt.GSOOptions.Type))
+ }
+ vnetHdr.gsoSize = pkt.GSOOptions.MSS
+ }
+ }
+ vnetHdrBuf = binary.Marshal(make([]byte, 0, virtioNetHdrSize), binary.LittleEndian, vnetHdr)
+ iovLen++
+ }
+
+ iovecs := make([]syscall.Iovec, iovLen+1+len(pkt.Data.Views()))
+ var mmsgHdr rawfile.MMsgHdr
+ mmsgHdr.Msg.Iov = &iovecs[0]
+ iovecIdx := 0
+ if vnetHdrBuf != nil {
+ v := &iovecs[iovecIdx]
+ v.Base = &vnetHdrBuf[0]
+ v.Len = uint64(len(vnetHdrBuf))
+ iovecIdx++
+ }
+ if ethHdrBuf != nil {
+ v := &iovecs[iovecIdx]
+ v.Base = &ethHdrBuf[0]
+ v.Len = uint64(len(ethHdrBuf))
+ iovecIdx++
+ }
+ pktSize := uint64(0)
+ // Encode L3 Header
+ v := &iovecs[iovecIdx]
+ hdr := &pkt.Header
+ hdrView := hdr.View()
+ v.Base = &hdrView[0]
+ v.Len = uint64(len(hdrView))
+ pktSize += v.Len
+ iovecIdx++
+
+ // Now encode the Transport Payload.
+ pktViews := pkt.Data.Views()
+ for i := range pktViews {
+ vec := &iovecs[iovecIdx]
+ iovecIdx++
+ vec.Base = &pktViews[i][0]
+ vec.Len = uint64(len(pktViews[i]))
+ pktSize += vec.Len
+ }
+ mmsgHdr.Msg.Iovlen = uint64(iovecIdx)
+ mmsgHdrs = append(mmsgHdrs, mmsgHdr)
+ }
+
+ packets := 0
+ for len(mmsgHdrs) > 0 {
+ sent, err := rawfile.NonBlockingSendMMsg(batchFD, mmsgHdrs)
+ if err != nil {
+ return packets, err
+ }
+ packets += sent
+ mmsgHdrs = mmsgHdrs[sent:]
+ }
+
+ return packets, nil
+}
+
+// WritePackets writes outbound packets to the underlying file descriptors. If
+// one is not currently writable, the packet is dropped.
+//
+// Being a batch API, each packet in pkts should have the following
+// fields populated:
+// - pkt.EgressRoute
+// - pkt.GSOOptions
+// - pkt.NetworkProtocolNumber
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ // Preallocate to avoid repeated reallocation as we append to batch.
+ // batchSz is 47 because, when software GSO is in use, a single 65KB TCP
+ // segment can get split into 46 segments of 1420 bytes and a single
+ // 216-byte segment.
+ const batchSz = 47
+ batch := make([]*stack.PacketBuffer, 0, batchSz)
+ batchFD := -1
+ sentPackets := 0
+ for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+ if len(batch) == 0 {
+ batchFD = e.fds[pkt.Hash%uint32(len(e.fds))]
+ }
+ pktFD := e.fds[pkt.Hash%uint32(len(e.fds))]
+ if sendNow := pktFD != batchFD; !sendNow {
+ batch = append(batch, pkt)
+ continue
+ }
+ n, err := e.sendBatch(batchFD, batch)
+ sentPackets += n
+ if err != nil {
+ return sentPackets, err
+ }
+ batch = batch[:0]
+ batch = append(batch, pkt)
+ batchFD = pktFD
+ }
+
+ if len(batch) != 0 {
+ n, err := e.sendBatch(batchFD, batch)
+ sentPackets += n
+ if err != nil {
+ return sentPackets, err
+ }
+ }
+ return sentPackets, nil
+}
+
+// viewsEqual tests whether vs1 and vs2 refer to the same backing bytes.
+func viewsEqual(vs1, vs2 []buffer.View) bool {
+ return len(vs1) == len(vs2) && (len(vs1) == 0 || &vs1[0] == &vs2[0])
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ return rawfile.NonBlockingWrite(e.fds[0], vv.ToView())
+}
+
+// InjectOutbound implements stack.InjectableEndpoint.InjectOutbound.
+func (e *endpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
+ return rawfile.NonBlockingWrite(e.fds[0], packet)
+}
+
+// dispatchLoop reads packets from the file descriptor in a loop and dispatches
+// them to the network stack.
+func (e *endpoint) dispatchLoop(inboundDispatcher linkDispatcher) *tcpip.Error {
+ for {
+ cont, err := inboundDispatcher.dispatch()
+ if err != nil || !cont {
+ if e.closed != nil {
+ e.closed(err)
+ }
+ return err
+ }
+ }
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (e *endpoint) GSOMaxSize() uint32 {
+ return e.gsoMaxSize
+}
+
+// InjectableEndpoint is an injectable fd-based endpoint. The endpoint writes
+// to the FD, but does not read from it. All reads come from injected packets.
+type InjectableEndpoint struct {
+ endpoint
+
+ dispatcher stack.NetworkDispatcher
+}
+
+// Attach saves the stack network-layer dispatcher for use later when packets
+// are injected.
+func (e *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.dispatcher = dispatcher
+}
+
+// InjectInbound injects an inbound packet.
+func (e *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
+}
+
+// NewInjectable creates a new fd-based InjectableEndpoint.
+func NewInjectable(fd int, mtu uint32, capabilities stack.LinkEndpointCapabilities) *InjectableEndpoint {
+ syscall.SetNonblock(fd, true)
+
+ return &InjectableEndpoint{endpoint: endpoint{
+ fds: []int{fd},
+ mtu: mtu,
+ caps: capabilities,
+ }}
+}
diff --git a/pkg/tcpip/link/fdbased/endpoint_test.go b/pkg/tcpip/link/fdbased/endpoint_test.go
new file mode 100644
index 000000000..eaee7e5d7
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/endpoint_test.go
@@ -0,0 +1,502 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fdbased
+
+import (
+ "bytes"
+ "fmt"
+ "math/rand"
+ "reflect"
+ "syscall"
+ "testing"
+ "time"
+ "unsafe"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+ mtu = 1500
+ laddr = tcpip.LinkAddress("\x11\x22\x33\x44\x55\x66")
+ raddr = tcpip.LinkAddress("\x77\x88\x99\xaa\xbb\xcc")
+ proto = 10
+ csumOffset = 48
+ gsoMSS = 500
+)
+
+type packetInfo struct {
+ raddr tcpip.LinkAddress
+ proto tcpip.NetworkProtocolNumber
+ contents *stack.PacketBuffer
+}
+
+type context struct {
+ t *testing.T
+ readFDs []int
+ writeFDs []int
+ ep stack.LinkEndpoint
+ ch chan packetInfo
+ done chan struct{}
+}
+
+func newContext(t *testing.T, opt *Options) *context {
+ firstFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+ if err != nil {
+ t.Fatalf("Socketpair failed: %v", err)
+ }
+ secondFDPair, err := syscall.Socketpair(syscall.AF_UNIX, syscall.SOCK_SEQPACKET, 0)
+ if err != nil {
+ t.Fatalf("Socketpair failed: %v", err)
+ }
+
+ done := make(chan struct{}, 2)
+ opt.ClosedFunc = func(*tcpip.Error) {
+ done <- struct{}{}
+ }
+
+ opt.FDs = []int{firstFDPair[1], secondFDPair[1]}
+ ep, err := New(opt)
+ if err != nil {
+ t.Fatalf("Failed to create FD endpoint: %v", err)
+ }
+
+ c := &context{
+ t: t,
+ readFDs: []int{firstFDPair[0], secondFDPair[0]},
+ writeFDs: opt.FDs,
+ ep: ep,
+ ch: make(chan packetInfo, 100),
+ done: done,
+ }
+
+ ep.Attach(c)
+
+ return c
+}
+
+func (c *context) cleanup() {
+ for _, fd := range c.readFDs {
+ syscall.Close(fd)
+ }
+ <-c.done
+ <-c.done
+ for _, fd := range c.writeFDs {
+ syscall.Close(fd)
+ }
+}
+
+func (c *context) DeliverNetworkPacket(remote tcpip.LinkAddress, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ c.ch <- packetInfo{remote, protocol, pkt}
+}
+
+func TestNoEthernetProperties(t *testing.T) {
+ c := newContext(t, &Options{MTU: mtu})
+ defer c.cleanup()
+
+ if want, v := uint16(0), c.ep.MaxHeaderLength(); want != v {
+ t.Fatalf("MaxHeaderLength() = %v, want %v", v, want)
+ }
+
+ if want, v := uint32(mtu), c.ep.MTU(); want != v {
+ t.Fatalf("MTU() = %v, want %v", v, want)
+ }
+}
+
+func TestEthernetProperties(t *testing.T) {
+ c := newContext(t, &Options{EthernetHeader: true, MTU: mtu})
+ defer c.cleanup()
+
+ if want, v := uint16(header.EthernetMinimumSize), c.ep.MaxHeaderLength(); want != v {
+ t.Fatalf("MaxHeaderLength() = %v, want %v", v, want)
+ }
+
+ if want, v := uint32(mtu), c.ep.MTU(); want != v {
+ t.Fatalf("MTU() = %v, want %v", v, want)
+ }
+}
+
+func TestAddress(t *testing.T) {
+ addrs := []tcpip.LinkAddress{"", "abc", "def"}
+ for _, a := range addrs {
+ t.Run(fmt.Sprintf("Address: %q", a), func(t *testing.T) {
+ c := newContext(t, &Options{Address: a, MTU: mtu})
+ defer c.cleanup()
+
+ if want, v := a, c.ep.LinkAddress(); want != v {
+ t.Fatalf("LinkAddress() = %v, want %v", v, want)
+ }
+ })
+ }
+}
+
+func testWritePacket(t *testing.T, plen int, eth bool, gsoMaxSize uint32, hash uint32) {
+ c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth, GSOMaxSize: gsoMaxSize})
+ defer c.cleanup()
+
+ r := &stack.Route{
+ RemoteLinkAddress: raddr,
+ }
+
+ // Build header.
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()) + 100)
+ b := hdr.Prepend(100)
+ for i := range b {
+ b[i] = uint8(rand.Intn(256))
+ }
+
+ // Build payload and write.
+ payload := make(buffer.View, plen)
+ for i := range payload {
+ payload[i] = uint8(rand.Intn(256))
+ }
+ want := append(hdr.View(), payload...)
+ var gso *stack.GSO
+ if gsoMaxSize != 0 {
+ gso = &stack.GSO{
+ Type: stack.GSOTCPv6,
+ NeedsCsum: true,
+ CsumOffset: csumOffset,
+ MSS: gsoMSS,
+ MaxSize: gsoMaxSize,
+ L3HdrLen: header.IPv4MaximumHeaderSize,
+ }
+ }
+ if err := c.ep.WritePacket(r, gso, proto, &stack.PacketBuffer{
+ Header: hdr,
+ Data: payload.ToVectorisedView(),
+ Hash: hash,
+ }); err != nil {
+ t.Fatalf("WritePacket failed: %v", err)
+ }
+
+ // Read from the corresponding FD, then compare with what we wrote.
+ b = make([]byte, mtu)
+ fd := c.readFDs[hash%uint32(len(c.readFDs))]
+ n, err := syscall.Read(fd, b)
+ if err != nil {
+ t.Fatalf("Read failed: %v", err)
+ }
+ b = b[:n]
+ if gsoMaxSize != 0 {
+ vnetHdr := *(*virtioNetHdr)(unsafe.Pointer(&b[0]))
+ if vnetHdr.flags&_VIRTIO_NET_HDR_F_NEEDS_CSUM == 0 {
+ t.Fatalf("virtioNetHdr.flags %v doesn't contain %v", vnetHdr.flags, _VIRTIO_NET_HDR_F_NEEDS_CSUM)
+ }
+ csumStart := header.EthernetMinimumSize + gso.L3HdrLen
+ if vnetHdr.csumStart != csumStart {
+ t.Fatalf("vnetHdr.csumStart = %v, want %v", vnetHdr.csumStart, csumStart)
+ }
+ if vnetHdr.csumOffset != csumOffset {
+ t.Fatalf("vnetHdr.csumOffset = %v, want %v", vnetHdr.csumOffset, csumOffset)
+ }
+ gsoType := uint8(0)
+ if int(gso.MSS) < plen {
+ gsoType = _VIRTIO_NET_HDR_GSO_TCPV6
+ }
+ if vnetHdr.gsoType != gsoType {
+ t.Fatalf("vnetHdr.gsoType = %v, want %v", vnetHdr.gsoType, gsoType)
+ }
+ b = b[virtioNetHdrSize:]
+ }
+ if eth {
+ h := header.Ethernet(b)
+ b = b[header.EthernetMinimumSize:]
+
+ if a := h.SourceAddress(); a != laddr {
+ t.Fatalf("SourceAddress() = %v, want %v", a, laddr)
+ }
+
+ if a := h.DestinationAddress(); a != raddr {
+ t.Fatalf("DestinationAddress() = %v, want %v", a, raddr)
+ }
+
+ if et := h.Type(); et != proto {
+ t.Fatalf("Type() = %v, want %v", et, proto)
+ }
+ }
+ if len(b) != len(want) {
+ t.Fatalf("Read returned %v bytes, want %v", len(b), len(want))
+ }
+ if !bytes.Equal(b, want) {
+ t.Fatalf("Read returned %x, want %x", b, want)
+ }
+}
+
+func TestWritePacket(t *testing.T) {
+ lengths := []int{0, 100, 1000}
+ eths := []bool{true, false}
+ gsos := []uint32{0, 32768}
+
+ for _, eth := range eths {
+ for _, plen := range lengths {
+ for _, gso := range gsos {
+ t.Run(
+ fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v", eth, plen, gso),
+ func(t *testing.T) {
+ testWritePacket(t, plen, eth, gso, 0)
+ },
+ )
+ }
+ }
+ }
+}
+
+func TestHashedWritePacket(t *testing.T) {
+ lengths := []int{0, 100, 1000}
+ eths := []bool{true, false}
+ gsos := []uint32{0, 32768}
+ hashes := []uint32{0, 1}
+ for _, eth := range eths {
+ for _, plen := range lengths {
+ for _, gso := range gsos {
+ for _, hash := range hashes {
+ t.Run(
+ fmt.Sprintf("Eth=%v,PayloadLen=%v,GSOMaxSize=%v,Hash=%d", eth, plen, gso, hash),
+ func(t *testing.T) {
+ testWritePacket(t, plen, eth, gso, hash)
+ },
+ )
+ }
+ }
+ }
+ }
+}
+
+func TestPreserveSrcAddress(t *testing.T) {
+ baddr := tcpip.LinkAddress("\xcc\xbb\xaa\x77\x88\x99")
+
+ c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: true})
+ defer c.cleanup()
+
+ // Set LocalLinkAddress in route to the value of the bridged address.
+ r := &stack.Route{
+ RemoteLinkAddress: raddr,
+ LocalLinkAddress: baddr,
+ }
+
+ // WritePacket panics given a prependable with anything less than
+ // the minimum size of the ethernet header.
+ hdr := buffer.NewPrependable(header.EthernetMinimumSize)
+ if err := c.ep.WritePacket(r, nil /* gso */, proto, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buffer.VectorisedView{},
+ }); err != nil {
+ t.Fatalf("WritePacket failed: %v", err)
+ }
+
+ // Read from the FD, then compare with what we wrote.
+ b := make([]byte, mtu)
+ n, err := syscall.Read(c.readFDs[0], b)
+ if err != nil {
+ t.Fatalf("Read failed: %v", err)
+ }
+ b = b[:n]
+ h := header.Ethernet(b)
+
+ if a := h.SourceAddress(); a != baddr {
+ t.Fatalf("SourceAddress() = %v, want %v", a, baddr)
+ }
+}
+
+func TestDeliverPacket(t *testing.T) {
+ lengths := []int{100, 1000}
+ eths := []bool{true, false}
+
+ for _, eth := range eths {
+ for _, plen := range lengths {
+ t.Run(fmt.Sprintf("Eth=%v,PayloadLen=%v", eth, plen), func(t *testing.T) {
+ c := newContext(t, &Options{Address: laddr, MTU: mtu, EthernetHeader: eth})
+ defer c.cleanup()
+
+ // Build packet.
+ b := make([]byte, plen)
+ all := b
+ for i := range b {
+ b[i] = uint8(rand.Intn(256))
+ }
+
+ var hdr header.Ethernet
+ if !eth {
+ // So that it looks like an IPv4 packet.
+ b[0] = 0x40
+ } else {
+ hdr = make(header.Ethernet, header.EthernetMinimumSize)
+ hdr.Encode(&header.EthernetFields{
+ SrcAddr: raddr,
+ DstAddr: laddr,
+ Type: proto,
+ })
+ all = append(hdr, b...)
+ }
+
+ // Write packet via the file descriptor.
+ if _, err := syscall.Write(c.readFDs[0], all); err != nil {
+ t.Fatalf("Write failed: %v", err)
+ }
+
+ // Receive packet through the endpoint.
+ select {
+ case pi := <-c.ch:
+ want := packetInfo{
+ raddr: raddr,
+ proto: proto,
+ contents: &stack.PacketBuffer{
+ Data: buffer.View(b).ToVectorisedView(),
+ LinkHeader: buffer.View(hdr),
+ },
+ }
+ if !eth {
+ want.proto = header.IPv4ProtocolNumber
+ want.raddr = ""
+ }
+ // want.contents.Data will be a single
+ // view, so make pi do the same for the
+ // DeepEqual check.
+ pi.contents.Data = pi.contents.Data.ToView().ToVectorisedView()
+ if !reflect.DeepEqual(want, pi) {
+ t.Fatalf("Unexpected received packet: %+v, want %+v", pi, want)
+ }
+ case <-time.After(10 * time.Second):
+ t.Fatalf("Timed out waiting for packet")
+ }
+ })
+ }
+ }
+}
+
+func TestBufConfigMaxLength(t *testing.T) {
+ got := 0
+ for _, i := range BufConfig {
+ got += i
+ }
+ want := header.MaxIPPacketSize // maximum TCP packet size
+ if got < want {
+ t.Errorf("total buffer size is invalid: got %d, want >= %d", got, want)
+ }
+}
+
+func TestBufConfigFirst(t *testing.T) {
+ // The stack assumes that the TCP/IP header is entirely contained in the first view.
+ // Therefore, the first view needs to be large enough to contain the maximum TCP/IP
+ // header, which is 120 bytes (60 bytes for IP + 60 bytes for TCP).
+ want := 120
+ got := BufConfig[0]
+ if got < want {
+ t.Errorf("first view has an invalid size: got %d, want >= %d", got, want)
+ }
+}
+
+var capLengthTestCases = []struct {
+ comment string
+ config []int
+ n int
+ wantUsed int
+ wantLengths []int
+}{
+ {
+ comment: "Single slice",
+ config: []int{2},
+ n: 1,
+ wantUsed: 1,
+ wantLengths: []int{1},
+ },
+ {
+ comment: "Multiple slices",
+ config: []int{1, 2},
+ n: 2,
+ wantUsed: 2,
+ wantLengths: []int{1, 1},
+ },
+ {
+ comment: "Entire buffer",
+ config: []int{1, 2},
+ n: 3,
+ wantUsed: 2,
+ wantLengths: []int{1, 2},
+ },
+ {
+ comment: "Entire buffer but not on the last slice",
+ config: []int{1, 2, 3},
+ n: 3,
+ wantUsed: 2,
+ wantLengths: []int{1, 2, 3},
+ },
+}
+
+func TestReadVDispatcherCapLength(t *testing.T) {
+ for _, c := range capLengthTestCases {
+ // fd does not matter for this test.
+ d := readVDispatcher{fd: -1, e: &endpoint{}}
+ d.views = make([]buffer.View, len(c.config))
+ d.iovecs = make([]syscall.Iovec, len(c.config))
+ d.allocateViews(c.config)
+
+ used := d.capViews(c.n, c.config)
+ if used != c.wantUsed {
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed)
+ }
+ lengths := make([]int, len(d.views))
+ for i, v := range d.views {
+ lengths[i] = len(v)
+ }
+ if !reflect.DeepEqual(lengths, c.wantLengths) {
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths)
+ }
+ }
+}
+
+func TestRecvMMsgDispatcherCapLength(t *testing.T) {
+ for _, c := range capLengthTestCases {
+ d := recvMMsgDispatcher{
+ fd: -1, // fd does not matter for this test.
+ e: &endpoint{},
+ views: make([][]buffer.View, 1),
+ iovecs: make([][]syscall.Iovec, 1),
+ msgHdrs: make([]rawfile.MMsgHdr, 1),
+ }
+
+ for i := range d.views {
+ d.views[i] = make([]buffer.View, len(c.config))
+ }
+ for i := range d.iovecs {
+ d.iovecs[i] = make([]syscall.Iovec, len(c.config))
+ }
+ // Index into the slice directly so the assignments mutate the stored
+ // headers rather than per-iteration copies.
+ for k := range d.msgHdrs {
+ d.msgHdrs[k].Msg.Iov = &d.iovecs[k][0]
+ d.msgHdrs[k].Msg.Iovlen = uint64(len(c.config))
+ }
+
+ d.allocateViews(c.config)
+
+ used := d.capViews(0, c.n, c.config)
+ if used != c.wantUsed {
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %d. Want %d", c.comment, c.n, c.config, used, c.wantUsed)
+ }
+ lengths := make([]int, len(d.views[0]))
+ for i, v := range d.views[0] {
+ lengths[i] = len(v)
+ }
+ if !reflect.DeepEqual(lengths, c.wantLengths) {
+ t.Errorf("Test %q failed when calling capViews(%d, %v). Got %v. Want %v", c.comment, c.n, c.config, lengths, c.wantLengths)
+ }
+
+ }
+}
diff --git a/pkg/tcpip/link/fdbased/endpoint_unsafe.go b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
new file mode 100644
index 000000000..df14eaad1
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/endpoint_unsafe.go
@@ -0,0 +1,23 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fdbased
+
+import (
+ "unsafe"
+)
+
+const virtioNetHdrSize = int(unsafe.Sizeof(virtioNetHdr{}))
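As a sanity check on the constant above: virtioNetHdr mirrors struct
virtio_net_hdr from linux/virtio_net.h, two uint8 fields followed by four
uint16 fields, so with 2-byte alignment and no padding the size comes out to
10 bytes. A throwaway sketch, as if written inside package fdbased:

	// Sketch: 2*1 + 4*2 = 10 bytes, matching the kernel's header length.
	fmt.Println(virtioNetHdrSize) // prints 10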
diff --git a/pkg/tcpip/link/fdbased/mmap.go b/pkg/tcpip/link/fdbased/mmap.go
new file mode 100644
index 000000000..2dfd29aa9
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap.go
@@ -0,0 +1,199 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64 linux,arm64
+
+package fdbased
+
+import (
+ "encoding/binary"
+ "syscall"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+ tPacketAlignment = uintptr(16)
+ tpStatusKernel = 0
+ tpStatusUser = 1
+ tpStatusCopy = 2
+ tpStatusLosing = 4
+)
+
+// We overallocate the frame size to accommodate space for the
+// TPacketHdr+RawSockAddrLinkLayer+MAC header and any padding.
+//
+// Memory allocated for the ring buffer: tpBlockSize * tpBlockNR = 2 MiB
+//
+// NOTE:
+// Frames need to be aligned at 16 byte boundaries.
+// BlockSize needs to be page aligned.
+//
+// For details see PACKET_MMAP setting constraints in
+// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+const (
+ tpFrameSize = 65536 + 128
+ tpBlockSize = tpFrameSize * 32
+ tpBlockNR = 1
+ tpFrameNR = (tpBlockSize * tpBlockNR) / tpFrameSize
+)
+
+// tPacketAlign aligns the pointer v at a tPacketAlignment boundary. Direct
+// translation of the TPACKET_ALIGN macro in <linux/if_packet.h>.
+func tPacketAlign(v uintptr) uintptr {
+ return (v + tPacketAlignment - 1) & uintptr(^(tPacketAlignment - 1))
+}
+
+// tPacketReq is the tpacket_req structure as described in
+// https://www.kernel.org/doc/Documentation/networking/packet_mmap.txt
+type tPacketReq struct {
+ tpBlockSize uint32
+ tpBlockNR uint32
+ tpFrameSize uint32
+ tpFrameNR uint32
+}
+
+// tPacketHdr is tpacket_hdr structure as described in <linux/if_packet.h>
+type tPacketHdr []byte
+
+const (
+ tpStatusOffset = 0
+ tpLenOffset = 8
+ tpSnapLenOffset = 12
+ tpMacOffset = 16
+ tpNetOffset = 18
+ tpSecOffset = 20
+ tpUSecOffset = 24
+)
+
+func (t tPacketHdr) tpLen() uint32 {
+ return binary.LittleEndian.Uint32(t[tpLenOffset:])
+}
+
+func (t tPacketHdr) tpSnapLen() uint32 {
+ return binary.LittleEndian.Uint32(t[tpSnapLenOffset:])
+}
+
+func (t tPacketHdr) tpMac() uint16 {
+ return binary.LittleEndian.Uint16(t[tpMacOffset:])
+}
+
+func (t tPacketHdr) tpNet() uint16 {
+ return binary.LittleEndian.Uint16(t[tpNetOffset:])
+}
+
+func (t tPacketHdr) tpSec() uint32 {
+ return binary.LittleEndian.Uint32(t[tpSecOffset:])
+}
+
+func (t tPacketHdr) tpUSec() uint32 {
+ return binary.LittleEndian.Uint32(t[tpUSecOffset:])
+}
+
+func (t tPacketHdr) Payload() []byte {
+ return t[uint32(t.tpMac()) : uint32(t.tpMac())+t.tpSnapLen()]
+}
+
+// packetMMapDispatcher uses a PACKET_RX_RING to read/dispatch inbound packets.
+// See mmap_unsafe.go for implementation details.
+type packetMMapDispatcher struct {
+ // fd is the file descriptor used to send and receive packets.
+ fd int
+
+ // e is the endpoint this dispatcher is attached to.
+ e *endpoint
+
+ // ringBuffer is only used when PacketMMap dispatcher is used and points
+ // to the start of the mmapped PACKET_RX_RING buffer.
+ ringBuffer []byte
+
+ // ringOffset is the current offset into the ring buffer where the next
+ // inbound packet will be placed by the kernel.
+ ringOffset int
+}
+
+func (d *packetMMapDispatcher) readMMappedPacket() ([]byte, *tcpip.Error) {
+ hdr := tPacketHdr(d.ringBuffer[d.ringOffset*tpFrameSize:])
+ for hdr.tpStatus()&tpStatusUser == 0 {
+ event := rawfile.PollEvent{
+ FD: int32(d.fd),
+ Events: unix.POLLIN | unix.POLLERR,
+ }
+ if _, errno := rawfile.BlockingPoll(&event, 1, nil); errno != 0 {
+ if errno == syscall.EINTR {
+ continue
+ }
+ return nil, rawfile.TranslateErrno(errno)
+ }
+ if hdr.tpStatus()&tpStatusCopy != 0 {
+ // This frame is truncated so skip it after flipping the
+ // buffer to the kernel.
+ hdr.setTPStatus(tpStatusKernel)
+ d.ringOffset = (d.ringOffset + 1) % tpFrameNR
+ hdr = (tPacketHdr)(d.ringBuffer[d.ringOffset*tpFrameSize:])
+ continue
+ }
+ }
+
+ // Copy out the packet from the mmapped frame to a locally owned buffer.
+ pkt := make([]byte, hdr.tpSnapLen())
+ copy(pkt, hdr.Payload())
+ // Release packet to kernel.
+ hdr.setTPStatus(tpStatusKernel)
+ d.ringOffset = (d.ringOffset + 1) % tpFrameNR
+ return pkt, nil
+}
+
+// dispatch reads packets from an mmaped ring buffer and dispatches them to the
+// network stack.
+func (d *packetMMapDispatcher) dispatch() (bool, *tcpip.Error) {
+ pkt, err := d.readMMappedPacket()
+ if err != nil {
+ return false, err
+ }
+ var (
+ p tcpip.NetworkProtocolNumber
+ remote, local tcpip.LinkAddress
+ eth header.Ethernet
+ )
+ if d.e.hdrSize > 0 {
+ eth = header.Ethernet(pkt)
+ p = eth.Type()
+ remote = eth.SourceAddress()
+ local = eth.DestinationAddress()
+ } else {
+ // We don't get any indication of what the packet is, so try to guess
+ // if it's an IPv4 or IPv6 packet.
+ switch header.IPVersion(pkt) {
+ case header.IPv4Version:
+ p = header.IPv4ProtocolNumber
+ case header.IPv6Version:
+ p = header.IPv6ProtocolNumber
+ default:
+ return true, nil
+ }
+ }
+
+ pkt = pkt[d.e.hdrSize:]
+ d.e.dispatcher.DeliverNetworkPacket(remote, local, p, &stack.PacketBuffer{
+ Data: buffer.View(pkt).ToVectorisedView(),
+ LinkHeader: buffer.View(eth),
+ })
+ return true, nil
+}
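The headerless branch of dispatch above guesses the network protocol from the
IP version nibble via header.IPVersion. A standalone sketch of that guess, not
part of the diff, assuming the buffer starts with an IP header:

	// ipVersionOf returns the IP version of the packet: the top nibble of
	// the first byte of an IP header is 4 for IPv4 and 6 for IPv6. dispatch
	// drops packets with any other value.
	func ipVersionOf(pkt []byte) int {
		if len(pkt) == 0 {
			return -1
		}
		return int(pkt[0] >> 4)
	}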
diff --git a/pkg/tcpip/link/fdbased/mmap_stub.go b/pkg/tcpip/link/fdbased/mmap_stub.go
new file mode 100644
index 000000000..67be52d67
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap_stub.go
@@ -0,0 +1,23 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build !linux !amd64,!arm64
+
+package fdbased
+
+// Stubbed out version for non-linux/non-amd64/non-arm64 platforms.
+
+func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+ return nil, nil
+}
diff --git a/pkg/tcpip/link/fdbased/mmap_unsafe.go b/pkg/tcpip/link/fdbased/mmap_unsafe.go
new file mode 100644
index 000000000..3894185ae
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/mmap_unsafe.go
@@ -0,0 +1,84 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64 linux,arm64
+
+package fdbased
+
+import (
+ "fmt"
+ "sync/atomic"
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+)
+
+// tPacketHdrlen is the TPACKET_HDRLEN variable defined in <linux/if_packet.h>.
+var tPacketHdrlen = tPacketAlign(unsafe.Sizeof(tPacketHdr{}) + unsafe.Sizeof(syscall.RawSockaddrLinklayer{}))
+
+// tpStatus returns the frame status field.
+// The status is concurrently updated by the kernel; as a result, we must
+// use atomic operations to prevent races.
+func (t tPacketHdr) tpStatus() uint32 {
+ hdr := unsafe.Pointer(&t[0])
+ statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset))
+ return atomic.LoadUint32((*uint32)(statusPtr))
+}
+
+// setTPStatus sets the frame status to the provided status.
+// The status is concurrently updated by the kernel; as a result, we must
+// use atomic operations to prevent races.
+func (t tPacketHdr) setTPStatus(status uint32) {
+ hdr := unsafe.Pointer(&t[0])
+ statusPtr := unsafe.Pointer(uintptr(hdr) + uintptr(tpStatusOffset))
+ atomic.StoreUint32((*uint32)(statusPtr), status)
+}
+
+func newPacketMMapDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+ d := &packetMMapDispatcher{
+ fd: fd,
+ e: e,
+ }
+ pageSize := unix.Getpagesize()
+ if tpBlockSize%pageSize != 0 {
+ return nil, fmt.Errorf("tpBlockSize: %d is not page aligned, pagesize: %d", tpBlockSize, pageSize)
+ }
+ tReq := tPacketReq{
+ tpBlockSize: uint32(tpBlockSize),
+ tpBlockNR: uint32(tpBlockNR),
+ tpFrameSize: uint32(tpFrameSize),
+ tpFrameNR: uint32(tpFrameNR),
+ }
+ // Setup PACKET_RX_RING.
+ if err := setsockopt(d.fd, syscall.SOL_PACKET, syscall.PACKET_RX_RING, unsafe.Pointer(&tReq), unsafe.Sizeof(tReq)); err != nil {
+ return nil, fmt.Errorf("failed to enable PACKET_RX_RING: %v", err)
+ }
+ // Let's mmap the blocks.
+ sz := tpBlockSize * tpBlockNR
+ buf, err := syscall.Mmap(d.fd, 0, sz, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED)
+ if err != nil {
+ return nil, fmt.Errorf("syscall.Mmap(...,0, %v, ...) failed = %v", sz, err)
+ }
+ d.ringBuffer = buf
+ return d, nil
+}
+
+func setsockopt(fd, level, name int, val unsafe.Pointer, vallen uintptr) error {
+ if _, _, errno := syscall.Syscall6(syscall.SYS_SETSOCKOPT, uintptr(fd), uintptr(level), uintptr(name), uintptr(val), vallen, 0); errno != 0 {
+ return error(errno)
+ }
+
+ return nil
+}
diff --git a/pkg/tcpip/link/fdbased/packet_dispatchers.go b/pkg/tcpip/link/fdbased/packet_dispatchers.go
new file mode 100644
index 000000000..f04738cfb
--- /dev/null
+++ b/pkg/tcpip/link/fdbased/packet_dispatchers.go
@@ -0,0 +1,317 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package fdbased
+
+import (
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// BufConfig defines the shape of the vectorised view used to read packets from the NIC.
+var BufConfig = []int{128, 256, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}
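+//
+// For example, a 700-byte read fills the 128-byte view, both 256-byte views,
+// and 60 bytes of the 512-byte view; capViews below then trims that fourth
+// view to 60 bytes and reports four views used.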
+
+// readVDispatcher uses readv() system call to read inbound packets and
+// dispatches them.
+type readVDispatcher struct {
+ // fd is the file descriptor used to send and receive packets.
+ fd int
+
+ // e is the endpoint this dispatcher is attached to.
+ e *endpoint
+
+ // views are the actual buffers that hold the packet contents.
+ views []buffer.View
+
+ // iovecs are initialized with the base pointer/length of the
+ // corresponding entries in the views defined above, except when GSO is
+ // enabled, in which case the first iovec points to a buffer for the
+ // vnet header, which is stripped before the views are passed up the
+ // stack for further processing.
+ iovecs []syscall.Iovec
+}
+
+func newReadVDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+ d := &readVDispatcher{fd: fd, e: e}
+ d.views = make([]buffer.View, len(BufConfig))
+ iovLen := len(BufConfig)
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ iovLen++
+ }
+ d.iovecs = make([]syscall.Iovec, iovLen)
+ return d, nil
+}
+
+func (d *readVDispatcher) allocateViews(bufConfig []int) {
+ var vnetHdr [virtioNetHdrSize]byte
+ vnetHdrOff := 0
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ // The kernel adds virtioNetHdr before each packet, but we
+ // don't use it, so we allocate a buffer for it and add it to
+ // iovecs without adding it to a view.
+ d.iovecs[0] = syscall.Iovec{
+ Base: &vnetHdr[0],
+ Len: uint64(virtioNetHdrSize),
+ }
+ vnetHdrOff++
+ }
+ for i := 0; i < len(bufConfig); i++ {
+ if d.views[i] != nil {
+ break
+ }
+ b := buffer.NewView(bufConfig[i])
+ d.views[i] = b
+ d.iovecs[i+vnetHdrOff] = syscall.Iovec{
+ Base: &b[0],
+ Len: uint64(len(b)),
+ }
+ }
+}
+
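+// capViews caps the length of the last view used by an n-byte packet so that
+// the views exactly cover the n bytes read, and returns the number of views
+// used. buffers is the size configuration (BufConfig) that the views were
+// allocated from.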
+func (d *readVDispatcher) capViews(n int, buffers []int) int {
+ c := 0
+ for i, s := range buffers {
+ c += s
+ if c >= n {
+ d.views[i].CapLength(s - (c - n))
+ return i + 1
+ }
+ }
+ return len(buffers)
+}
+
+// dispatch reads one packet from the file descriptor and dispatches it.
+func (d *readVDispatcher) dispatch() (bool, *tcpip.Error) {
+ d.allocateViews(BufConfig)
+
+ n, err := rawfile.BlockingReadv(d.fd, d.iovecs)
+ if err != nil {
+ return false, err
+ }
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ // Skip virtioNetHdr which is added before each packet, it
+ // isn't used and it isn't in a view.
+ n -= virtioNetHdrSize
+ }
+ if n <= d.e.hdrSize {
+ return false, nil
+ }
+
+ var (
+ p tcpip.NetworkProtocolNumber
+ remote, local tcpip.LinkAddress
+ eth header.Ethernet
+ )
+ if d.e.hdrSize > 0 {
+ eth = header.Ethernet(d.views[0][:header.EthernetMinimumSize])
+ p = eth.Type()
+ remote = eth.SourceAddress()
+ local = eth.DestinationAddress()
+ } else {
+ // We don't get any indication of what the packet is, so try to guess
+ // if it's an IPv4 or IPv6 packet.
+ switch header.IPVersion(d.views[0]) {
+ case header.IPv4Version:
+ p = header.IPv4ProtocolNumber
+ case header.IPv6Version:
+ p = header.IPv6ProtocolNumber
+ default:
+ return true, nil
+ }
+ }
+
+ used := d.capViews(n, BufConfig)
+ pkt := &stack.PacketBuffer{
+ Data: buffer.NewVectorisedView(n, append([]buffer.View(nil), d.views[:used]...)),
+ LinkHeader: buffer.View(eth),
+ }
+ pkt.Data.TrimFront(d.e.hdrSize)
+
+ d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
+
+ // Prepare d.views for another packet: release used views.
+ for i := 0; i < used; i++ {
+ d.views[i] = nil
+ }
+
+ return true, nil
+}
+
+// recvMMsgDispatcher uses the recvmmsg system call to read inbound packets and
+// dispatches them.
+type recvMMsgDispatcher struct {
+ // fd is the file descriptor used to send and receive packets.
+ fd int
+
+ // e is the endpoint this dispatcher is attached to.
+ e *endpoint
+
+ // views is an array of arrays of buffers that hold packet contents.
+ views [][]buffer.View
+
+ // iovecs is an array of arrays of iovec records where each iovec base
+ // pointer and length are initialized to the corresponding view above,
+ // except when GSO is enabled, in which case the first iovec in each
+ // array points to a buffer for the vnet header, which is stripped
+ // before the views are passed up the stack for further processing.
+ iovecs [][]syscall.Iovec
+
+ // msgHdrs is an array of MMsgHdr objects where each MMsgHdr is used to
+ // reference an array of iovecs in the iovecs field defined above. This
+ // array is passed as the parameter to the recvmmsg call to retrieve
+ // potentially more than one packet per syscall.
+ msgHdrs []rawfile.MMsgHdr
+}
+
+const (
+ // MaxMsgsPerRecv is the maximum number of packets we want to retrieve
+ // in a single RecvMMsg call.
+ MaxMsgsPerRecv = 8
+)
+
+func newRecvMMsgDispatcher(fd int, e *endpoint) (linkDispatcher, error) {
+ d := &recvMMsgDispatcher{
+ fd: fd,
+ e: e,
+ }
+ d.views = make([][]buffer.View, MaxMsgsPerRecv)
+ for i := range d.views {
+ d.views[i] = make([]buffer.View, len(BufConfig))
+ }
+ d.iovecs = make([][]syscall.Iovec, MaxMsgsPerRecv)
+ iovLen := len(BufConfig)
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ // virtioNetHdr is prepended before each packet.
+ iovLen++
+ }
+ for i := range d.iovecs {
+ d.iovecs[i] = make([]syscall.Iovec, iovLen)
+ }
+ d.msgHdrs = make([]rawfile.MMsgHdr, MaxMsgsPerRecv)
+ for i := range d.msgHdrs {
+ d.msgHdrs[i].Msg.Iov = &d.iovecs[i][0]
+ d.msgHdrs[i].Msg.Iovlen = uint64(iovLen)
+ }
+ return d, nil
+}
+
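+// capViews is the per-message variant of readVDispatcher.capViews: it caps
+// the last view of message k so that the views exactly cover the n bytes
+// read, returning the number of views used.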
+func (d *recvMMsgDispatcher) capViews(k, n int, buffers []int) int {
+ c := 0
+ for i, s := range buffers {
+ c += s
+ if c >= n {
+ d.views[k][i].CapLength(s - (c - n))
+ return i + 1
+ }
+ }
+ return len(buffers)
+}
+
+func (d *recvMMsgDispatcher) allocateViews(bufConfig []int) {
+ for k := 0; k < len(d.views); k++ {
+ var vnetHdr [virtioNetHdrSize]byte
+ vnetHdrOff := 0
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ // The kernel adds virtioNetHdr before each packet, but we
+ // don't use it, so we allocate a buffer for it and add it to
+ // iovecs without adding it to a view.
+ d.iovecs[k][0] = syscall.Iovec{
+ Base: &vnetHdr[0],
+ Len: uint64(virtioNetHdrSize),
+ }
+ vnetHdrOff++
+ }
+ for i := 0; i < len(bufConfig); i++ {
+ if d.views[k][i] != nil {
+ break
+ }
+ b := buffer.NewView(bufConfig[i])
+ d.views[k][i] = b
+ d.iovecs[k][i+vnetHdrOff] = syscall.Iovec{
+ Base: &b[0],
+ Len: uint64(len(b)),
+ }
+ }
+ }
+}
+
+// dispatch reads up to MaxMsgsPerRecv packets at a time from the file
+// descriptor and dispatches them.
+func (d *recvMMsgDispatcher) dispatch() (bool, *tcpip.Error) {
+ d.allocateViews(BufConfig)
+
+ nMsgs, err := rawfile.BlockingRecvMMsg(d.fd, d.msgHdrs)
+ if err != nil {
+ return false, err
+ }
+ // Process each of the received packets.
+ for k := 0; k < nMsgs; k++ {
+ n := int(d.msgHdrs[k].Len)
+ if d.e.Capabilities()&stack.CapabilityHardwareGSO != 0 {
+ n -= virtioNetHdrSize
+ }
+ if n <= d.e.hdrSize {
+ return false, nil
+ }
+
+ var (
+ p tcpip.NetworkProtocolNumber
+ remote, local tcpip.LinkAddress
+ eth header.Ethernet
+ )
+ if d.e.hdrSize > 0 {
+ eth = header.Ethernet(d.views[k][0])
+ p = eth.Type()
+ remote = eth.SourceAddress()
+ local = eth.DestinationAddress()
+ } else {
+ // We don't get any indication of what the packet is, so try to guess
+ // if it's an IPv4 or IPv6 packet.
+ switch header.IPVersion(d.views[k][0]) {
+ case header.IPv4Version:
+ p = header.IPv4ProtocolNumber
+ case header.IPv6Version:
+ p = header.IPv6ProtocolNumber
+ default:
+ return true, nil
+ }
+ }
+
+ used := d.capViews(k, int(n), BufConfig)
+ pkt := &stack.PacketBuffer{
+ Data: buffer.NewVectorisedView(int(n), append([]buffer.View(nil), d.views[k][:used]...)),
+ LinkHeader: buffer.View(eth),
+ }
+ pkt.Data.TrimFront(d.e.hdrSize)
+ d.e.dispatcher.DeliverNetworkPacket(remote, local, p, pkt)
+
+ // Prepare d.views for another packet: release used views.
+ for i := 0; i < used; i++ {
+ d.views[k][i] = nil
+ }
+ }
+
+ for k := 0; k < nMsgs; k++ {
+ d.msgHdrs[k].Len = 0
+ }
+
+ return true, nil
+}
diff --git a/pkg/tcpip/link/loopback/BUILD b/pkg/tcpip/link/loopback/BUILD
new file mode 100644
index 000000000..6bf3805b7
--- /dev/null
+++ b/pkg/tcpip/link/loopback/BUILD
@@ -0,0 +1,15 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "loopback",
+ srcs = ["loopback.go"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/loopback/loopback.go b/pkg/tcpip/link/loopback/loopback.go
new file mode 100644
index 000000000..568c6874f
--- /dev/null
+++ b/pkg/tcpip/link/loopback/loopback.go
@@ -0,0 +1,115 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package loopback provides the implementation of loopback data-link layer
+// endpoints. Such endpoints just turn outbound packets into inbound ones.
+//
+// Loopback endpoints can be used in the networking stack by calling New() to
+// create a new endpoint, and then passing it as an argument to
+// Stack.CreateNIC().
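+//
+// A minimal sketch (s is an assumed, already-constructed *stack.Stack, and
+// NIC ID 1 is arbitrary):
+//
+// if err := s.CreateNIC(1, loopback.New()); err != nil {
+//     // handle the error
+// }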
+package loopback
+
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type endpoint struct {
+ dispatcher stack.NetworkDispatcher
+}
+
+// New creates a new loopback endpoint. This link-layer endpoint just turns
+// outbound packets into inbound packets.
+func New() stack.LinkEndpoint {
+ return &endpoint{}
+}
+
+// Attach implements stack.LinkEndpoint.Attach. It just saves the stack network-
+// layer dispatcher for later use when packets need to be dispatched.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *endpoint) IsAttached() bool {
+ return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns a constant that matches
+// the Linux loopback interface.
+func (*endpoint) MTU() uint32 {
+ return 65536
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. Loopback advertises
+// itself as supporting checksum offload, but in reality checksums are simply
+// omitted, since packets never leave the host.
+func (*endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return stack.CapabilityRXChecksumOffload | stack.CapabilityTXChecksumOffload | stack.CapabilitySaveRestore | stack.CapabilityLoopback
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. Given that the
+// loopback interface doesn't have a header, it just returns 0.
+func (*endpoint) MaxHeaderLength() uint16 {
+ return 0
+}
+
+// LinkAddress returns the link address of this endpoint.
+func (*endpoint) LinkAddress() tcpip.LinkAddress {
+ return ""
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*endpoint) Wait() {}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket. It delivers outbound
+// packets to the network-layer dispatcher.
+func (e *endpoint) WritePacket(_ *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ views := make([]buffer.View, 1, 1+len(pkt.Data.Views()))
+ views[0] = pkt.Header.View()
+ views = append(views, pkt.Data.Views()...)
+
+ // Because we're immediately turning around and writing the packet back
+ // to the rx path, we intentionally don't preserve the remote and local
+ // link addresses from the stack.Route we're passed.
+ e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, &stack.PacketBuffer{
+ Data: buffer.NewVectorisedView(len(views[0])+pkt.Data.Size(), views),
+ })
+
+ return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(*stack.Route, *stack.GSO, stack.PacketBufferList, tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ // There should be an ethernet header at the beginning of vv.
+ hdr, ok := vv.PullUp(header.EthernetMinimumSize)
+ if !ok {
+ // Reject the packet if it's shorter than an ethernet header.
+ return tcpip.ErrBadAddress
+ }
+ linkHeader := header.Ethernet(hdr)
+ vv.TrimFront(len(linkHeader))
+ e.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, linkHeader.Type(), &stack.PacketBuffer{
+ Data: vv,
+ LinkHeader: buffer.View(linkHeader),
+ })
+
+ return nil
+}
diff --git a/pkg/tcpip/link/muxed/BUILD b/pkg/tcpip/link/muxed/BUILD
new file mode 100644
index 000000000..82b441b79
--- /dev/null
+++ b/pkg/tcpip/link/muxed/BUILD
@@ -0,0 +1,28 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "muxed",
+ srcs = ["injectable.go"],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/stack",
+ ],
+)
+
+go_test(
+ name = "muxed_test",
+ size = "small",
+ srcs = ["injectable_test.go"],
+ library = ":muxed",
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/link/fdbased",
+ "//pkg/tcpip/network/ipv4",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/muxed/injectable.go b/pkg/tcpip/link/muxed/injectable.go
new file mode 100644
index 000000000..c69d6b7e9
--- /dev/null
+++ b/pkg/tcpip/link/muxed/injectable.go
@@ -0,0 +1,137 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package muxed provides a muxed link endpoint.
+package muxed
+
+import (
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// InjectableEndpoint is an injectable multiplexing endpoint. The endpoint has
+// trivial routing rules that determine which underlying
+// InjectableLinkEndpoint a given packet will be written to. Note that
+// HandleLocal works differently for this endpoint (see WritePacket).
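+//
+// A hypothetical construction (addrA/addrB and epA/epB are illustrative
+// names, not part of this package):
+//
+// routes := map[tcpip.Address]stack.InjectableLinkEndpoint{
+//     addrA: epA, // packets routed to addrA are written to epA
+//     addrB: epB,
+// }
+// mux := muxed.NewInjectableEndpoint(routes)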
+type InjectableEndpoint struct {
+ routes map[tcpip.Address]stack.InjectableLinkEndpoint
+ dispatcher stack.NetworkDispatcher
+}
+
+// MTU implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) MTU() uint32 {
+ minMTU := ^uint32(0)
+ for _, endpoint := range m.routes {
+ if endpointMTU := endpoint.MTU(); endpointMTU < minMTU {
+ minMTU = endpointMTU
+ }
+ }
+ return minMTU
+}
+
+// Capabilities implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) Capabilities() stack.LinkEndpointCapabilities {
+ minCapabilities := stack.LinkEndpointCapabilities(^uint(0))
+ for _, endpoint := range m.routes {
+ minCapabilities &= endpoint.Capabilities()
+ }
+ return minCapabilities
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) MaxHeaderLength() uint16 {
+ minHeaderLen := ^uint16(0)
+ for _, endpoint := range m.routes {
+ if headerLen := endpoint.MaxHeaderLength(); headerLen < minHeaderLen {
+ minHeaderLen = headerLen
+ }
+ }
+ return minHeaderLen
+}
+
+// LinkAddress implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) LinkAddress() tcpip.LinkAddress {
+ return ""
+}
+
+// Attach implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ for _, endpoint := range m.routes {
+ endpoint.Attach(dispatcher)
+ }
+ m.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.
+func (m *InjectableEndpoint) IsAttached() bool {
+ return m.dispatcher != nil
+}
+
+// InjectInbound implements stack.InjectableLinkEndpoint.
+func (m *InjectableEndpoint) InjectInbound(protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ m.dispatcher.DeliverNetworkPacket("" /* remote */, "" /* local */, protocol, pkt)
+}
+
+// WritePackets writes outbound packets to the appropriate
+// InjectableLinkEndpoint based on the RemoteAddress. HandleLocal only works if
+// r.RemoteAddress has a route registered in this endpoint.
+func (m *InjectableEndpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ endpoint, ok := m.routes[r.RemoteAddress]
+ if !ok {
+ return 0, tcpip.ErrNoRoute
+ }
+ return endpoint.WritePackets(r, gso, pkts, protocol)
+}
+
+// WritePacket writes outbound packets to the appropriate InjectableLinkEndpoint
+// based on the RemoteAddress. HandleLocal only works if r.RemoteAddress has a
+// route registered in this endpoint.
+func (m *InjectableEndpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ if endpoint, ok := m.routes[r.RemoteAddress]; ok {
+ return endpoint.WritePacket(r, gso, protocol, pkt)
+ }
+ return tcpip.ErrNoRoute
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (m *InjectableEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+ // WriteRawPacket doesn't get a route or network address, so there's
+ // nowhere to write this.
+ return tcpip.ErrNoRoute
+}
+
+// InjectOutbound writes outbound packets to the appropriate
+// InjectableLinkEndpoint based on the dest address.
+func (m *InjectableEndpoint) InjectOutbound(dest tcpip.Address, packet []byte) *tcpip.Error {
+ endpoint, ok := m.routes[dest]
+ if !ok {
+ return tcpip.ErrNoRoute
+ }
+ return endpoint.InjectOutbound(dest, packet)
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (m *InjectableEndpoint) Wait() {
+ for _, ep := range m.routes {
+ ep.Wait()
+ }
+}
+
+// NewInjectableEndpoint creates a new multi-endpoint injectable endpoint.
+func NewInjectableEndpoint(routes map[tcpip.Address]stack.InjectableLinkEndpoint) *InjectableEndpoint {
+ return &InjectableEndpoint{
+ routes: routes,
+ }
+}
diff --git a/pkg/tcpip/link/muxed/injectable_test.go b/pkg/tcpip/link/muxed/injectable_test.go
new file mode 100644
index 000000000..0744f66d6
--- /dev/null
+++ b/pkg/tcpip/link/muxed/injectable_test.go
@@ -0,0 +1,98 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package muxed
+
+import (
+ "bytes"
+ "net"
+ "os"
+ "syscall"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/link/fdbased"
+ "gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+func TestInjectableEndpointRawDispatch(t *testing.T) {
+ endpoint, sock, dstIP := makeTestInjectableEndpoint(t)
+
+ endpoint.InjectOutbound(dstIP, []byte{0xFA})
+
+ buf := make([]byte, ipv4.MaxTotalSize)
+ bytesRead, err := sock.Read(buf)
+ if err != nil {
+ t.Fatalf("Unable to read from socketpair: %v", err)
+ }
+ if got, want := buf[:bytesRead], []byte{0xFA}; !bytes.Equal(got, want) {
+ t.Fatalf("Read %v from the socketpair, wanted %v", got, want)
+ }
+}
+
+func TestInjectableEndpointDispatch(t *testing.T) {
+ endpoint, sock, dstIP := makeTestInjectableEndpoint(t)
+
+ hdr := buffer.NewPrependable(1)
+ hdr.Prepend(1)[0] = 0xFA
+ packetRoute := stack.Route{RemoteAddress: dstIP}
+
+ endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buffer.NewViewFromBytes([]byte{0xFB}).ToVectorisedView(),
+ })
+
+ buf := make([]byte, 6500)
+ bytesRead, err := sock.Read(buf)
+ if err != nil {
+ t.Fatalf("Unable to read from socketpair: %v", err)
+ }
+ if got, want := buf[:bytesRead], []byte{0xFA, 0xFB}; !bytes.Equal(got, want) {
+ t.Fatalf("Read %v from the socketpair, wanted %v", got, want)
+ }
+}
+
+func TestInjectableEndpointDispatchHdrOnly(t *testing.T) {
+ endpoint, sock, dstIP := makeTestInjectableEndpoint(t)
+ hdr := buffer.NewPrependable(1)
+ hdr.Prepend(1)[0] = 0xFA
+ packetRoute := stack.Route{RemoteAddress: dstIP}
+ endpoint.WritePacket(&packetRoute, nil /* gso */, ipv4.ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buffer.NewView(0).ToVectorisedView(),
+ })
+ buf := make([]byte, 6500)
+ bytesRead, err := sock.Read(buf)
+ if err != nil {
+ t.Fatalf("Unable to read from socketpair: %v", err)
+ }
+ if got, want := buf[:bytesRead], []byte{0xFA}; !bytes.Equal(got, want) {
+ t.Fatalf("Read %v from the socketpair, wanted %v", got, want)
+ }
+}
+
+func makeTestInjectableEndpoint(t *testing.T) (*InjectableEndpoint, *os.File, tcpip.Address) {
+ dstIP := tcpip.Address(net.ParseIP("1.2.3.4").To4())
+ pair, err := syscall.Socketpair(syscall.AF_UNIX,
+ syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC|syscall.SOCK_NONBLOCK, 0)
+ if err != nil {
+ t.Fatal("Failed to create socket pair:", err)
+ }
+ underlyingEndpoint := fdbased.NewInjectable(pair[1], 6500, stack.CapabilityNone)
+ routes := map[tcpip.Address]stack.InjectableLinkEndpoint{dstIP: underlyingEndpoint}
+ endpoint := NewInjectableEndpoint(routes)
+ return endpoint, os.NewFile(uintptr(pair[0]), "test route end"), dstIP
+}
diff --git a/pkg/tcpip/link/nested/BUILD b/pkg/tcpip/link/nested/BUILD
new file mode 100644
index 000000000..bdd5276ad
--- /dev/null
+++ b/pkg/tcpip/link/nested/BUILD
@@ -0,0 +1,31 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "nested",
+ srcs = [
+ "nested.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/sync",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/stack",
+ ],
+)
+
+go_test(
+ name = "nested_test",
+ size = "small",
+ srcs = [
+ "nested_test.go",
+ ],
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/nested",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/nested/nested.go b/pkg/tcpip/link/nested/nested.go
new file mode 100644
index 000000000..2998f9c4f
--- /dev/null
+++ b/pkg/tcpip/link/nested/nested.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package nested provides helpers to implement the pattern of nested
+// stack.LinkEndpoints.
+package nested
+
+import (
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// Endpoint is a wrapper around stack.LinkEndpoint and stack.NetworkDispatcher
+// that can be used to implement nesting safely by providing lifecycle
+// concurrency guards.
+//
+// See the tests in this package for example usage.
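+//
+// A typical embedder looks roughly like this sketch, mirroring the pattern in
+// nested_test.go (wrapper is an illustrative name):
+//
+// type wrapper struct {
+//     nested.Endpoint
+// }
+//
+// func newWrapper(lower stack.LinkEndpoint) *wrapper {
+//     w := &wrapper{}
+//     // Inbound packets from lower are delivered to w first, which can
+//     // then forward them via w.Endpoint.DeliverNetworkPacket.
+//     w.Endpoint.Init(lower, w)
+//     return w
+// }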
+type Endpoint struct {
+ child stack.LinkEndpoint
+ embedder stack.NetworkDispatcher
+
+ // mu protects dispatcher.
+ mu sync.RWMutex
+ dispatcher stack.NetworkDispatcher
+}
+
+var _ stack.GSOEndpoint = (*Endpoint)(nil)
+var _ stack.LinkEndpoint = (*Endpoint)(nil)
+var _ stack.NetworkDispatcher = (*Endpoint)(nil)
+
+// Init initializes a nested.Endpoint that uses embedder as the dispatcher for
+// child on Attach.
+//
+// See the tests in this package for example usage.
+func (e *Endpoint) Init(child stack.LinkEndpoint, embedder stack.NetworkDispatcher) {
+ e.child = child
+ e.embedder = embedder
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.
+func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ e.mu.RLock()
+ d := e.dispatcher
+ e.mu.RUnlock()
+ if d != nil {
+ d.DeliverNetworkPacket(remote, local, protocol, pkt)
+ }
+}
+
+// Attach implements stack.LinkEndpoint.
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.mu.Lock()
+ e.dispatcher = dispatcher
+ e.mu.Unlock()
+ // If we're attaching to a valid dispatcher, pass embedder as the dispatcher
+ // to our child, otherwise detach the child by giving it a nil dispatcher.
+ var pass stack.NetworkDispatcher
+ if dispatcher != nil {
+ pass = e.embedder
+ }
+ e.child.Attach(pass)
+}
+
+// IsAttached implements stack.LinkEndpoint.
+func (e *Endpoint) IsAttached() bool {
+ e.mu.RLock()
+ isAttached := e.dispatcher != nil
+ e.mu.RUnlock()
+ return isAttached
+}
+
+// MTU implements stack.LinkEndpoint.
+func (e *Endpoint) MTU() uint32 {
+ return e.child.MTU()
+}
+
+// Capabilities implements stack.LinkEndpoint.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return e.child.Capabilities()
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.
+func (e *Endpoint) MaxHeaderLength() uint16 {
+ return e.child.MaxHeaderLength()
+}
+
+// LinkAddress implements stack.LinkEndpoint.
+func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
+ return e.child.LinkAddress()
+}
+
+// WritePacket implements stack.LinkEndpoint.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ return e.child.WritePacket(r, gso, protocol, pkt)
+}
+
+// WritePackets implements stack.LinkEndpoint.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ return e.child.WritePackets(r, gso, pkts, protocol)
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ return e.child.WriteRawPacket(vv)
+}
+
+// Wait implements stack.LinkEndpoint.
+func (e *Endpoint) Wait() {
+ e.child.Wait()
+}
+
+// GSOMaxSize implements stack.GSOEndpoint.
+func (e *Endpoint) GSOMaxSize() uint32 {
+ if e, ok := e.child.(stack.GSOEndpoint); ok {
+ return e.GSOMaxSize()
+ }
+ return 0
+}
diff --git a/pkg/tcpip/link/nested/nested_test.go b/pkg/tcpip/link/nested/nested_test.go
new file mode 100644
index 000000000..c1a219f02
--- /dev/null
+++ b/pkg/tcpip/link/nested/nested_test.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package nested_test
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/nested"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type parentEndpoint struct {
+ nested.Endpoint
+}
+
+var _ stack.LinkEndpoint = (*parentEndpoint)(nil)
+var _ stack.NetworkDispatcher = (*parentEndpoint)(nil)
+
+type childEndpoint struct {
+ stack.LinkEndpoint
+ dispatcher stack.NetworkDispatcher
+}
+
+var _ stack.LinkEndpoint = (*childEndpoint)(nil)
+
+func (c *childEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ c.dispatcher = dispatcher
+}
+
+func (c *childEndpoint) IsAttached() bool {
+ return c.dispatcher != nil
+}
+
+type counterDispatcher struct {
+ count int
+}
+
+var _ stack.NetworkDispatcher = (*counterDispatcher)(nil)
+
+func (d *counterDispatcher) DeliverNetworkPacket(tcpip.LinkAddress, tcpip.LinkAddress, tcpip.NetworkProtocolNumber, *stack.PacketBuffer) {
+ d.count++
+}
+
+func TestNestedLinkEndpoint(t *testing.T) {
+ const emptyAddress = tcpip.LinkAddress("")
+
+ var (
+ childEP childEndpoint
+ nestedEP parentEndpoint
+ disp counterDispatcher
+ )
+ nestedEP.Endpoint.Init(&childEP, &nestedEP)
+
+ if childEP.IsAttached() {
+ t.Error("On init, childEP.IsAttached() = true, want = false")
+ }
+ if nestedEP.IsAttached() {
+ t.Error("On init, nestedEP.IsAttached() = true, want = false")
+ }
+
+ nestedEP.Attach(&disp)
+ if disp.count != 0 {
+ t.Fatalf("After attach, got disp.count = %d, want = 0", disp.count)
+ }
+ if !childEP.IsAttached() {
+ t.Error("After attach, childEP.IsAttached() = false, want = true")
+ }
+ if !nestedEP.IsAttached() {
+ t.Error("After attach, nestedEP.IsAttached() = false, want = true")
+ }
+
+ nestedEP.DeliverNetworkPacket(emptyAddress, emptyAddress, header.IPv4ProtocolNumber, &stack.PacketBuffer{})
+ if disp.count != 1 {
+ t.Errorf("After first packet with dispatcher attached, got disp.count = %d, want = 1", disp.count)
+ }
+
+ nestedEP.Attach(nil)
+ if childEP.IsAttached() {
+ t.Error("After detach, childEP.IsAttached() = true, want = false")
+ }
+ if nestedEP.IsAttached() {
+ t.Error("After detach, nestedEP.IsAttached() = true, want = false")
+ }
+
+ disp.count = 0
+ nestedEP.DeliverNetworkPacket(emptyAddress, emptyAddress, header.IPv4ProtocolNumber, &stack.PacketBuffer{})
+ if disp.count != 0 {
+ t.Errorf("After second packet with dispatcher detached, got disp.count = %d, want = 0", disp.count)
+ }
+}
diff --git a/pkg/tcpip/link/qdisc/fifo/BUILD b/pkg/tcpip/link/qdisc/fifo/BUILD
new file mode 100644
index 000000000..054c213bc
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/BUILD
@@ -0,0 +1,19 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "fifo",
+ srcs = [
+ "endpoint.go",
+ "packet_buffer_queue.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/sleep",
+ "//pkg/sync",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/qdisc/fifo/endpoint.go b/pkg/tcpip/link/qdisc/fifo/endpoint.go
new file mode 100644
index 000000000..b5dfb7850
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/endpoint.go
@@ -0,0 +1,209 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fifo provides the implementation of data-link layer endpoints that
+// wrap another endpoint, queue all outbound packets, and asynchronously
+// dispatch them to the lower endpoint.
+package fifo
+
+import (
+ "gvisor.dev/gvisor/pkg/sleep"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// endpoint represents a LinkEndpoint which implements a FIFO queue for all
+// outgoing packets. endpoint can have one or more underlying queueDispatchers.
+// All outgoing packets are consistently hashed to a single underlying queue
+// using PacketBuffer.Hash if set; otherwise all packets are queued to the
+// first queue to avoid reordering when the hash is missing.
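+//
+// For example (illustrative numbers): with 4 dispatchers, a packet whose Hash
+// is 7 is enqueued on queue 7 % 4 = 3, so packets of the same flow stay on
+// one queue and keep their relative order.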
+type endpoint struct {
+ dispatcher stack.NetworkDispatcher
+ lower stack.LinkEndpoint
+ wg sync.WaitGroup
+ dispatchers []*queueDispatcher
+}
+
+// queueDispatcher is responsible for dispatching all outbound packets in its
+// queue. It also batches packets when possible and writes them through the
+// lower LinkEndpoint.
+type queueDispatcher struct {
+ lower stack.LinkEndpoint
+ q *packetBufferQueue
+ newPacketWaker sleep.Waker
+ closeWaker sleep.Waker
+}
+
+// New creates a new fifo link endpoint with n queues, each with a maximum
+// capacity of queueLen packets.
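+//
+// For example (the queue count and length are illustrative, not tuned
+// defaults):
+//
+// qdisc := fifo.New(lowerEP, runtime.GOMAXPROCS(0), 1000)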
+func New(lower stack.LinkEndpoint, n int, queueLen int) stack.LinkEndpoint {
+ e := &endpoint{
+ lower: lower,
+ }
+ // Create the required dispatchers
+ for i := 0; i < n; i++ {
+ qd := &queueDispatcher{
+ q: &packetBufferQueue{limit: queueLen},
+ lower: lower,
+ }
+ e.dispatchers = append(e.dispatchers, qd)
+ e.wg.Add(1)
+ go func() {
+ defer e.wg.Done()
+ qd.dispatchLoop()
+ }()
+ }
+ return e
+}
+
+func (q *queueDispatcher) dispatchLoop() {
+ const newPacketWakerID = 1
+ const closeWakerID = 2
+ s := sleep.Sleeper{}
+ s.AddWaker(&q.newPacketWaker, newPacketWakerID)
+ s.AddWaker(&q.closeWaker, closeWakerID)
+ defer s.Done()
+
+ const batchSize = 32
+ var batch stack.PacketBufferList
+ for {
+ id, ok := s.Fetch(true)
+ if ok && id == closeWakerID {
+ return
+ }
+ for pkt := q.q.dequeue(); pkt != nil; pkt = q.q.dequeue() {
+ batch.PushBack(pkt)
+ if batch.Len() < batchSize && !q.q.empty() {
+ continue
+ }
+ // We pass a protocol of zero here because each packet carries its
+ // NetworkProtocol.
+ q.lower.WritePackets(nil /* route */, nil /* gso */, batch, 0 /* protocol */)
+ for pkt := batch.Front(); pkt != nil; pkt = pkt.Next() {
+ pkt.EgressRoute.Release()
+ batch.Remove(pkt)
+ }
+ batch.Reset()
+ }
+ }
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket.
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
+}
+
+// Attach implements stack.LinkEndpoint.Attach.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.dispatcher = dispatcher
+ e.lower.Attach(e)
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *endpoint) IsAttached() bool {
+ return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU.
+func (e *endpoint) MTU() uint32 {
+ return e.lower.MTU()
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (e *endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return e.lower.Capabilities()
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength.
+func (e *endpoint) MaxHeaderLength() uint16 {
+ return e.lower.MaxHeaderLength()
+}
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress.
+func (e *endpoint) LinkAddress() tcpip.LinkAddress {
+ return e.lower.LinkAddress()
+}
+
+// GSOMaxSize returns the maximum GSO packet size.
+func (e *endpoint) GSOMaxSize() uint32 {
+ if gso, ok := e.lower.(stack.GSOEndpoint); ok {
+ return gso.GSOMaxSize()
+ }
+ return 0
+}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ // WritePacket callers do not set the following fields in PacketBuffer,
+ // so we populate them here.
+ newRoute := r.Clone()
+ pkt.EgressRoute = &newRoute
+ pkt.GSOOptions = gso
+ pkt.NetworkProtocolNumber = protocol
+ d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
+ if !d.q.enqueue(pkt) {
+ return tcpip.ErrNoBufferSpace
+ }
+ d.newPacketWaker.Assert()
+ return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+//
+// Being a batch API, each packet in pkts should have the following fields
+// populated:
+// - pkt.EgressRoute
+// - pkt.GSOOptions
+// - pkt.NetworkProtocolNumber
+func (e *endpoint) WritePackets(_ *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, _ tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ enqueued := 0
+ for pkt := pkts.Front(); pkt != nil; {
+ d := e.dispatchers[int(pkt.Hash)%len(e.dispatchers)]
+ nxt := pkt.Next()
+ // Since the qdisc can hold onto a packet for a long time, we
+ // should Clone the route here to ensure it doesn't get released
+ // while the packet is still in our queue.
+ newRoute := pkt.EgressRoute.Clone()
+ pkt.EgressRoute = &newRoute
+ if !d.q.enqueue(pkt) {
+ if enqueued > 0 {
+ d.newPacketWaker.Assert()
+ }
+ return enqueued, tcpip.ErrNoBufferSpace
+ }
+ pkt = nxt
+ enqueued++
+ d.newPacketWaker.Assert()
+ }
+ return enqueued, nil
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ return e.lower.WriteRawPacket(vv)
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (e *endpoint) Wait() {
+ e.lower.Wait()
+
+ // The linkEP is gone. Tear down the outbound dispatcher goroutines.
+ for i := range e.dispatchers {
+ e.dispatchers[i].closeWaker.Assert()
+ }
+
+ e.wg.Wait()
+}
diff --git a/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go b/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go
new file mode 100644
index 000000000..eb5abb906
--- /dev/null
+++ b/pkg/tcpip/link/qdisc/fifo/packet_buffer_queue.go
@@ -0,0 +1,84 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fifo
+
+import (
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// packetBufferQueue is a bounded, thread-safe queue of PacketBuffers.
+type packetBufferQueue struct {
+ mu sync.Mutex
+ list stack.PacketBufferList
+ limit int
+ used int
+}
+
+// emptyLocked determines if the queue is empty.
+// Preconditions: q.mu must be held.
+func (q *packetBufferQueue) emptyLocked() bool {
+ return q.used == 0
+}
+
+// empty determines if the queue is empty.
+func (q *packetBufferQueue) empty() bool {
+ q.mu.Lock()
+ r := q.emptyLocked()
+ q.mu.Unlock()
+
+ return r
+}
+
+// setLimit updates the limit. No PacketBuffers are immediately dropped if the
+// new limit leaves the queue over capacity.
+func (q *packetBufferQueue) setLimit(limit int) {
+ q.mu.Lock()
+ q.limit = limit
+ q.mu.Unlock()
+}
+
+// enqueue adds the given packet to the queue.
+//
+// It returns true when the PacketBuffer is successfully added to the queue,
+// in which case ownership of the reference is transferred to the queue. It
+// returns false if the queue is full, in which case ownership is retained by
+// the caller.
+func (q *packetBufferQueue) enqueue(s *stack.PacketBuffer) bool {
+ q.mu.Lock()
+ r := q.used < q.limit
+ if r {
+ q.list.PushBack(s)
+ q.used++
+ }
+ q.mu.Unlock()
+
+ return r
+}
+
+// dequeue removes and returns the next PacketBuffer from the queue, if one exists.
+// Ownership is transferred to the caller.
+func (q *packetBufferQueue) dequeue() *stack.PacketBuffer {
+ q.mu.Lock()
+ s := q.list.Front()
+ if s != nil {
+ q.list.Remove(s)
+ q.used--
+ }
+ q.mu.Unlock()
+
+ return s
+}
diff --git a/pkg/tcpip/link/rawfile/BUILD b/pkg/tcpip/link/rawfile/BUILD
new file mode 100644
index 000000000..14b527bc2
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "rawfile",
+ srcs = [
+ "blockingpoll_amd64.s",
+ "blockingpoll_arm64.s",
+ "blockingpoll_noyield_unsafe.go",
+ "blockingpoll_yield_unsafe.go",
+ "errors.go",
+ "rawfile_unsafe.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/tcpip",
+ "@org_golang_x_sys//unix:go_default_library",
+ ],
+)
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_amd64.s b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
new file mode 100644
index 000000000..298bad55d
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_amd64.s
@@ -0,0 +1,41 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// BlockingPoll makes the ppoll() syscall while calling the version of
+// entersyscall that relinquishes the P so that other Gs can run. This is meant
+// to be called in cases when the syscall is expected to block.
+//
+// func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno)
+TEXT ·BlockingPoll(SB),NOSPLIT,$0-40
+ CALL ·callEntersyscallblock(SB)
+ MOVQ fds+0(FP), DI
+ MOVQ nfds+8(FP), SI
+ MOVQ timeout+16(FP), DX
+ MOVQ $0x0, R10 // sigmask parameter which isn't used here
+ MOVQ $0x10f, AX // SYS_PPOLL
+ SYSCALL
+ CMPQ AX, $0xfffffffffffff001
+ JLS ok
+ MOVQ $-1, n+24(FP)
+ NEGQ AX
+ MOVQ AX, err+32(FP)
+ CALL ·callExitsyscall(SB)
+ RET
+ok:
+ MOVQ AX, n+24(FP)
+ MOVQ $0, err+32(FP)
+ CALL ·callExitsyscall(SB)
+ RET
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_arm64.s b/pkg/tcpip/link/rawfile/blockingpoll_arm64.s
new file mode 100644
index 000000000..b62888b93
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_arm64.s
@@ -0,0 +1,42 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "textflag.h"
+
+// BlockingPoll makes the ppoll() syscall while calling the version of
+// entersyscall that relinquishes the P so that other Gs can run. This is meant
+// to be called in cases when the syscall is expected to block.
+//
+// func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (n int, err syscall.Errno)
+TEXT ·BlockingPoll(SB),NOSPLIT,$0-40
+ BL ·callEntersyscallblock(SB)
+ MOVD fds+0(FP), R0
+ MOVD nfds+8(FP), R1
+ MOVD timeout+16(FP), R2
+ MOVD $0x0, R3 // sigmask parameter which isn't used here
+ MOVD $0x49, R8 // SYS_PPOLL
+ SVC
+ CMP $0xfffffffffffff001, R0
+ BLS ok
+ MOVD $-1, R1
+ MOVD R1, n+24(FP)
+ NEG R0, R0
+ MOVD R0, err+32(FP)
+ BL ·callExitsyscall(SB)
+ RET
+ok:
+ MOVD R0, n+24(FP)
+ MOVD $0, err+32(FP)
+ BL ·callExitsyscall(SB)
+ RET
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go
new file mode 100644
index 000000000..621ab8d29
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_noyield_unsafe.go
@@ -0,0 +1,31 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,!amd64,!arm64
+
+package rawfile
+
+import (
+ "syscall"
+ "unsafe"
+)
+
+// BlockingPoll is just a stub function that forwards to the ppoll() system call
+// on non-amd64 and non-arm64 platforms.
+func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno) {
+ n, _, e := syscall.Syscall6(syscall.SYS_PPOLL, uintptr(unsafe.Pointer(fds)),
+ uintptr(nfds), uintptr(unsafe.Pointer(timeout)), 0, 0, 0)
+
+ return int(n), e
+}
diff --git a/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
new file mode 100644
index 000000000..99313ee25
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/blockingpoll_yield_unsafe.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux,amd64 linux,arm64
+// +build go1.12
+// +build !go1.16
+
+// Check go:linkname function signatures when updating Go version.
+
+package rawfile
+
+import (
+ "syscall"
+ _ "unsafe" // for go:linkname
+)
+
+// BlockingPoll on amd64/arm64 makes the ppoll() syscall while calling the
+// version of entersyscall that relinquishes the P so that other Gs can
+// run. This is meant to be called in cases when the syscall is expected to
+// block. On non-amd64/arm64 platforms it just forwards to the ppoll() system
+// call.
+//
+//go:noescape
+func BlockingPoll(fds *PollEvent, nfds int, timeout *syscall.Timespec) (int, syscall.Errno)
+
+// Use go:linkname to call into the runtime. As of Go 1.12 this has to
+// be done from Go code so that we make an ABIInternal call to an
+// ABIInternal function; see https://golang.org/issue/27539.
+
+// We need to call both entersyscallblock and exitsyscall this way so
+// that the runtime's check on the stack pointer lines up.
+
+// Note that calling an unexported function in the runtime package is
+// unsafe and this hack is likely to break in future Go releases.
+
+//go:linkname entersyscallblock runtime.entersyscallblock
+func entersyscallblock()
+
+//go:linkname exitsyscall runtime.exitsyscall
+func exitsyscall()
+
+// These forwarding functions must be nosplit because 1) we must
+// disallow preemption between entersyscallblock and exitsyscall, and
+// 2) we have an untyped assembly frame on the stack which can not be
+// grown or moved.
+
+//go:nosplit
+func callEntersyscallblock() {
+ entersyscallblock()
+}
+
+//go:nosplit
+func callExitsyscall() {
+ exitsyscall()
+}
diff --git a/pkg/tcpip/link/rawfile/errors.go b/pkg/tcpip/link/rawfile/errors.go
new file mode 100644
index 000000000..a0a873c84
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/errors.go
@@ -0,0 +1,70 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package rawfile
+
+import (
+ "fmt"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const maxErrno = 134
+
+var translations [maxErrno]*tcpip.Error
+
+// TranslateErrno translates an errno from the syscall package into a
+// *tcpip.Error.
+//
+// Valid, but unrecognized errnos will be translated to
+// tcpip.ErrInvalidEndpointState (EINVAL). Panics on invalid errnos.
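+//
+// For example, TranslateErrno(syscall.ENOBUFS) returns
+// tcpip.ErrNoBufferSpace, per the table built in init below.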
+func TranslateErrno(e syscall.Errno) *tcpip.Error {
+ if err := translations[e]; err != nil {
+ return err
+ }
+ return tcpip.ErrInvalidEndpointState
+}
+
+func addTranslation(host syscall.Errno, trans *tcpip.Error) {
+ if translations[host] != nil {
+ panic(fmt.Sprintf("duplicate translation for host errno %q (%d)", host.Error(), host))
+ }
+ translations[host] = trans
+}
+
+func init() {
+ addTranslation(syscall.EEXIST, tcpip.ErrDuplicateAddress)
+ addTranslation(syscall.ENETUNREACH, tcpip.ErrNoRoute)
+ addTranslation(syscall.EINVAL, tcpip.ErrInvalidEndpointState)
+ addTranslation(syscall.EALREADY, tcpip.ErrAlreadyConnecting)
+ addTranslation(syscall.EISCONN, tcpip.ErrAlreadyConnected)
+ addTranslation(syscall.EADDRINUSE, tcpip.ErrPortInUse)
+ addTranslation(syscall.EADDRNOTAVAIL, tcpip.ErrBadLocalAddress)
+ addTranslation(syscall.EPIPE, tcpip.ErrClosedForSend)
+ addTranslation(syscall.EWOULDBLOCK, tcpip.ErrWouldBlock)
+ addTranslation(syscall.ECONNREFUSED, tcpip.ErrConnectionRefused)
+ addTranslation(syscall.ETIMEDOUT, tcpip.ErrTimeout)
+ addTranslation(syscall.EINPROGRESS, tcpip.ErrConnectStarted)
+ addTranslation(syscall.EDESTADDRREQ, tcpip.ErrDestinationRequired)
+ addTranslation(syscall.ENOTSUP, tcpip.ErrNotSupported)
+ addTranslation(syscall.ENOTTY, tcpip.ErrQueueSizeNotSupported)
+ addTranslation(syscall.ENOTCONN, tcpip.ErrNotConnected)
+ addTranslation(syscall.ECONNRESET, tcpip.ErrConnectionReset)
+ addTranslation(syscall.ECONNABORTED, tcpip.ErrConnectionAborted)
+ addTranslation(syscall.EMSGSIZE, tcpip.ErrMessageTooLong)
+ addTranslation(syscall.ENOBUFS, tcpip.ErrNoBufferSpace)
+}
diff --git a/pkg/tcpip/link/rawfile/rawfile_unsafe.go b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
new file mode 100644
index 000000000..69de6eb3e
--- /dev/null
+++ b/pkg/tcpip/link/rawfile/rawfile_unsafe.go
@@ -0,0 +1,192 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package rawfile contains utilities for using the netstack with raw host
+// files on Linux hosts.
+package rawfile
+
+import (
+ "syscall"
+ "unsafe"
+
+ "golang.org/x/sys/unix"
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+// GetMTU determines the MTU of a network interface device.
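+//
+// For example (the interface name is illustrative):
+//
+// mtu, err := rawfile.GetMTU("eth0")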
+func GetMTU(name string) (uint32, error) {
+ fd, err := syscall.Socket(syscall.AF_UNIX, syscall.SOCK_DGRAM, 0)
+ if err != nil {
+ return 0, err
+ }
+
+ defer syscall.Close(fd)
+
+ var ifreq struct {
+ name [16]byte
+ mtu int32
+ _ [20]byte
+ }
+
+ copy(ifreq.name[:], name)
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.SIOCGIFMTU, uintptr(unsafe.Pointer(&ifreq)))
+ if errno != 0 {
+ return 0, errno
+ }
+
+ return uint32(ifreq.mtu), nil
+}
+
+// NonBlockingWrite writes the given buffer to a file descriptor. It fails if
+// partial data is written.
+func NonBlockingWrite(fd int, buf []byte) *tcpip.Error {
+ var ptr unsafe.Pointer
+ if len(buf) > 0 {
+ ptr = unsafe.Pointer(&buf[0])
+ }
+
+ _, _, e := syscall.RawSyscall(syscall.SYS_WRITE, uintptr(fd), uintptr(ptr), uintptr(len(buf)))
+ if e != 0 {
+ return TranslateErrno(e)
+ }
+
+ return nil
+}
+
+// NonBlockingWrite3 writes up to three byte slices to a file descriptor in a
+// single syscall. It fails if partial data is written.
+func NonBlockingWrite3(fd int, b1, b2, b3 []byte) *tcpip.Error {
+ // If there is no second and third buffer, issue a regular write.
+ if len(b2) == 0 && len(b3) == 0 {
+ return NonBlockingWrite(fd, b1)
+ }
+
+ // Build the iovec that represents them and issue a writev syscall.
+ iovec := [3]syscall.Iovec{
+ {
+ Base: &b1[0],
+ Len: uint64(len(b1)),
+ },
+ {
+ Base: &b2[0],
+ Len: uint64(len(b2)),
+ },
+ }
+ iovecLen := uintptr(2)
+
+ if len(b3) > 0 {
+ iovecLen++
+ iovec[2].Base = &b3[0]
+ iovec[2].Len = uint64(len(b3))
+ }
+
+ _, _, e := syscall.RawSyscall(syscall.SYS_WRITEV, uintptr(fd), uintptr(unsafe.Pointer(&iovec[0])), iovecLen)
+ if e != 0 {
+ return TranslateErrno(e)
+ }
+
+ return nil
+}
+
+// NonBlockingSendMMsg sends multiple messages on a socket.
+func NonBlockingSendMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
+ n, _, e := syscall.RawSyscall6(unix.SYS_SENDMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0)
+ if e != 0 {
+ return 0, TranslateErrno(e)
+ }
+
+ return int(n), nil
+}
+
+// PollEvent represents the pollfd structure passed to a poll() system call.
+type PollEvent struct {
+ FD int32
+ Events int16
+ Revents int16
+}
+
+// BlockingRead reads from a file descriptor that is set up as non-blocking. If
+// no data is available, it will block in a poll() syscall until the file
+// descriptor becomes readable.
+func BlockingRead(fd int, b []byte) (int, *tcpip.Error) {
+ for {
+ n, _, e := syscall.RawSyscall(syscall.SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(&b[0])), uintptr(len(b)))
+ if e == 0 {
+ return int(n), nil
+ }
+
+ event := PollEvent{
+ FD: int32(fd),
+ Events: 1, // POLLIN
+ }
+
+ _, e = BlockingPoll(&event, 1, nil)
+ if e != 0 && e != syscall.EINTR {
+ return 0, TranslateErrno(e)
+ }
+ }
+}
+
+// BlockingReadv reads from a file descriptor that is set up as non-blocking and
+// stores the data in a list of iovecs buffers. If no data is available, it will
+// block in a poll() syscall until the file descriptor becomes readable.
+func BlockingReadv(fd int, iovecs []syscall.Iovec) (int, *tcpip.Error) {
+ for {
+ n, _, e := syscall.RawSyscall(syscall.SYS_READV, uintptr(fd), uintptr(unsafe.Pointer(&iovecs[0])), uintptr(len(iovecs)))
+ if e == 0 {
+ return int(n), nil
+ }
+
+ event := PollEvent{
+ FD: int32(fd),
+ Events: 1, // POLLIN
+ }
+
+ _, e = BlockingPoll(&event, 1, nil)
+ if e != 0 && e != syscall.EINTR {
+ return 0, TranslateErrno(e)
+ }
+ }
+}
+
+// MMsgHdr represents the mmsghdr structure required by recvmmsg() on Linux.
+type MMsgHdr struct {
+ Msg syscall.Msghdr
+ Len uint32
+ _ [4]byte
+}
+
+// BlockingRecvMMsg reads from a file descriptor that is set up as non-blocking
+// and stores the received messages in a slice of MMsgHdr structures. If no data
+// is available, it will block in a poll() syscall until the file descriptor
+// becomes readable.
+func BlockingRecvMMsg(fd int, msgHdrs []MMsgHdr) (int, *tcpip.Error) {
+ for {
+ n, _, e := syscall.RawSyscall6(syscall.SYS_RECVMMSG, uintptr(fd), uintptr(unsafe.Pointer(&msgHdrs[0])), uintptr(len(msgHdrs)), syscall.MSG_DONTWAIT, 0, 0)
+ if e == 0 {
+ return int(n), nil
+ }
+
+ event := PollEvent{
+ FD: int32(fd),
+ Events: 1, // POLLIN
+ }
+
+ if _, e := BlockingPoll(&event, 1, nil); e != 0 && e != syscall.EINTR {
+ return 0, TranslateErrno(e)
+ }
+ }
+}
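
Taken together, these helpers form the typical receive path of an fd-based endpoint: query the device MTU once, then loop on BlockingRead. A minimal sketch of such a consumer follows; readLoop, the "tap0" device name, and the pre-opened non-blocking fd are assumptions for illustration, not part of this package.

package main

import (
	"log"

	"gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
)

// readLoop is a sketch of a consumer of the helpers above. fd is
// assumed to be a non-blocking descriptor for the "tap0" device,
// opened elsewhere.
func readLoop(fd int) {
	mtu, err := rawfile.GetMTU("tap0")
	if err != nil {
		log.Fatalf("GetMTU(tap0) failed: %v", err)
	}

	buf := make([]byte, mtu)
	for {
		// BlockingRead issues a read(2); if no data is ready it
		// parks in poll(2) until the fd becomes readable.
		n, tcpipErr := rawfile.BlockingRead(fd, buf)
		if tcpipErr != nil {
			return
		}
		_ = buf[:n] // hand the frame to the next layer here
	}
}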
diff --git a/pkg/tcpip/link/sharedmem/BUILD b/pkg/tcpip/link/sharedmem/BUILD
new file mode 100644
index 000000000..13243ebbb
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/BUILD
@@ -0,0 +1,41 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "sharedmem",
+ srcs = [
+ "rx.go",
+ "sharedmem.go",
+ "sharedmem_unsafe.go",
+ "tx.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/log",
+ "//pkg/sync",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/rawfile",
+ "//pkg/tcpip/link/sharedmem/queue",
+ "//pkg/tcpip/stack",
+ ],
+)
+
+go_test(
+ name = "sharedmem_test",
+ srcs = [
+ "sharedmem_test.go",
+ ],
+ library = ":sharedmem",
+ deps = [
+ "//pkg/sync",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/sharedmem/pipe",
+ "//pkg/tcpip/link/sharedmem/queue",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/sharedmem/pipe/BUILD b/pkg/tcpip/link/sharedmem/pipe/BUILD
new file mode 100644
index 000000000..87020ec08
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "pipe",
+ srcs = [
+ "pipe.go",
+ "pipe_unsafe.go",
+ "rx.go",
+ "tx.go",
+ ],
+ visibility = ["//visibility:public"],
+)
+
+go_test(
+ name = "pipe_test",
+ srcs = [
+ "pipe_test.go",
+ ],
+ library = ":pipe",
+ deps = ["//pkg/sync"],
+)
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe.go b/pkg/tcpip/link/sharedmem/pipe/pipe.go
new file mode 100644
index 000000000..74c9f0311
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe.go
@@ -0,0 +1,78 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipe implements a shared memory ring buffer on which a single reader
+// and a single writer can operate (read/write) concurrently. The ring buffer
+// allows for data of different sizes to be written, and preserves the boundary
+// of the written data.
+//
+// Example usage is as follows:
+//
+// wb := t.Push(20)
+// // Write data to wb.
+// t.Flush()
+//
+// rb := r.Pull()
+// // Do something with data in rb.
+// r.Flush()
+package pipe
+
+import (
+ "math"
+)
+
+const (
+ jump uint64 = math.MaxUint32 + 1
+ offsetMask uint64 = math.MaxUint32
+ revolutionMask uint64 = ^offsetMask
+
+ sizeOfSlotHeader = 8 // sizeof(uint64)
+ slotFree uint64 = 1 << 63
+ slotSizeMask uint64 = math.MaxUint32
+)
+
+// payloadToSlotSize calculates the total size of a slot based on its payload
+// size. The total size is the header size, plus the payload size, plus padding
+// if necessary to make the total size a multiple of sizeOfSlotHeader.
+func payloadToSlotSize(payloadSize uint64) uint64 {
+ s := sizeOfSlotHeader + payloadSize
+ return (s + sizeOfSlotHeader - 1) &^ (sizeOfSlotHeader - 1)
+}
+
+// slotToPayloadSize calculates the payload size of a slot based on the total
+// size of the slot. This is only meant to be used when creating slots that
+// don't carry information (e.g., free slots or wrap slots).
+func slotToPayloadSize(offset uint64) uint64 {
+ return offset - sizeOfSlotHeader
+}
+
+// pipe is a basic data structure used by both (transmit & receive) ends of a
+// pipe. Indices into this pipe are split into two fields: offset, which counts
+// the number of bytes from the beginning of the buffer, and revolution, which
+// counts the number of times the index has wrapped around.
+type pipe struct {
+ buffer []byte
+}
+
+// init initializes the pipe buffer such that its size is a multiple of the size
+// of the slot header.
+func (p *pipe) init(b []byte) {
+ p.buffer = b[:len(b)&^(sizeOfSlotHeader-1)]
+}
+
+// data returns a section of the buffer starting at the given index (which may
+// include revolution information) and with the given size.
+func (p *pipe) data(idx uint64, size uint64) []byte {
+ return p.buffer[(idx&offsetMask)+sizeOfSlotHeader:][:size]
+}
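
To make the slot arithmetic concrete: a slot is its 8-byte header plus the payload, rounded up to the next 8-byte boundary. A hypothetical standalone check of the numbers the tests below rely on:

package main

import "fmt"

const sizeOfSlotHeader = 8 // mirrors the constant above

func payloadToSlotSize(payloadSize uint64) uint64 {
	s := sizeOfSlotHeader + payloadSize
	return (s + sizeOfSlotHeader - 1) &^ (sizeOfSlotHeader - 1)
}

func main() {
	// 8 + 50 = 58, rounded up to 64: a 50-byte payload consumes a
	// 64-byte slot, which is why the tests below see the write
	// position at offset 64 after one 50-byte push.
	fmt.Println(payloadToSlotSize(50)) // 64
	// 8 + 20 = 28, rounded up to 32.
	fmt.Println(payloadToSlotSize(20)) // 32
}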
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_test.go b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
new file mode 100644
index 000000000..dc239a0d0
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe_test.go
@@ -0,0 +1,518 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "math/rand"
+ "reflect"
+ "runtime"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/sync"
+)
+
+func TestSimpleReadWrite(t *testing.T) {
+ // Check that a simple write can be properly read from the rx side.
+ tr := rand.New(rand.NewSource(99))
+ rr := rand.New(rand.NewSource(99))
+
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ wb := tx.Push(10)
+ if wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ for i := range wb {
+ wb[i] = byte(tr.Intn(256))
+ }
+ tx.Flush()
+
+ var rx Rx
+ rx.Init(b)
+ rb := rx.Pull()
+ if len(rb) != 10 {
+ t.Fatalf("Bad buffer size returned: got %v, want %v", len(rb), 10)
+ }
+
+ for i := range rb {
+ if v := byte(rr.Intn(256)); v != rb[i] {
+ t.Fatalf("Bad read buffer at index %v: got %v, want %v", i, rb[i], v)
+ }
+ }
+ rx.Flush()
+}
+
+func TestEmptyRead(t *testing.T) {
+ // Check that pulling from an empty pipe fails.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on empty pipe")
+ }
+}
+
+func TestTooLargeWrite(t *testing.T) {
+ // Check that writes that are too large are properly rejected.
+ b := make([]byte, 96)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(96); wb != nil {
+ t.Fatalf("Write of 96 bytes succeeded on 96-byte pipe")
+ }
+
+ if wb := tx.Push(88); wb != nil {
+ t.Fatalf("Write of 88 bytes succeeded on 96-byte pipe")
+ }
+
+ if wb := tx.Push(80); wb == nil {
+ t.Fatalf("Write of 80 bytes failed on 96-byte pipe")
+ }
+}
+
+func TestFullWrite(t *testing.T) {
+ // Check that writes fail when the pipe is full.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(80); wb == nil {
+ t.Fatalf("Write of 80 bytes failed on 96-byte pipe")
+ }
+
+ if wb := tx.Push(1); wb != nil {
+ t.Fatalf("Write succeeded on full pipe")
+ }
+}
+
+func TestFullAndFlushedWrite(t *testing.T) {
+ // Check that writes fail when the pipe is full and has already been
+ // flushed.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(80); wb == nil {
+ t.Fatalf("Write of 80 bytes failed on 96-byte pipe")
+ }
+
+ tx.Flush()
+
+ if wb := tx.Push(1); wb != nil {
+ t.Fatalf("Write succeeded on full pipe")
+ }
+}
+
+func TestTxFlushTwice(t *testing.T) {
+ // Checks that a second consecutive tx flush is a no-op.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ tx.Flush()
+
+ // Make a copy of the original tx queue, flush it, then check that it
+ // didn't change.
+ orig := tx
+ tx.Flush()
+
+ if !reflect.DeepEqual(orig, tx) {
+ t.Fatalf("Flush mutated tx pipe: got %v, want %v", tx, orig)
+ }
+}
+
+func TestRxFlushTwice(t *testing.T) {
+ // Checks that a second consecutive rx flush is a no-op.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ tx.Flush()
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+ rx.Flush()
+
+ // Make a copy of the original rx queue, flush it, then check that it
+ // didn't change.
+ orig := rx
+ rx.Flush()
+
+ if !reflect.DeepEqual(orig, rx) {
+ t.Fatalf("Flush mutated rx pipe: got %v, want %v", rx, orig)
+ }
+}
+
+func TestWrapInMiddleOfTransaction(t *testing.T) {
+ // Check that writes are not flushed when we need to wrap the buffer
+ // around.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ tx.Flush()
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+ rx.Flush()
+
+ // At this point the ring buffer is empty, but the write is at offset
+ // 64 (50 + sizeOfSlotHeader + padding-for-8-byte-alignment).
+ if wb := tx.Push(10); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on non-full pipe")
+ }
+
+ // We haven't flushed yet, so pull must return nil.
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on non-flushed pipe")
+ }
+
+ tx.Flush()
+
+ // The two buffers must be available now.
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+}
+
+func TestWriteAbort(t *testing.T) {
+ // Check that a read fails on a pipe that has had data pushed to it but
+ // has aborted the push.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(10); wb == nil {
+ t.Fatalf("Write failed on empty pipe")
+ }
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on empty pipe")
+ }
+
+ tx.Abort()
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on empty pipe")
+ }
+}
+
+func TestWrappedWriteAbort(t *testing.T) {
+ // Check that writes are properly aborted even if the writes wrap
+ // around.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ tx.Flush()
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+ rx.Flush()
+
+ // At this point the ring buffer is empty, but the write is at offset
+ // 64 (50 + sizeOfSlotHeader + padding-for-8-byte-alignment).
+ if wb := tx.Push(10); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on non-full pipe")
+ }
+
+ // We haven't flushed yet, so pull must return nil.
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on non-flushed pipe")
+ }
+
+ tx.Abort()
+
+ // The pushes were aborted, so no data should be readable.
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on non-flushed pipe")
+ }
+
+ // Try the same transactions again, but flush this time.
+ if wb := tx.Push(10); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on non-full pipe")
+ }
+
+ tx.Flush()
+
+ // The two buffers must be available now.
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+}
+
+func TestEmptyReadOnNonFlushedWrite(t *testing.T) {
+ // Check that a read fails on a pipe that has had data pushed to it
+ // but not yet flushed.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(10); wb == nil {
+ t.Fatalf("Write failed on empty pipe")
+ }
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on empty pipe")
+ }
+
+ tx.Flush()
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull on failed on non-empty pipe")
+ }
+}
+
+func TestPullAfterPullingEntirePipe(t *testing.T) {
+ // Check that Pull fails when the pipe is full but all of it has
+ // already been pulled and not yet flushed.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ tx.Flush()
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+ rx.Flush()
+
+ // At this point the ring buffer is empty, but the write is at offset
+ // 64 (50 + sizeOfSlotHeader + padding-for-8-byte-alignment). Write 3
+ // buffers that will fill the pipe.
+ if wb := tx.Push(10); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+
+ if wb := tx.Push(20); wb == nil {
+ t.Fatalf("Push failed on non-full pipe")
+ }
+
+ if wb := tx.Push(24); wb == nil {
+ t.Fatalf("Push failed on non-full pipe")
+ }
+
+ tx.Flush()
+
+ // The three buffers must be available now.
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+
+ // Fourth pull must fail.
+ if rb := rx.Pull(); rb != nil {
+ t.Fatalf("Pull succeeded on empty pipe")
+ }
+}
+
+func TestNoRoomToWrapOnPush(t *testing.T) {
+ // Check that Push fails when it tries to allocate room to add a wrap
+ // message.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ tx.Flush()
+
+ var rx Rx
+ rx.Init(b)
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+ rx.Flush()
+
+ // At this point the ring buffer is empty, but the write is at offset
+ // 64 (50 + sizeOfSlotHeader + padding-for-8-byte-alignment). Write 20,
+ // whose slot (8-byte header + 20 bytes + 4 bytes of padding) would end
+ // exactly at the 96-byte end of the buffer, which counts as overflow,
+ // so it wraps around.
+ if wb := tx.Push(20); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+
+ tx.Flush()
+
+ // Buffer offset is now at 32 (8-byte header + 20 bytes + 4 bytes of
+ // padding). Try to write 70, which would require a wrap slot that
+ // cannot be created now.
+ if wb := tx.Push(70); wb != nil {
+ t.Fatalf("Push succeeded on pipe with no room for wrap message")
+ }
+}
+
+func TestRxImplicitFlushOfWrapMessage(t *testing.T) {
+ // Check that if the first read is of a wrapping message, it gets
+ // flushed immediately.
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ if wb := tx.Push(50); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+ tx.Flush()
+
+ // This will cause a wrapping message to be written.
+ if wb := tx.Push(60); wb != nil {
+ t.Fatalf("Push succeeded when there is no room in pipe")
+ }
+
+ var rx Rx
+ rx.Init(b)
+
+ // Read the first message.
+ if rb := rx.Pull(); rb == nil {
+ t.Fatalf("Pull failed on non-empty pipe")
+ }
+ rx.Flush()
+
+ // This should fail because the wrapping message is taking up space.
+ if wb := tx.Push(60); wb != nil {
+ t.Fatalf("Push succeeded when there is no room in pipe")
+ }
+
+ // Try to read the next one. This should consume the wrapping message.
+ rx.Pull()
+
+ // This must now succeed.
+ if wb := tx.Push(60); wb == nil {
+ t.Fatalf("Push failed on empty pipe")
+ }
+}
+
+func TestConcurrentReaderWriter(t *testing.T) {
+ // Push a million buffers of random sizes and random contents. Check
+ // that buffers read match what was written.
+ tr := rand.New(rand.NewSource(99))
+ rr := rand.New(rand.NewSource(99))
+
+ b := make([]byte, 100)
+ var tx Tx
+ tx.Init(b)
+
+ var rx Rx
+ rx.Init(b)
+
+ const count = 1000000
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ runtime.Gosched()
+ for i := 0; i < count; i++ {
+ n := 1 + tr.Intn(80)
+ wb := tx.Push(uint64(n))
+ for wb == nil {
+ wb = tx.Push(uint64(n))
+ }
+
+ for j := range wb {
+ wb[j] = byte(tr.Intn(256))
+ }
+
+ tx.Flush()
+ }
+ }()
+
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ runtime.Gosched()
+ for i := 0; i < count; i++ {
+ n := 1 + rr.Intn(80)
+ rb := rx.Pull()
+ for rb == nil {
+ rb = rx.Pull()
+ }
+
+ if n != len(rb) {
+ t.Fatalf("Bad %v-th buffer length: got %v, want %v", i, len(rb), n)
+ }
+
+ for j := range rb {
+ if v := byte(rr.Intn(256)); v != rb[j] {
+ t.Fatalf("Bad %v-th read buffer at index %v: got %v, want %v", i, j, rb[j], v)
+ }
+ }
+
+ rx.Flush()
+ }
+ }()
+
+ wg.Wait()
+}
diff --git a/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go
new file mode 100644
index 000000000..62d17029e
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/pipe_unsafe.go
@@ -0,0 +1,35 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+import (
+ "sync/atomic"
+ "unsafe"
+)
+
+func (p *pipe) write(idx uint64, v uint64) {
+ ptr := (*uint64)(unsafe.Pointer(&p.buffer[idx&offsetMask:][:8][0]))
+ *ptr = v
+}
+
+func (p *pipe) writeAtomic(idx uint64, v uint64) {
+ ptr := (*uint64)(unsafe.Pointer(&p.buffer[idx&offsetMask:][:8][0]))
+ atomic.StoreUint64(ptr, v)
+}
+
+func (p *pipe) readAtomic(idx uint64) uint64 {
+ ptr := (*uint64)(unsafe.Pointer(&p.buffer[idx&offsetMask:][:8][0]))
+ return atomic.LoadUint64(ptr)
+}
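
The split between write and writeAtomic carries the synchronization protocol: slots not yet visible to the peer may be written with plain stores, and only the final atomic store of a published header hands them over. The sketch below (a publish helper written as if inside this package, purely illustrative and not part of the real file) spells out that ordering:

package pipe

// publish sketches the ordering Tx.Flush relies on: the payload and
// any interior headers are written with plain stores, and only the
// final atomic store of the tail header makes the batch visible, so
// the reader can never observe a torn or half-initialized slot.
func (p *pipe) publish(next, tail, tailHeader, payloadSize uint64, payload []byte) {
	p.write(next, payloadSize)               // not yet visible to the reader
	copy(p.data(next, payloadSize), payload) // fill the slot body
	p.writeAtomic(tail, tailHeader)          // publication point
}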
diff --git a/pkg/tcpip/link/sharedmem/pipe/rx.go b/pkg/tcpip/link/sharedmem/pipe/rx.go
new file mode 100644
index 000000000..f22e533ac
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/rx.go
@@ -0,0 +1,93 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+// Rx is the receive side of the shared memory ring buffer.
+type Rx struct {
+ p pipe
+
+ tail uint64
+ head uint64
+}
+
+// Init initializes the receive end of the pipe. In the initial state, the next
+// slot to be inspected is the very first one.
+func (r *Rx) Init(b []byte) {
+ r.p.init(b)
+ r.tail = 0xfffffffe * jump
+ r.head = r.tail
+}
+
+// Pull reads the next buffer from the pipe, returning nil if there isn't one
+// currently available.
+//
+// The returned slice is available until Flush() is next called. After that, it
+// must not be touched.
+func (r *Rx) Pull() []byte {
+ if r.head == r.tail+jump {
+ // We've already pulled the whole pipe.
+ return nil
+ }
+
+ header := r.p.readAtomic(r.head)
+ if header&slotFree != 0 {
+ // The next slot is free, we can't pull it yet.
+ return nil
+ }
+
+ payloadSize := header & slotSizeMask
+ newHead := r.head + payloadToSlotSize(payloadSize)
+ headWrap := (r.head & revolutionMask) | uint64(len(r.p.buffer))
+
+ // Check if this is a wrapping slot. If that's the case, it carries no
+ // data, so we just skip it and try again from the first slot.
+ if int64(newHead-headWrap) >= 0 {
+ if int64(newHead-headWrap) > int64(jump) || newHead&offsetMask != 0 {
+ return nil
+ }
+
+ if r.tail == r.head {
+ // If this is the first pull since the last Flush()
+ // call, we flush the state so that the sender can use
+ // this space if it needs to.
+ r.p.writeAtomic(r.head, slotFree|slotToPayloadSize(newHead-r.head))
+ r.tail = newHead
+ }
+
+ r.head = newHead
+ return r.Pull()
+ }
+
+ // Grab the buffer before updating r.head.
+ b := r.p.data(r.head, payloadSize)
+ r.head = newHead
+ return b
+}
+
+// Flush tells the transmitter that all buffers pulled since the last Flush()
+// have been used, so the transmitter is free to use their slots for further
+// transmission.
+func (r *Rx) Flush() {
+ if r.head == r.tail {
+ return
+ }
+ r.p.writeAtomic(r.tail, slotFree|slotToPayloadSize(r.head-r.tail))
+ r.tail = r.head
+}
+
+// Bytes returns the byte slice on which the pipe operates.
+func (r *Rx) Bytes() []byte {
+ return r.p.buffer
+}
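
A sketch of the whole single-reader side; the consume function and handle callback are assumptions, and b must be the same backing slice the writer passed to Tx.Init:

package main

import "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"

// consume drains the pipe, handing each buffer to handle. Real users
// block on an eventfd instead of spinning when Pull returns nil.
func consume(b []byte, handle func([]byte)) {
	var rx pipe.Rx
	rx.Init(b)
	for {
		buf := rx.Pull()
		if buf == nil {
			// Nothing flushed by the writer yet; return the
			// slots pulled so far and poll again.
			rx.Flush()
			continue
		}
		handle(buf) // buf is valid only until the next Flush()
	}
}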
diff --git a/pkg/tcpip/link/sharedmem/pipe/tx.go b/pkg/tcpip/link/sharedmem/pipe/tx.go
new file mode 100644
index 000000000..9841eb231
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/pipe/tx.go
@@ -0,0 +1,161 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pipe
+
+// Tx is the transmit side of the shared memory ring buffer.
+type Tx struct {
+ p pipe
+ maxPayloadSize uint64
+
+ head uint64
+ tail uint64
+ next uint64
+
+ tailHeader uint64
+}
+
+// Init initializes the transmit end of the pipe. In the initial state, the next
+// slot to be written is the very first one, and the transmitter has the whole
+// ring buffer available to it.
+func (t *Tx) Init(b []byte) {
+ t.p.init(b)
+ // maxPayloadSize excludes the header of the payload, and the header
+ // of the wrapping message.
+ t.maxPayloadSize = uint64(len(t.p.buffer)) - 2*sizeOfSlotHeader
+ t.tail = 0xfffffffe * jump
+ t.next = t.tail
+ t.head = t.tail + jump
+ t.p.write(t.tail, slotFree)
+}
+
+// Capacity determines how many records of the given size can be written to the
+// pipe before it fills up.
+func (t *Tx) Capacity(recordSize uint64) uint64 {
+ available := uint64(len(t.p.buffer)) - sizeOfSlotHeader
+ entryLen := payloadToSlotSize(recordSize)
+ return available / entryLen
+}
+
+// Push reserves "payloadSize" bytes for transmission in the pipe. The caller
+// populates the returned slice with the data to be transferred and eventually
+// calls Flush() to make the data visible to the reader, or Abort() to make the
+// pipe forget all Push() calls since the last Flush().
+//
+// The returned slice is available until Flush() or Abort() is next called.
+// After that, it must not be touched.
+func (t *Tx) Push(payloadSize uint64) []byte {
+ // Fail request if we know we will never have enough room.
+ if payloadSize > t.maxPayloadSize {
+ return nil
+ }
+
+ totalLen := payloadToSlotSize(payloadSize)
+ newNext := t.next + totalLen
+ nextWrap := (t.next & revolutionMask) | uint64(len(t.p.buffer))
+ if int64(newNext-nextWrap) >= 0 {
+ // The new buffer would overflow the pipe, so we push a wrapping
+ // slot, then try to add the actual slot to the front of the
+ // pipe.
+ newNext = (newNext & revolutionMask) + jump
+ wrappingPayloadSize := slotToPayloadSize(newNext - t.next)
+ if !t.reclaim(newNext) {
+ return nil
+ }
+
+ oldNext := t.next
+ t.next = newNext
+ if oldNext != t.tail {
+ t.p.write(oldNext, wrappingPayloadSize)
+ } else {
+ t.tailHeader = wrappingPayloadSize
+ t.Flush()
+ }
+
+ newNext += totalLen
+ }
+
+ // Check that we have enough room for the buffer.
+ if !t.reclaim(newNext) {
+ return nil
+ }
+
+ if t.next != t.tail {
+ t.p.write(t.next, payloadSize)
+ } else {
+ t.tailHeader = payloadSize
+ }
+
+ // Grab the buffer before updating t.next.
+ b := t.p.data(t.next, payloadSize)
+ t.next = newNext
+
+ return b
+}
+
+// reclaim attempts to advance the head until at least newNext. If the head is
+// already at or beyond newNext, nothing happens and true is returned; otherwise
+// it tries to reclaim slots that have already been consumed by the receive end
+// of the pipe (they will be marked as free) and returns a boolean indicating
+// whether it was successful in reclaiming enough slots.
+func (t *Tx) reclaim(newNext uint64) bool {
+ for int64(newNext-t.head) > 0 {
+ // Can't reclaim if slot is not free.
+ header := t.p.readAtomic(t.head)
+ if header&slotFree == 0 {
+ return false
+ }
+
+ payloadSize := header & slotSizeMask
+ newHead := t.head + payloadToSlotSize(payloadSize)
+
+ // Check newHead is within bounds and valid.
+ if int64(newHead-t.tail) > int64(jump) || newHead&offsetMask >= uint64(len(t.p.buffer)) {
+ return false
+ }
+
+ t.head = newHead
+ }
+
+ return true
+}
+
+// Abort causes all Push() calls since the last Flush() to be forgotten and
+// therefore they will not be made visible to the receiver.
+func (t *Tx) Abort() {
+ t.next = t.tail
+}
+
+// Flush causes all buffers pushed since the last Flush() [or Abort(), whichever
+// is the most recent] to be made visible to the receiver.
+func (t *Tx) Flush() {
+ if t.next == t.tail {
+ // Nothing to do if there are no pushed buffers.
+ return
+ }
+
+ if t.next != t.head {
+ // The receiver will spin on the slot at t.next, so we must make
+ // sure that the slotFree bit is set there.
+ t.p.write(t.next, slotFree)
+ }
+
+ t.p.writeAtomic(t.tail, t.tailHeader)
+ t.tail = t.next
+}
+
+// Bytes returns the byte slice on which the pipe operates.
+func (t *Tx) Bytes() []byte {
+ return t.p.buffer
+}
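
And the matching single-writer side, showing the Push/Flush/Abort lifecycle; the produce function is an assumption, and b is the slice the reader passed to Rx.Init:

package main

import "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"

// produce pushes each payload as one slot and publishes them all in a
// single Flush, so the reader sees the batch at once.
func produce(b []byte, payloads [][]byte) bool {
	var tx pipe.Tx
	tx.Init(b)
	for _, p := range payloads {
		wb := tx.Push(uint64(len(p)))
		if wb == nil {
			// Not enough room; forget the partial batch. A real
			// sender would retry after the reader flushes.
			tx.Abort()
			return false
		}
		copy(wb, p)
	}
	tx.Flush()
	return true
}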
diff --git a/pkg/tcpip/link/sharedmem/queue/BUILD b/pkg/tcpip/link/sharedmem/queue/BUILD
new file mode 100644
index 000000000..3ba06af73
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/BUILD
@@ -0,0 +1,27 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "queue",
+ srcs = [
+ "rx.go",
+ "tx.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/log",
+ "//pkg/tcpip/link/sharedmem/pipe",
+ ],
+)
+
+go_test(
+ name = "queue_test",
+ srcs = [
+ "queue_test.go",
+ ],
+ library = ":queue",
+ deps = [
+ "//pkg/tcpip/link/sharedmem/pipe",
+ ],
+)
diff --git a/pkg/tcpip/link/sharedmem/queue/queue_test.go b/pkg/tcpip/link/sharedmem/queue/queue_test.go
new file mode 100644
index 000000000..9a0aad5d7
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/queue_test.go
@@ -0,0 +1,517 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package queue
+
+import (
+ "encoding/binary"
+ "reflect"
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+)
+
+func TestBasicTxQueue(t *testing.T) {
+ // Tests that a basic transmit on a queue works, and that completion
+ // gets properly reported as well.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Tx
+ q.Init(pb1, pb2)
+
+ // Enqueue two buffers.
+ b := []TxBuffer{
+ {nil, 100, 60},
+ {nil, 200, 40},
+ }
+
+ b[0].Next = &b[1]
+
+ const usedID = 1002
+ const usedTotalSize = 100
+ if !q.Enqueue(usedID, usedTotalSize, 2, &b[0]) {
+ t.Fatalf("Enqueue failed on empty queue")
+ }
+
+ // Check the contents of the pipe.
+ d := rxp.Pull()
+ if d == nil {
+ t.Fatalf("Tx pipe is empty after Enqueue")
+ }
+
+ want := []byte{
+ 234, 3, 0, 0, 0, 0, 0, 0, // id
+ 100, 0, 0, 0, // total size
+ 0, 0, 0, 0, // reserved
+ 100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+ 60, 0, 0, 0, // size 1
+ 200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+ 40, 0, 0, 0, // size 2
+ }
+
+ if !reflect.DeepEqual(want, d) {
+ t.Fatalf("Bad posted packet: got %v, want %v", d, want)
+ }
+
+ rxp.Flush()
+
+ // Check that there are no completions yet.
+ if _, ok := q.CompletedPacket(); ok {
+ t.Fatalf("Packet reported as completed too soon")
+ }
+
+ // Post a completion.
+ d = txp.Push(8)
+ if d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+ binary.LittleEndian.PutUint64(d, usedID)
+ txp.Flush()
+
+ // Check that completion is properly reported.
+ id, ok := q.CompletedPacket()
+ if !ok {
+ t.Fatalf("Completion not reported")
+ }
+
+ if id != usedID {
+ t.Fatalf("Bad completion id: got %v, want %v", id, usedID)
+ }
+}
+
+func TestBasicRxQueue(t *testing.T) {
+ // Tests that a basic receive on a queue works.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Rx
+ q.Init(pb1, pb2, nil)
+
+ // Post two buffers.
+ b := []RxBuffer{
+ {100, 60, 1077, 0},
+ {200, 40, 2123, 0},
+ }
+
+ if !q.PostBuffers(b) {
+ t.Fatalf("PostBuffers failed on empty queue")
+ }
+
+ // Check the contents of the pipe.
+ want := [][]byte{
+ {
+ 100, 0, 0, 0, 0, 0, 0, 0, // Offset1
+ 60, 0, 0, 0, // Size1
+ 0, 0, 0, 0, // Remaining in group 1
+ 0, 0, 0, 0, 0, 0, 0, 0, // User data 1
+ 53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+ },
+ {
+ 200, 0, 0, 0, 0, 0, 0, 0, // Offset2
+ 40, 0, 0, 0, // Size2
+ 0, 0, 0, 0, // Remaining in group 2
+ 0, 0, 0, 0, 0, 0, 0, 0, // User data 2
+ 75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+ },
+ }
+
+ for i := range b {
+ d := rxp.Pull()
+ if d == nil {
+ t.Fatalf("Tx pipe is empty after PostBuffers")
+ }
+
+ if !reflect.DeepEqual(want[i], d) {
+ t.Fatalf("Bad posted packet: got %v, want %v", d, want[i])
+ }
+
+ rxp.Flush()
+ }
+
+ // Check that there are no completions.
+ if _, n := q.Dequeue(nil); n != 0 {
+ t.Fatalf("Packet reported as received too soon")
+ }
+
+ // Post a completion.
+ d := txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+ if d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+
+ copy(d, []byte{
+ 100, 0, 0, 0, // packet size
+ 0, 0, 0, 0, // reserved
+
+ 100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+ 60, 0, 0, 0, // size 1
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+ 53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+ 200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+ 40, 0, 0, 0, // size 2
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+ 75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+ })
+
+ txp.Flush()
+
+ // Check that completion is properly reported.
+ bufs, n := q.Dequeue(nil)
+ if n != 100 {
+ t.Fatalf("Bad packet size: got %v, want %v", n, 100)
+ }
+
+ if !reflect.DeepEqual(bufs, b) {
+ t.Fatalf("Bad returned buffers: got %v, want %v", bufs, b)
+ }
+}
+
+func TestBadTxCompletion(t *testing.T) {
+ // Check that tx completions with bad sizes are properly ignored.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Tx
+ q.Init(pb1, pb2)
+
+ // Post a completion that is too short, and check that it is ignored.
+ if d := txp.Push(7); d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+ txp.Flush()
+
+ if _, ok := q.CompletedPacket(); ok {
+ t.Fatalf("Bad completion not ignored")
+ }
+
+ // Post a completion that is too long, and check that it is ignored.
+ if d := txp.Push(10); d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+ txp.Flush()
+
+ if _, ok := q.CompletedPacket(); ok {
+ t.Fatalf("Bad completion not ignored")
+ }
+}
+
+func TestBadRxCompletion(t *testing.T) {
+ // Check that bad rx completions are properly ignored.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Rx
+ q.Init(pb1, pb2, nil)
+
+ // Post a completion that is too short, and check that it is ignored.
+ if d := txp.Push(7); d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+ txp.Flush()
+
+ if b, _ := q.Dequeue(nil); b != nil {
+ t.Fatalf("Bad completion not ignored")
+ }
+
+ // Post a completion whose buffer sizes add up to less than the total
+ // size.
+ d := txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+ if d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+
+ copy(d, []byte{
+ 100, 0, 0, 0, // packet size
+ 0, 0, 0, 0, // reserved
+
+ 100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+ 10, 0, 0, 0, // size 1
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+ 53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+ 200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+ 10, 0, 0, 0, // size 2
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+ 75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+ })
+
+ txp.Flush()
+ if b, _ := q.Dequeue(nil); b != nil {
+ t.Fatalf("Bad completion not ignored")
+ }
+
+ // Post a completion whose buffer sizes will cause a 32-bit overflow,
+ // but add up to the right number.
+ d = txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+ if d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+
+ copy(d, []byte{
+ 100, 0, 0, 0, // packet size
+ 0, 0, 0, 0, // reserved
+
+ 100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+ 255, 255, 255, 255, // size 1
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+ 53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+ 200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+ 101, 0, 0, 0, // size 2
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+ 75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+ })
+
+ txp.Flush()
+ if b, _ := q.Dequeue(nil); b != nil {
+ t.Fatalf("Bad completion not ignored")
+ }
+}
+
+func TestFillTxPipe(t *testing.T) {
+ // Check that transmitting a new buffer when the buffer pipe is full
+ // fails gracefully.
+ pb1 := make([]byte, 104)
+ pb2 := make([]byte, 104)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Tx
+ q.Init(pb1, pb2)
+
+ // Transmit twice, which should fill the tx pipe.
+ b := []TxBuffer{
+ {nil, 100, 60},
+ {nil, 200, 40},
+ }
+
+ b[0].Next = &b[1]
+
+ const usedID = 1002
+ const usedTotalSize = 100
+ for i := uint64(0); i < 2; i++ {
+ if !q.Enqueue(usedID+i, usedTotalSize, 2, &b[0]) {
+ t.Fatalf("Failed to transmit buffer")
+ }
+ }
+
+ // Transmit another packet now that the tx pipe is full.
+ if q.Enqueue(usedID+2, usedTotalSize, 2, &b[0]) {
+ t.Fatalf("Enqueue succeeded when tx pipe is full")
+ }
+}
+
+func TestFillRxPipe(t *testing.T) {
+ // Check that posting a new buffer when the buffer pipe is full fails
+ // gracefully.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Rx
+ q.Init(pb1, pb2, nil)
+
+ // Post a buffer twice; this should fill the tx pipe.
+ b := []RxBuffer{
+ {100, 60, 1077, 0},
+ }
+
+ for i := 0; i < 2; i++ {
+ if !q.PostBuffers(b) {
+ t.Fatalf("PostBuffers failed on non-full queue")
+ }
+ }
+
+ // Post another buffer now that the tx pipe is full.
+ if q.PostBuffers(b) {
+ t.Fatalf("PostBuffers succeeded on full queue")
+ }
+}
+
+func TestLotsOfTransmissions(t *testing.T) {
+ // Make sure pipes are being properly flushed when transmitting packets.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Tx
+ q.Init(pb1, pb2)
+
+ // Prepare packet with two buffers.
+ b := []TxBuffer{
+ {nil, 100, 60},
+ {nil, 200, 40},
+ }
+
+ b[0].Next = &b[1]
+
+ const usedID = 1002
+ const usedTotalSize = 100
+
+ // Post 100000 packets and completions.
+ for i := 100000; i > 0; i-- {
+ if !q.Enqueue(usedID, usedTotalSize, 2, &b[0]) {
+ t.Fatalf("Enqueue failed on non-full queue")
+ }
+
+ if d := rxp.Pull(); d == nil {
+ t.Fatalf("Tx pipe is empty after Enqueue")
+ }
+ rxp.Flush()
+
+ d := txp.Push(8)
+ if d == nil {
+ t.Fatalf("Unable to write to rx pipe")
+ }
+ binary.LittleEndian.PutUint64(d, usedID)
+ txp.Flush()
+ if _, ok := q.CompletedPacket(); !ok {
+ t.Fatalf("Completion not returned")
+ }
+ }
+}
+
+func TestLotsOfReceptions(t *testing.T) {
+ // Make sure pipes are being properly flushed when receiving packets.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var rxp pipe.Rx
+ rxp.Init(pb1)
+
+ var txp pipe.Tx
+ txp.Init(pb2)
+
+ var q Rx
+ q.Init(pb1, pb2, nil)
+
+ // Prepare for posting two buffers.
+ b := []RxBuffer{
+ {100, 60, 1077, 0},
+ {200, 40, 2123, 0},
+ }
+
+ // Post 100000 buffers and completions.
+ for i := 100000; i > 0; i-- {
+ if !q.PostBuffers(b) {
+ t.Fatalf("PostBuffers failed on non-full queue")
+ }
+
+ if d := rxp.Pull(); d == nil {
+ t.Fatalf("Tx pipe is empty after PostBuffers")
+ }
+ rxp.Flush()
+
+ if d := rxp.Pull(); d == nil {
+ t.Fatalf("Tx pipe is empty after PostBuffers")
+ }
+ rxp.Flush()
+
+ d := txp.Push(sizeOfConsumedPacketHeader + 2*sizeOfConsumedBuffer)
+ if d == nil {
+ t.Fatalf("Unable to push to rx pipe")
+ }
+
+ copy(d, []byte{
+ 100, 0, 0, 0, // packet size
+ 0, 0, 0, 0, // reserved
+
+ 100, 0, 0, 0, 0, 0, 0, 0, // offset 1
+ 60, 0, 0, 0, // size 1
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 1
+ 53, 4, 0, 0, 0, 0, 0, 0, // ID 1
+
+ 200, 0, 0, 0, 0, 0, 0, 0, // offset 2
+ 40, 0, 0, 0, // size 2
+ 0, 0, 0, 0, 0, 0, 0, 0, // user data 2
+ 75, 8, 0, 0, 0, 0, 0, 0, // ID 2
+ })
+
+ txp.Flush()
+
+ if _, n := q.Dequeue(nil); n == 0 {
+ t.Fatalf("Dequeue failed when there is a completion")
+ }
+ }
+}
+
+func TestRxEnableNotification(t *testing.T) {
+ // Check that enabling notifications results in properly updated state.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var state uint32
+ var q Rx
+ q.Init(pb1, pb2, &state)
+
+ q.EnableNotification()
+ if state != eventFDEnabled {
+ t.Fatalf("Bad value in shared state: got %v, want %v", state, eventFDEnabled)
+ }
+}
+
+func TestRxDisableNotification(t *testing.T) {
+ // Check that disabling notifications results in properly updated state.
+ pb1 := make([]byte, 100)
+ pb2 := make([]byte, 100)
+
+ var state uint32
+ var q Rx
+ q.Init(pb1, pb2, &state)
+
+ q.DisableNotification()
+ if state != eventFDDisabled {
+ t.Fatalf("Bad value in shared state: got %v, want %v", state, eventFDDisabled)
+ }
+}
diff --git a/pkg/tcpip/link/sharedmem/queue/rx.go b/pkg/tcpip/link/sharedmem/queue/rx.go
new file mode 100644
index 000000000..696e6c9e5
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/rx.go
@@ -0,0 +1,221 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package queue provides the implementation of transmit and receive queues
+// based on shared memory ring buffers.
+package queue
+
+import (
+ "encoding/binary"
+ "sync/atomic"
+
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+)
+
+const (
+ // Offsets within a posted buffer.
+ postedOffset = 0
+ postedSize = 8
+ postedRemainingInGroup = 12
+ postedUserData = 16
+ postedID = 24
+
+ sizeOfPostedBuffer = 32
+
+ // Offsets within a received packet header.
+ consumedPacketSize = 0
+ consumedPacketReserved = 4
+
+ sizeOfConsumedPacketHeader = 8
+
+ // Offsets within a consumed buffer.
+ consumedOffset = 0
+ consumedSize = 8
+ consumedUserData = 12
+ consumedID = 20
+
+ sizeOfConsumedBuffer = 28
+
+ // The following are the allowed states of the shared data area.
+ eventFDUninitialized = 0
+ eventFDDisabled = 1
+ eventFDEnabled = 2
+)
+
+// RxBuffer is the descriptor of a receive buffer.
+type RxBuffer struct {
+ Offset uint64
+ Size uint32
+ ID uint64
+ UserData uint64
+}
+
+// Rx is a receive queue. It is implemented with one tx and one rx pipe: the tx
+// pipe is used to "post" buffers, while the rx pipe is used to receive packets
+// whose contents have been written to previously posted buffers.
+//
+// This struct is thread-compatible.
+type Rx struct {
+ tx pipe.Tx
+ rx pipe.Rx
+ sharedEventFDState *uint32
+}
+
+// Init initializes the receive queue with the given pipes, and shared state
+// pointer -- the latter is used to enable/disable eventfd notifications.
+func (r *Rx) Init(tx, rx []byte, sharedEventFDState *uint32) {
+ r.sharedEventFDState = sharedEventFDState
+ r.tx.Init(tx)
+ r.rx.Init(rx)
+}
+
+// EnableNotification updates the shared state such that the peer will notify
+// the eventfd when there are packets to be dequeued.
+func (r *Rx) EnableNotification() {
+ atomic.StoreUint32(r.sharedEventFDState, eventFDEnabled)
+}
+
+// DisableNotification updates the shared state such that the peer will not
+// notify the eventfd.
+func (r *Rx) DisableNotification() {
+ atomic.StoreUint32(r.sharedEventFDState, eventFDDisabled)
+}
+
+// PostedBuffersLimit returns the maximum number of buffers that can be posted
+// before the tx queue fills up.
+func (r *Rx) PostedBuffersLimit() uint64 {
+ return r.tx.Capacity(sizeOfPostedBuffer)
+}
+
+// PostBuffers makes the given buffers available for receiving data from the
+// peer. Once they are posted, the peer is free to write to them and will
+// eventually post them back for consumption.
+func (r *Rx) PostBuffers(buffers []RxBuffer) bool {
+ for i := range buffers {
+ b := r.tx.Push(sizeOfPostedBuffer)
+ if b == nil {
+ r.tx.Abort()
+ return false
+ }
+
+ pb := &buffers[i]
+ binary.LittleEndian.PutUint64(b[postedOffset:], pb.Offset)
+ binary.LittleEndian.PutUint32(b[postedSize:], pb.Size)
+ binary.LittleEndian.PutUint32(b[postedRemainingInGroup:], 0)
+ binary.LittleEndian.PutUint64(b[postedUserData:], pb.UserData)
+ binary.LittleEndian.PutUint64(b[postedID:], pb.ID)
+ }
+
+ r.tx.Flush()
+
+ return true
+}
+
+// Dequeue receives buffers that have been previously posted by PostBuffers()
+// and that have been filled by the peer and posted back.
+//
+// This is similar to append() in that new buffers are appended to "bufs", with
+// reallocation only if "bufs" doesn't have enough capacity.
+func (r *Rx) Dequeue(bufs []RxBuffer) ([]RxBuffer, uint32) {
+ for {
+ outBufs := bufs
+
+ // Pull the next descriptor from the rx pipe.
+ b := r.rx.Pull()
+ if b == nil {
+ return bufs, 0
+ }
+
+ if len(b) < sizeOfConsumedPacketHeader {
+ log.Warningf("Ignoring packet header: size (%v) is less than header size (%v)", len(b), sizeOfConsumedPacketHeader)
+ r.rx.Flush()
+ continue
+ }
+
+ totalDataSize := binary.LittleEndian.Uint32(b[consumedPacketSize:])
+
+ // Calculate the number of buffer descriptors and copy them
+ // over to the output.
+ count := (len(b) - sizeOfConsumedPacketHeader) / sizeOfConsumedBuffer
+ offset := sizeOfConsumedPacketHeader
+ buffersSize := uint32(0)
+ for i := count; i > 0; i-- {
+ s := binary.LittleEndian.Uint32(b[offset+consumedSize:])
+ buffersSize += s
+ if buffersSize < s {
+ // The buffer size overflows an unsigned 32-bit
+ // integer, so break out and force it to be
+ // ignored.
+ totalDataSize = 1
+ buffersSize = 0
+ break
+ }
+
+ outBufs = append(outBufs, RxBuffer{
+ Offset: binary.LittleEndian.Uint64(b[offset+consumedOffset:]),
+ Size: s,
+ ID: binary.LittleEndian.Uint64(b[offset+consumedID:]),
+ })
+
+ offset += sizeOfConsumedBuffer
+ }
+
+ r.rx.Flush()
+
+ if buffersSize < totalDataSize {
+ // The descriptor is corrupted, ignore it.
+ log.Warningf("Ignoring packet: actual data size (%v) less than expected size (%v)", buffersSize, totalDataSize)
+ continue
+ }
+
+ return outBufs, totalDataSize
+ }
+}
+
+// Bytes returns the byte slices on which the queue operates.
+func (r *Rx) Bytes() (tx, rx []byte) {
+ return r.tx.Bytes(), r.rx.Bytes()
+}
+
+// DecodeRxBufferHeader decodes the header of a buffer posted on an rx queue.
+func DecodeRxBufferHeader(b []byte) RxBuffer {
+ return RxBuffer{
+ Offset: binary.LittleEndian.Uint64(b[postedOffset:]),
+ Size: binary.LittleEndian.Uint32(b[postedSize:]),
+ ID: binary.LittleEndian.Uint64(b[postedID:]),
+ UserData: binary.LittleEndian.Uint64(b[postedUserData:]),
+ }
+}
+
+// RxCompletionSize returns the number of bytes needed to encode an rx
+// completion containing "count" buffers.
+func RxCompletionSize(count int) uint64 {
+ return sizeOfConsumedPacketHeader + uint64(count)*sizeOfConsumedBuffer
+}
+
+// EncodeRxCompletion encodes an rx completion header.
+func EncodeRxCompletion(b []byte, size, reserved uint32) {
+ binary.LittleEndian.PutUint32(b[consumedPacketSize:], size)
+ binary.LittleEndian.PutUint32(b[consumedPacketReserved:], reserved)
+}
+
+// EncodeRxCompletionBuffer encodes the i-th rx completion buffer header.
+func EncodeRxCompletionBuffer(b []byte, i int, rxb RxBuffer) {
+ b = b[RxCompletionSize(i):]
+ binary.LittleEndian.PutUint64(b[consumedOffset:], rxb.Offset)
+ binary.LittleEndian.PutUint32(b[consumedSize:], rxb.Size)
+ binary.LittleEndian.PutUint64(b[consumedUserData:], rxb.UserData)
+ binary.LittleEndian.PutUint64(b[consumedID:], rxb.ID)
+}
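
The Decode/Encode helpers above exist for the peer, i.e. the side that fills posted buffers. A sketch of the peer turning one posted descriptor into a single-buffer completion; completeOne and packetLen are illustrative assumptions:

package main

import "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"

// completeOne takes one 32-byte posted-buffer descriptor pulled from
// the queue's tx pipe and builds the completion to push back on the
// rx pipe after packetLen bytes have been written into the buffer.
func completeOne(posted []byte, packetLen uint32) []byte {
	buf := queue.DecodeRxBufferHeader(posted)

	out := make([]byte, queue.RxCompletionSize(1))
	queue.EncodeRxCompletion(out, packetLen, 0)
	queue.EncodeRxCompletionBuffer(out, 0, buf)
	return out
}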
diff --git a/pkg/tcpip/link/sharedmem/queue/tx.go b/pkg/tcpip/link/sharedmem/queue/tx.go
new file mode 100644
index 000000000..beffe807b
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/queue/tx.go
@@ -0,0 +1,151 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package queue
+
+import (
+ "encoding/binary"
+
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+)
+
+const (
+ // Offsets within a packet header.
+ packetID = 0
+ packetSize = 8
+ packetReserved = 12
+
+ sizeOfPacketHeader = 16
+
+ // Offsets within a buffer descriptor.
+ bufferOffset = 0
+ bufferSize = 8
+
+ sizeOfBufferDescriptor = 12
+)
+
+// TxBuffer is the descriptor of a transmit buffer.
+type TxBuffer struct {
+ Next *TxBuffer
+ Offset uint64
+ Size uint32
+}
+
+// Tx is a transmit queue. It is implemented with one tx and one rx pipe: the
+// tx pipe is used to request the transmission of packets, while the rx pipe
+// is used to learn which transmissions have completed.
+//
+// This struct is thread-compatible.
+type Tx struct {
+ tx pipe.Tx
+ rx pipe.Rx
+}
+
+// Init initializes the transmit queue with the given pipes.
+func (t *Tx) Init(tx, rx []byte) {
+ t.tx.Init(tx)
+ t.rx.Init(rx)
+}
+
+// Enqueue queues the given linked list of buffers for transmission as one
+// packet. While it is queued, the caller must not modify them.
+func (t *Tx) Enqueue(id uint64, totalDataLen, bufferCount uint32, buffer *TxBuffer) bool {
+ // Reserve room in the tx pipe.
+ totalLen := sizeOfPacketHeader + uint64(bufferCount)*sizeOfBufferDescriptor
+
+ b := t.tx.Push(totalLen)
+ if b == nil {
+ return false
+ }
+
+ // Initialize the packet and buffer descriptors.
+ binary.LittleEndian.PutUint64(b[packetID:], id)
+ binary.LittleEndian.PutUint32(b[packetSize:], totalDataLen)
+ binary.LittleEndian.PutUint32(b[packetReserved:], 0)
+
+ offset := sizeOfPacketHeader
+ for i := bufferCount; i != 0; i-- {
+ binary.LittleEndian.PutUint64(b[offset+bufferOffset:], buffer.Offset)
+ binary.LittleEndian.PutUint32(b[offset+bufferSize:], buffer.Size)
+ offset += sizeOfBufferDescriptor
+ buffer = buffer.Next
+ }
+
+ t.tx.Flush()
+
+ return true
+}
+
+// CompletedPacket returns the id of the last completed transmission. The
+// returned id, if any, refers to a value passed on a previous call to
+// Enqueue().
+func (t *Tx) CompletedPacket() (id uint64, ok bool) {
+ for {
+ b := t.rx.Pull()
+ if b == nil {
+ return 0, false
+ }
+
+ if len(b) != 8 {
+ t.rx.Flush()
+ log.Warningf("Ignoring completed packet: size (%v) is less than expected (%v)", len(b), 8)
+ continue
+ }
+
+ v := binary.LittleEndian.Uint64(b)
+
+ t.rx.Flush()
+
+ return v, true
+ }
+}
+
+// Bytes returns the byte slices on which the queue operates.
+func (t *Tx) Bytes() (tx, rx []byte) {
+ return t.tx.Bytes(), t.rx.Bytes()
+}
+
+// TxPacketInfo holds information about a packet sent on a tx queue.
+type TxPacketInfo struct {
+ ID uint64
+ Size uint32
+ Reserved uint32
+ BufferCount int
+}
+
+// DecodeTxPacketHeader decodes the header of a packet sent over a tx queue.
+func DecodeTxPacketHeader(b []byte) TxPacketInfo {
+ return TxPacketInfo{
+ ID: binary.LittleEndian.Uint64(b[packetID:]),
+ Size: binary.LittleEndian.Uint32(b[packetSize:]),
+ Reserved: binary.LittleEndian.Uint32(b[packetReserved:]),
+ BufferCount: (len(b) - sizeOfPacketHeader) / sizeOfBufferDescriptor,
+ }
+}
+
+// DecodeTxBufferHeader decodes the header of the i-th buffer of a packet sent
+// over a tx queue.
+func DecodeTxBufferHeader(b []byte, i int) TxBuffer {
+ b = b[sizeOfPacketHeader+i*sizeOfBufferDescriptor:]
+ return TxBuffer{
+ Offset: binary.LittleEndian.Uint64(b[bufferOffset:]),
+ Size: binary.LittleEndian.Uint32(b[bufferSize:]),
+ }
+}
+
+// EncodeTxCompletion encodes a tx completion header.
+func EncodeTxCompletion(b []byte, id uint64) {
+ binary.LittleEndian.PutUint64(b, id)
+}
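
A sketch of the driver side of a Tx queue, pairing Enqueue with CompletedPacket; sendOnce and its constants are assumptions, and q must already be Init()ed on shared pipe memory:

package main

import "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"

// sendOnce queues a single 100-byte packet split across two buffers,
// then reaps any completions that have already been posted back.
func sendOnce(q *queue.Tx) {
	bufs := []queue.TxBuffer{
		{Offset: 100, Size: 60},
		{Offset: 200, Size: 40},
	}
	bufs[0].Next = &bufs[1]

	if !q.Enqueue(1 /* id */, 100 /* total bytes */, 2, &bufs[0]) {
		return // tx pipe full; retry after completions drain
	}

	for {
		id, ok := q.CompletedPacket()
		if !ok {
			break // no more completions for now
		}
		_ = id // buffers belonging to packet "id" may be reused
	}
}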
diff --git a/pkg/tcpip/link/sharedmem/rx.go b/pkg/tcpip/link/sharedmem/rx.go
new file mode 100644
index 000000000..eec11e4cb
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/rx.go
@@ -0,0 +1,159 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package sharedmem
+
+import (
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/tcpip/link/rawfile"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+)
+
+// rx holds all state associated with an rx queue.
+type rx struct {
+ data []byte
+ sharedData []byte
+ q queue.Rx
+ eventFD int
+}
+
+// init initializes all state needed by the rx queue based on the information
+// provided.
+//
+// The caller always retains ownership of all file descriptors passed in. The
+// queue implementation will duplicate any that it may need in the future.
+func (r *rx) init(mtu uint32, c *QueueConfig) error {
+ // Map in all buffers.
+ txPipe, err := getBuffer(c.TxPipeFD)
+ if err != nil {
+ return err
+ }
+
+ rxPipe, err := getBuffer(c.RxPipeFD)
+ if err != nil {
+ syscall.Munmap(txPipe)
+ return err
+ }
+
+ data, err := getBuffer(c.DataFD)
+ if err != nil {
+ syscall.Munmap(txPipe)
+ syscall.Munmap(rxPipe)
+ return err
+ }
+
+ sharedData, err := getBuffer(c.SharedDataFD)
+ if err != nil {
+ syscall.Munmap(txPipe)
+ syscall.Munmap(rxPipe)
+ syscall.Munmap(data)
+ return err
+ }
+
+ // Duplicate the eventFD so that caller can close it but we can still
+ // use it.
+ efd, err := syscall.Dup(c.EventFD)
+ if err != nil {
+ syscall.Munmap(txPipe)
+ syscall.Munmap(rxPipe)
+ syscall.Munmap(data)
+ syscall.Munmap(sharedData)
+ return err
+ }
+
+ // Set the eventfd as non-blocking.
+ if err := syscall.SetNonblock(efd, true); err != nil {
+ syscall.Munmap(txPipe)
+ syscall.Munmap(rxPipe)
+ syscall.Munmap(data)
+ syscall.Munmap(sharedData)
+ syscall.Close(efd)
+ return err
+ }
+
+ // Initialize state based on buffers.
+ r.q.Init(txPipe, rxPipe, sharedDataPointer(sharedData))
+ r.data = data
+ r.eventFD = efd
+ r.sharedData = sharedData
+
+ return nil
+}
+
+// cleanup releases all resources allocated during init(). It must only be
+// called if init() has previously succeeded.
+func (r *rx) cleanup() {
+ a, b := r.q.Bytes()
+ syscall.Munmap(a)
+ syscall.Munmap(b)
+
+ syscall.Munmap(r.data)
+ syscall.Munmap(r.sharedData)
+ syscall.Close(r.eventFD)
+}
+
+// postAndReceive posts the provided buffers (if any), and then tries to read
+// from the receive queue.
+//
+// Capacity permitting, it reuses the posted buffer slice to store the buffers
+// that were read as well.
+//
+// This function will block if there aren't any available packets.
+func (r *rx) postAndReceive(b []queue.RxBuffer, stopRequested *uint32) ([]queue.RxBuffer, uint32) {
+ // Post the buffers first. If we cannot post, sleep until we can. We
+ // never post more than will fit concurrently, so it's safe to wait
+ // until enough room is available.
+ if len(b) != 0 && !r.q.PostBuffers(b) {
+ r.q.EnableNotification()
+ for !r.q.PostBuffers(b) {
+ var tmp [8]byte
+ rawfile.BlockingRead(r.eventFD, tmp[:])
+ if atomic.LoadUint32(stopRequested) != 0 {
+ r.q.DisableNotification()
+ return nil, 0
+ }
+ }
+ r.q.DisableNotification()
+ }
+
+ // Read the next set of descriptors.
+ b, n := r.q.Dequeue(b[:0])
+ if len(b) != 0 {
+ return b, n
+ }
+
+ // Data isn't immediately available. Enable eventfd notifications.
+ r.q.EnableNotification()
+ for {
+ b, n = r.q.Dequeue(b)
+ if len(b) != 0 {
+ break
+ }
+
+ // Wait for notification.
+ var tmp [8]byte
+ rawfile.BlockingRead(r.eventFD, tmp[:])
+ if atomic.LoadUint32(stopRequested) != 0 {
+ r.q.DisableNotification()
+ return nil, 0
+ }
+ }
+ r.q.DisableNotification()
+
+ return b, n
+}
diff --git a/pkg/tcpip/link/sharedmem/sharedmem.go b/pkg/tcpip/link/sharedmem/sharedmem.go
new file mode 100644
index 000000000..0374a2441
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/sharedmem.go
@@ -0,0 +1,289 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package sharedmem provides the implementation of data-link layer endpoints
+// backed by shared memory.
+//
+// Shared memory endpoints can be used in the networking stack by calling New()
+// to create a new endpoint, and then passing it as an argument to
+// Stack.CreateNIC().
+package sharedmem
+
+import (
+ "sync/atomic"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// QueueConfig holds all the file descriptors needed to describe a tx or rx
+// queue over shared memory. It is used when creating new shared memory
+// endpoints to describe tx and rx queues.
+type QueueConfig struct {
+ // DataFD is a file descriptor for the file that contains the data to
+ // be transmitted via this queue. Descriptors contain offsets within
+ // this file.
+ DataFD int
+
+ // EventFD is a file descriptor for the event that is signaled when
+ // data becomes available in this queue.
+ EventFD int
+
+ // TxPipeFD is a file descriptor for the tx pipe associated with the
+ // queue.
+ TxPipeFD int
+
+ // RxPipeFD is a file descriptor for the rx pipe associated with the
+ // queue.
+ RxPipeFD int
+
+ // SharedDataFD is a file descriptor for the file that contains shared
+ // state between the two ends of the queue. This data specifies, for
+ // example, whether EventFD signaling is enabled or disabled.
+ SharedDataFD int
+}
+
+type endpoint struct {
+ // mtu (maximum transmission unit) is the maximum size of a packet.
+ mtu uint32
+
+ // bufferSize is the size of each individual buffer.
+ bufferSize uint32
+
+ // addr is the local address of this endpoint.
+ addr tcpip.LinkAddress
+
+ // rx is the receive queue.
+ rx rx
+
+ // stopRequested is to be accessed atomically only, and determines if
+ // the worker goroutines should stop.
+ stopRequested uint32
+
+ // Wait group used to indicate that all workers have stopped.
+ completed sync.WaitGroup
+
+ // mu protects the following fields.
+ mu sync.Mutex
+
+ // tx is the transmit queue.
+ tx tx
+
+ // workerStarted specifies whether the worker goroutine was started.
+ workerStarted bool
+}
+
+// New creates a new shared-memory-based endpoint. Outgoing packets are broken
+// up into buffers of "bufferSize" bytes.
+func New(mtu, bufferSize uint32, addr tcpip.LinkAddress, tx, rx QueueConfig) (stack.LinkEndpoint, error) {
+ e := &endpoint{
+ mtu: mtu,
+ bufferSize: bufferSize,
+ addr: addr,
+ }
+
+ if err := e.tx.init(bufferSize, &tx); err != nil {
+ return nil, err
+ }
+
+ if err := e.rx.init(bufferSize, &rx); err != nil {
+ e.tx.cleanup()
+ return nil, err
+ }
+
+ return e, nil
+}
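+
+// Example use (a sketch, not part of this change; stk, txCfg, and rxCfg are
+// placeholders, and the descriptors in each QueueConfig are assumed to be
+// created out of band):
+//
+//	ep, err := sharedmem.New(mtu, bufferSize, addr, txCfg, rxCfg)
+//	if err != nil {
+//		// handle error
+//	}
+//	if err := stk.CreateNIC(1, ep); err != nil {
+//		// handle error
+//	}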
+
+// Close frees all resources associated with the endpoint.
+func (e *endpoint) Close() {
+ // Tell dispatch goroutine to stop, then write to the eventfd so that
+ // it wakes up in case it's sleeping.
+ atomic.StoreUint32(&e.stopRequested, 1)
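+	// eventfd counters are 64-bit little-endian values; writing 1
+	// increments the counter and unblocks a reader blocked on the fd.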
+ syscall.Write(e.rx.eventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Cleanup the queues inline if the worker hasn't started yet; we also
+ // know it won't start from now on because stopRequested is set to 1.
+ e.mu.Lock()
+ workerPresent := e.workerStarted
+ e.mu.Unlock()
+
+ if !workerPresent {
+ e.tx.cleanup()
+ e.rx.cleanup()
+ }
+}
+
+// Wait implements stack.LinkEndpoint.Wait. It waits until all workers have
+// stopped after a Close() call.
+func (e *endpoint) Wait() {
+ e.completed.Wait()
+}
+
+// Attach implements stack.LinkEndpoint.Attach. It launches the goroutine that
+// reads packets from the rx queue.
+func (e *endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.mu.Lock()
+ if !e.workerStarted && atomic.LoadUint32(&e.stopRequested) == 0 {
+ e.workerStarted = true
+ e.completed.Add(1)
+		// Link endpoints are not savable. When transport endpoints
+ // are saved, they stop sending outgoing packets and all
+ // incoming packets are rejected.
+ go e.dispatchLoop(dispatcher) // S/R-SAFE: see above.
+ }
+ e.mu.Unlock()
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *endpoint) IsAttached() bool {
+ e.mu.Lock()
+ defer e.mu.Unlock()
+ return e.workerStarted
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It returns the value initialized
+// during construction.
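+// The ethernet header size is subtracted because this endpoint prepends the
+// ethernet framing itself (see WritePacket), so callers see the payload MTU.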
+func (e *endpoint) MTU() uint32 {
+ return e.mtu - header.EthernetMinimumSize
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities.
+func (*endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return 0
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It returns the
+// ethernet frame header size.
+func (*endpoint) MaxHeaderLength() uint16 {
+ return header.EthernetMinimumSize
+}
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress. It returns the local
+// link address.
+func (e *endpoint) LinkAddress() tcpip.LinkAddress {
+ return e.addr
+}
+
+// WritePacket writes an outbound packet to the shared memory transmit queue.
+// It returns ErrWouldBlock if the queue has no room for the packet.
+func (e *endpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ // Add the ethernet header here.
+ eth := header.Ethernet(pkt.Header.Prepend(header.EthernetMinimumSize))
+ pkt.LinkHeader = buffer.View(eth)
+ ethHdr := &header.EthernetFields{
+ DstAddr: r.RemoteLinkAddress,
+ Type: protocol,
+ }
+ if r.LocalLinkAddress != "" {
+ ethHdr.SrcAddr = r.LocalLinkAddress
+ } else {
+ ethHdr.SrcAddr = e.addr
+ }
+ eth.Encode(ethHdr)
+
+ v := pkt.Data.ToView()
+ // Transmit the packet.
+ e.mu.Lock()
+ ok := e.tx.transmit(pkt.Header.View(), v)
+ e.mu.Unlock()
+
+ if !ok {
+ return tcpip.ErrWouldBlock
+ }
+
+ return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *endpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ panic("not implemented")
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ v := vv.ToView()
+ // Transmit the packet.
+ e.mu.Lock()
+ ok := e.tx.transmit(v, buffer.View{})
+ e.mu.Unlock()
+
+ if !ok {
+ return tcpip.ErrWouldBlock
+ }
+
+ return nil
+}
+
+// dispatchLoop reads packets from the rx queue in a loop and dispatches them
+// to the network stack.
+func (e *endpoint) dispatchLoop(d stack.NetworkDispatcher) {
+ // Post initial set of buffers.
+ limit := e.rx.q.PostedBuffersLimit()
+ if l := uint64(len(e.rx.data)) / uint64(e.bufferSize); limit > l {
+ limit = l
+ }
+ for i := uint64(0); i < limit; i++ {
+ b := queue.RxBuffer{
+ Offset: i * uint64(e.bufferSize),
+ Size: e.bufferSize,
+ ID: i,
+ }
+ if !e.rx.q.PostBuffers([]queue.RxBuffer{b}) {
+ log.Warningf("Unable to post %v-th buffer", i)
+ }
+ }
+
+ // Read in a loop until a stop is requested.
+ var rxb []queue.RxBuffer
+ for atomic.LoadUint32(&e.stopRequested) == 0 {
+ var n uint32
+ rxb, n = e.rx.postAndReceive(rxb, &e.stopRequested)
+
+ // Copy data from the shared area to its own buffer, then
+ // prepare to repost the buffer.
+ b := make([]byte, n)
+ offset := uint32(0)
+ for i := range rxb {
+ copy(b[offset:], e.rx.data[rxb[i].Offset:][:rxb[i].Size])
+ offset += rxb[i].Size
+
+ rxb[i].Size = e.bufferSize
+ }
+
+ if n < header.EthernetMinimumSize {
+ continue
+ }
+
+ // Send packet up the stack.
+ eth := header.Ethernet(b[:header.EthernetMinimumSize])
+ d.DeliverNetworkPacket(eth.SourceAddress(), eth.DestinationAddress(), eth.Type(), &stack.PacketBuffer{
+ Data: buffer.View(b[header.EthernetMinimumSize:]).ToVectorisedView(),
+ LinkHeader: buffer.View(eth),
+ })
+ }
+
+ // Clean state.
+ e.tx.cleanup()
+ e.rx.cleanup()
+
+ e.completed.Done()
+}
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_test.go b/pkg/tcpip/link/sharedmem/sharedmem_test.go
new file mode 100644
index 000000000..28a2e88ba
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/sharedmem_test.go
@@ -0,0 +1,812 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+package sharedmem
+
+import (
+ "bytes"
+ "io/ioutil"
+ "math/rand"
+ "os"
+ "strings"
+ "syscall"
+ "testing"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/pipe"
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+const (
+ localLinkAddr = "\xde\xad\xbe\xef\x56\x78"
+ remoteLinkAddr = "\xde\xad\xbe\xef\x12\x34"
+
+ queueDataSize = 1024 * 1024
+ queuePipeSize = 4096
+)
+
+type queueBuffers struct {
+ data []byte
+ rx pipe.Tx
+ tx pipe.Rx
+}
+
+func initQueue(t *testing.T, q *queueBuffers, c *QueueConfig) {
+ // Prepare tx pipe.
+ b, err := getBuffer(c.TxPipeFD)
+ if err != nil {
+ t.Fatalf("getBuffer failed: %v", err)
+ }
+ q.tx.Init(b)
+
+ // Prepare rx pipe.
+ b, err = getBuffer(c.RxPipeFD)
+ if err != nil {
+ t.Fatalf("getBuffer failed: %v", err)
+ }
+ q.rx.Init(b)
+
+ // Get data slice.
+ q.data, err = getBuffer(c.DataFD)
+ if err != nil {
+ t.Fatalf("getBuffer failed: %v", err)
+ }
+}
+
+func (q *queueBuffers) cleanup() {
+ syscall.Munmap(q.tx.Bytes())
+ syscall.Munmap(q.rx.Bytes())
+ syscall.Munmap(q.data)
+}
+
+type packetInfo struct {
+ addr tcpip.LinkAddress
+ proto tcpip.NetworkProtocolNumber
+ vv buffer.VectorisedView
+ linkHeader buffer.View
+}
+
+type testContext struct {
+ t *testing.T
+ ep *endpoint
+ txCfg QueueConfig
+ rxCfg QueueConfig
+ txq queueBuffers
+ rxq queueBuffers
+
+ packetCh chan struct{}
+ mu sync.Mutex
+ packets []packetInfo
+}
+
+func newTestContext(t *testing.T, mtu, bufferSize uint32, addr tcpip.LinkAddress) *testContext {
+ var err error
+ c := &testContext{
+ t: t,
+ packetCh: make(chan struct{}, 1000000),
+ }
+ c.txCfg = createQueueFDs(t, queueSizes{
+ dataSize: queueDataSize,
+ txPipeSize: queuePipeSize,
+ rxPipeSize: queuePipeSize,
+ sharedDataSize: 4096,
+ })
+
+ c.rxCfg = createQueueFDs(t, queueSizes{
+ dataSize: queueDataSize,
+ txPipeSize: queuePipeSize,
+ rxPipeSize: queuePipeSize,
+ sharedDataSize: 4096,
+ })
+
+ initQueue(t, &c.txq, &c.txCfg)
+ initQueue(t, &c.rxq, &c.rxCfg)
+
+ ep, err := New(mtu, bufferSize, addr, c.txCfg, c.rxCfg)
+ if err != nil {
+ t.Fatalf("New failed: %v", err)
+ }
+
+ c.ep = ep.(*endpoint)
+ c.ep.Attach(c)
+
+ return c
+}
+
+func (c *testContext) DeliverNetworkPacket(remoteLinkAddr, localLinkAddr tcpip.LinkAddress, proto tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ c.mu.Lock()
+ c.packets = append(c.packets, packetInfo{
+ addr: remoteLinkAddr,
+ proto: proto,
+ vv: pkt.Data.Clone(nil),
+ })
+ c.mu.Unlock()
+
+ c.packetCh <- struct{}{}
+}
+
+func (c *testContext) cleanup() {
+ c.ep.Close()
+ closeFDs(&c.txCfg)
+ closeFDs(&c.rxCfg)
+ c.txq.cleanup()
+ c.rxq.cleanup()
+}
+
+func (c *testContext) waitForPackets(n int, to <-chan time.Time, errorStr string) {
+ for i := 0; i < n; i++ {
+ select {
+ case <-c.packetCh:
+ case <-to:
+			c.t.Fatal(errorStr)
+ }
+ }
+}
+
+func (c *testContext) pushRxCompletion(size uint32, bs []queue.RxBuffer) {
+ b := c.rxq.rx.Push(queue.RxCompletionSize(len(bs)))
+ queue.EncodeRxCompletion(b, size, 0)
+ for i := range bs {
+ queue.EncodeRxCompletionBuffer(b, i, queue.RxBuffer{
+ Offset: bs[i].Offset,
+ Size: bs[i].Size,
+ ID: bs[i].ID,
+ })
+ }
+}
+
+func randomFill(b []byte) {
+ for i := range b {
+ b[i] = byte(rand.Intn(256))
+ }
+}
+
+func shuffle(b []int) {
+ for i := len(b) - 1; i >= 0; i-- {
+ j := rand.Intn(i + 1)
+ b[i], b[j] = b[j], b[i]
+ }
+}
+
+func createFile(t *testing.T, size int64, initQueue bool) int {
+ tmpDir := os.Getenv("TEST_TMPDIR")
+ if tmpDir == "" {
+ tmpDir = os.Getenv("TMPDIR")
+ }
+ f, err := ioutil.TempFile(tmpDir, "sharedmem_test")
+ if err != nil {
+ t.Fatalf("TempFile failed: %v", err)
+ }
+ defer f.Close()
+ syscall.Unlink(f.Name())
+
+ if initQueue {
+ // Write the "slot-free" flag in the initial queue.
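+		// The bytes encode the little-endian uint64 1<<63, i.e. the
+		// free bit in the first slot header.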
+ _, err := f.WriteAt([]byte{0, 0, 0, 0, 0, 0, 0, 0x80}, 0)
+ if err != nil {
+ t.Fatalf("WriteAt failed: %v", err)
+ }
+ }
+
+ fd, err := syscall.Dup(int(f.Fd()))
+ if err != nil {
+ t.Fatalf("Dup failed: %v", err)
+ }
+
+ if err := syscall.Ftruncate(fd, size); err != nil {
+ syscall.Close(fd)
+ t.Fatalf("Ftruncate failed: %v", err)
+ }
+
+ return fd
+}
+
+func closeFDs(c *QueueConfig) {
+ syscall.Close(c.DataFD)
+ syscall.Close(c.EventFD)
+ syscall.Close(c.TxPipeFD)
+ syscall.Close(c.RxPipeFD)
+ syscall.Close(c.SharedDataFD)
+}
+
+type queueSizes struct {
+ dataSize int64
+ txPipeSize int64
+ rxPipeSize int64
+ sharedDataSize int64
+}
+
+func createQueueFDs(t *testing.T, s queueSizes) QueueConfig {
+ fd, _, err := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, 0, 0)
+ if err != 0 {
+ t.Fatalf("eventfd failed: %v", error(err))
+ }
+
+ return QueueConfig{
+ EventFD: int(fd),
+ DataFD: createFile(t, s.dataSize, false),
+ TxPipeFD: createFile(t, s.txPipeSize, true),
+ RxPipeFD: createFile(t, s.rxPipeSize, true),
+ SharedDataFD: createFile(t, s.sharedDataSize, false),
+ }
+}
+
+// TestSimpleSend sends 1000 packets with random header and payload sizes,
+// then checks that the right payload is received on the shared memory queues.
+func TestSimpleSend(t *testing.T) {
+ c := newTestContext(t, 20000, 1500, localLinkAddr)
+ defer c.cleanup()
+
+ // Prepare route.
+ r := stack.Route{
+ RemoteLinkAddress: remoteLinkAddr,
+ }
+
+ for iters := 1000; iters > 0; iters-- {
+ func() {
+ // Prepare and send packet.
+ n := rand.Intn(10000)
+ hdr := buffer.NewPrependable(n + int(c.ep.MaxHeaderLength()))
+ hdrBuf := hdr.Prepend(n)
+ randomFill(hdrBuf)
+
+ n = rand.Intn(10000)
+ buf := buffer.NewView(n)
+ randomFill(buf)
+
+ proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
+ if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != nil {
+ t.Fatalf("WritePacket failed: %v", err)
+ }
+
+ // Receive packet.
+ desc := c.txq.tx.Pull()
+ pi := queue.DecodeTxPacketHeader(desc)
+ if pi.Reserved != 0 {
+ t.Fatalf("Reserved value is non-zero: 0x%x", pi.Reserved)
+ }
+ contents := make([]byte, 0, pi.Size)
+ for i := 0; i < pi.BufferCount; i++ {
+ bi := queue.DecodeTxBufferHeader(desc, i)
+ contents = append(contents, c.txq.data[bi.Offset:][:bi.Size]...)
+ }
+ c.txq.tx.Flush()
+
+ defer func() {
+ // Tell the endpoint about the completion of the write.
+ b := c.txq.rx.Push(8)
+ queue.EncodeTxCompletion(b, pi.ID)
+ c.txq.rx.Flush()
+ }()
+
+ // Check the ethernet header.
+ ethTemplate := make(header.Ethernet, header.EthernetMinimumSize)
+ ethTemplate.Encode(&header.EthernetFields{
+ SrcAddr: localLinkAddr,
+ DstAddr: remoteLinkAddr,
+ Type: proto,
+ })
+ if got := contents[:header.EthernetMinimumSize]; !bytes.Equal(got, []byte(ethTemplate)) {
+ t.Fatalf("Bad ethernet header in packet: got %x, want %x", got, ethTemplate)
+ }
+
+ // Compare contents skipping the ethernet header added by the
+ // endpoint.
+ merged := append(hdrBuf, buf...)
+ if uint32(len(contents)) < pi.Size {
+ t.Fatalf("Sum of buffers is less than packet size: %v < %v", len(contents), pi.Size)
+ }
+ contents = contents[:pi.Size][header.EthernetMinimumSize:]
+
+ if !bytes.Equal(contents, merged) {
+ t.Fatalf("Buffers are different: got %x (%v bytes), want %x (%v bytes)", contents, len(contents), merged, len(merged))
+ }
+ }()
+ }
+}
+
+// TestPreserveSrcAddressInSend calls WritePacket once with LocalLinkAddress
+// set in Route (using much of the same code as TestSimpleSend), then checks
+// that the encoded ethernet header received includes the correct SrcAddr.
+func TestPreserveSrcAddressInSend(t *testing.T) {
+ c := newTestContext(t, 20000, 1500, localLinkAddr)
+ defer c.cleanup()
+
+ newLocalLinkAddress := tcpip.LinkAddress(strings.Repeat("0xFE", 6))
+ // Set both remote and local link address in route.
+ r := stack.Route{
+ RemoteLinkAddress: remoteLinkAddr,
+ LocalLinkAddress: newLocalLinkAddress,
+ }
+
+ // WritePacket panics given a prependable with anything less than
+ // the minimum size of the ethernet header.
+ hdr := buffer.NewPrependable(header.EthernetMinimumSize)
+
+ proto := tcpip.NetworkProtocolNumber(rand.Intn(0x10000))
+ if err := c.ep.WritePacket(&r, nil /* gso */, proto, &stack.PacketBuffer{
+ Header: hdr,
+ }); err != nil {
+ t.Fatalf("WritePacket failed: %v", err)
+ }
+
+ // Receive packet.
+ desc := c.txq.tx.Pull()
+ pi := queue.DecodeTxPacketHeader(desc)
+ if pi.Reserved != 0 {
+ t.Fatalf("Reserved value is non-zero: 0x%x", pi.Reserved)
+ }
+ contents := make([]byte, 0, pi.Size)
+ for i := 0; i < pi.BufferCount; i++ {
+ bi := queue.DecodeTxBufferHeader(desc, i)
+ contents = append(contents, c.txq.data[bi.Offset:][:bi.Size]...)
+ }
+ c.txq.tx.Flush()
+
+ defer func() {
+ // Tell the endpoint about the completion of the write.
+ b := c.txq.rx.Push(8)
+ queue.EncodeTxCompletion(b, pi.ID)
+ c.txq.rx.Flush()
+ }()
+
+ // Check that the ethernet header contains the expected SrcAddr.
+ ethTemplate := make(header.Ethernet, header.EthernetMinimumSize)
+ ethTemplate.Encode(&header.EthernetFields{
+ SrcAddr: newLocalLinkAddress,
+ DstAddr: remoteLinkAddr,
+ Type: proto,
+ })
+ if got := contents[:header.EthernetMinimumSize]; !bytes.Equal(got, []byte(ethTemplate)) {
+ t.Fatalf("Bad ethernet header in packet: got %x, want %x", got, ethTemplate)
+ }
+}
+
+// TestFillTxQueue sends packets until the queue is full.
+func TestFillTxQueue(t *testing.T) {
+ c := newTestContext(t, 20000, 1500, localLinkAddr)
+ defer c.cleanup()
+
+ // Prepare to send a packet.
+ r := stack.Route{
+ RemoteLinkAddress: remoteLinkAddr,
+ }
+
+ buf := buffer.NewView(100)
+
+	// Each packet uses no more than 40 bytes, so write that many packets
+	// until the tx queue is full.
+ ids := make(map[uint64]struct{})
+ for i := queuePipeSize / 40; i > 0; i-- {
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+
+ if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != nil {
+ t.Fatalf("WritePacket failed unexpectedly: %v", err)
+ }
+
+ // Check that they have different IDs.
+ desc := c.txq.tx.Pull()
+ pi := queue.DecodeTxPacketHeader(desc)
+ if _, ok := ids[pi.ID]; ok {
+ t.Fatalf("ID (%v) reused", pi.ID)
+ }
+ ids[pi.ID] = struct{}{}
+ }
+
+ // Next attempt to write must fail.
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != want {
+ t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+ }
+}
+
+// TestFillTxQueueAfterBadCompletion sends a bad completion, then sends packets
+// until the queue is full.
+func TestFillTxQueueAfterBadCompletion(t *testing.T) {
+ c := newTestContext(t, 20000, 1500, localLinkAddr)
+ defer c.cleanup()
+
+ // Send a bad completion.
+ queue.EncodeTxCompletion(c.txq.rx.Push(8), 1)
+ c.txq.rx.Flush()
+
+ // Prepare to send a packet.
+ r := stack.Route{
+ RemoteLinkAddress: remoteLinkAddr,
+ }
+
+ buf := buffer.NewView(100)
+
+ // Send two packets so that the id slice has at least two slots.
+ for i := 2; i > 0; i-- {
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != nil {
+ t.Fatalf("WritePacket failed unexpectedly: %v", err)
+ }
+ }
+
+ // Complete the two writes twice.
+ for i := 2; i > 0; i-- {
+ pi := queue.DecodeTxPacketHeader(c.txq.tx.Pull())
+
+ queue.EncodeTxCompletion(c.txq.rx.Push(8), pi.ID)
+ queue.EncodeTxCompletion(c.txq.rx.Push(8), pi.ID)
+ c.txq.rx.Flush()
+ }
+ c.txq.tx.Flush()
+
+	// Each packet uses no more than 40 bytes, so write that many packets
+	// until the tx queue is full.
+ ids := make(map[uint64]struct{})
+ for i := queuePipeSize / 40; i > 0; i-- {
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != nil {
+ t.Fatalf("WritePacket failed unexpectedly: %v", err)
+ }
+
+ // Check that they have different IDs.
+ desc := c.txq.tx.Pull()
+ pi := queue.DecodeTxPacketHeader(desc)
+ if _, ok := ids[pi.ID]; ok {
+ t.Fatalf("ID (%v) reused", pi.ID)
+ }
+ ids[pi.ID] = struct{}{}
+ }
+
+ // Next attempt to write must fail.
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != want {
+ t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+ }
+}
+
+// TestFillTxMemory sends packets until we run out of shared memory.
+func TestFillTxMemory(t *testing.T) {
+ const bufferSize = 1500
+ c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+ defer c.cleanup()
+
+ // Prepare to send a packet.
+ r := stack.Route{
+ RemoteLinkAddress: remoteLinkAddr,
+ }
+
+ buf := buffer.NewView(100)
+
+	// Each packet uses up one buffer, so write as many as possible until
+ // we fill the memory.
+ ids := make(map[uint64]struct{})
+ for i := queueDataSize / bufferSize; i > 0; i-- {
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != nil {
+ t.Fatalf("WritePacket failed unexpectedly: %v", err)
+ }
+
+ // Check that they have different IDs.
+ desc := c.txq.tx.Pull()
+ pi := queue.DecodeTxPacketHeader(desc)
+ if _, ok := ids[pi.ID]; ok {
+ t.Fatalf("ID (%v) reused", pi.ID)
+ }
+ ids[pi.ID] = struct{}{}
+ c.txq.tx.Flush()
+ }
+
+ // Next attempt to write must fail.
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ })
+ if want := tcpip.ErrWouldBlock; err != want {
+ t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+ }
+}
+
+// TestFillTxMemoryWithMultiBuffer sends packets until we run out of
+// shared memory for a 2-buffer packet, but still with room for a 1-buffer
+// packet.
+func TestFillTxMemoryWithMultiBuffer(t *testing.T) {
+ const bufferSize = 1500
+ c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+ defer c.cleanup()
+
+ // Prepare to send a packet.
+ r := stack.Route{
+ RemoteLinkAddress: remoteLinkAddr,
+ }
+
+ buf := buffer.NewView(100)
+
+	// Each packet uses up one buffer, so write as many as possible
+ // until there is only one buffer left.
+ for i := queueDataSize/bufferSize - 1; i > 0; i-- {
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != nil {
+ t.Fatalf("WritePacket failed unexpectedly: %v", err)
+ }
+
+ // Pull the posted buffer.
+ c.txq.tx.Pull()
+ c.txq.tx.Flush()
+ }
+
+ // Attempt to write a two-buffer packet. It must fail.
+ {
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ uu := buffer.NewView(bufferSize).ToVectorisedView()
+ if want, err := tcpip.ErrWouldBlock, c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: uu,
+ }); err != want {
+ t.Fatalf("WritePacket return unexpected result: got %v, want %v", err, want)
+ }
+ }
+
+ // Attempt to write the one-buffer packet again. It must succeed.
+ {
+ hdr := buffer.NewPrependable(int(c.ep.MaxHeaderLength()))
+ if err := c.ep.WritePacket(&r, nil /* gso */, header.IPv4ProtocolNumber, &stack.PacketBuffer{
+ Header: hdr,
+ Data: buf.ToVectorisedView(),
+ }); err != nil {
+ t.Fatalf("WritePacket failed unexpectedly: %v", err)
+ }
+ }
+}
+
+func pollPull(t *testing.T, p *pipe.Rx, to <-chan time.Time, errStr string) []byte {
+ t.Helper()
+
+ for {
+ b := p.Pull()
+ if b != nil {
+ return b
+ }
+
+ select {
+ case <-time.After(10 * time.Millisecond):
+ case <-to:
+ t.Fatal(errStr)
+ }
+ }
+}
+
+// TestSimpleReceive completes 1000 different receives with random payload and
+// random number of buffers. It checks that the contents match the expected
+// values.
+func TestSimpleReceive(t *testing.T) {
+ const bufferSize = 1500
+ c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+ defer c.cleanup()
+
+ // Check that buffers have been posted.
+ limit := c.ep.rx.q.PostedBuffersLimit()
+ for i := uint64(0); i < limit; i++ {
+ timeout := time.After(2 * time.Second)
+ bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for all buffers to be posted"))
+
+ if want := i * bufferSize; want != bi.Offset {
+ t.Fatalf("Bad posted offset: got %v, want %v", bi.Offset, want)
+ }
+
+ if want := i; want != bi.ID {
+ t.Fatalf("Bad posted ID: got %v, want %v", bi.ID, want)
+ }
+
+ if bufferSize != bi.Size {
+ t.Fatalf("Bad posted bufferSize: got %v, want %v", bi.Size, bufferSize)
+ }
+ }
+ c.rxq.tx.Flush()
+
+ // Create a slice with the indices 0..limit-1.
+ idx := make([]int, limit)
+ for i := range idx {
+ idx[i] = i
+ }
+
+ // Complete random packets 1000 times.
+ for iters := 1000; iters > 0; iters-- {
+ timeout := time.After(2 * time.Second)
+ // Prepare a random packet.
+ shuffle(idx)
+ n := 1 + rand.Intn(10)
+ bufs := make([]queue.RxBuffer, n)
+ contents := make([]byte, bufferSize*n-rand.Intn(500))
+ randomFill(contents)
+ for i := range bufs {
+ j := idx[i]
+ bufs[i].Size = bufferSize
+ bufs[i].Offset = uint64(bufferSize * j)
+ bufs[i].ID = uint64(j)
+
+ copy(c.rxq.data[bufs[i].Offset:][:bufferSize], contents[i*bufferSize:])
+ }
+
+ // Push completion.
+ c.pushRxCompletion(uint32(len(contents)), bufs)
+ c.rxq.rx.Flush()
+ syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Wait for packet to be received, then check it.
+ c.waitForPackets(1, time.After(5*time.Second), "Timeout waiting for packet")
+ c.mu.Lock()
+ rcvd := []byte(c.packets[0].vv.ToView())
+ c.packets = c.packets[:0]
+ c.mu.Unlock()
+
+ if contents := contents[header.EthernetMinimumSize:]; !bytes.Equal(contents, rcvd) {
+ t.Fatalf("Unexpected buffer contents: got %x, want %x", rcvd, contents)
+ }
+
+ // Check that buffers have been reposted.
+ for i := range bufs {
+ bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for buffers to be reposted"))
+ if bi != bufs[i] {
+ t.Fatalf("Unexpected buffer reposted: got %x, want %x", bi, bufs[i])
+ }
+ }
+ c.rxq.tx.Flush()
+ }
+}
+
+// TestRxBuffersReposted tests that rx buffers get reposted after they have been
+// completed.
+func TestRxBuffersReposted(t *testing.T) {
+ const bufferSize = 1500
+ c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+ defer c.cleanup()
+
+ // Receive all posted buffers.
+ limit := c.ep.rx.q.PostedBuffersLimit()
+ buffers := make([]queue.RxBuffer, 0, limit)
+ for i := limit; i > 0; i-- {
+ timeout := time.After(2 * time.Second)
+ buffers = append(buffers, queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for all buffers")))
+ }
+ c.rxq.tx.Flush()
+
+ // Check that all buffers are reposted when individually completed.
+ for i := range buffers {
+ timeout := time.After(2 * time.Second)
+ // Complete the buffer.
+ c.pushRxCompletion(buffers[i].Size, buffers[i:][:1])
+ c.rxq.rx.Flush()
+ syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Wait for it to be reposted.
+ bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for buffer to be reposted"))
+ if bi != buffers[i] {
+ t.Fatalf("Different buffer posted: got %v, want %v", bi, buffers[i])
+ }
+ }
+ c.rxq.tx.Flush()
+
+ // Check that all buffers are reposted when completed in pairs.
+ for i := 0; i < len(buffers)/2; i++ {
+ timeout := time.After(2 * time.Second)
+ // Complete with two buffers.
+ c.pushRxCompletion(2*bufferSize, buffers[2*i:][:2])
+ c.rxq.rx.Flush()
+ syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Wait for them to be reposted.
+ for j := 0; j < 2; j++ {
+ bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, timeout, "Timeout waiting for buffer to be reposted"))
+ if bi != buffers[2*i+j] {
+ t.Fatalf("Different buffer posted: got %v, want %v", bi, buffers[2*i+j])
+ }
+ }
+ }
+ c.rxq.tx.Flush()
+}
+
+// TestReceivePostingIsFull checks that the endpoint will properly handle the
+// case when a received buffer cannot be immediately reposted because it hasn't
+// been pulled from the tx pipe yet.
+func TestReceivePostingIsFull(t *testing.T) {
+ const bufferSize = 1500
+ c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+ defer c.cleanup()
+
+ // Complete first posted buffer before flushing it from the tx pipe.
+ first := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for first buffer to be posted"))
+ c.pushRxCompletion(first.Size, []queue.RxBuffer{first})
+ c.rxq.rx.Flush()
+ syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Check that packet is received.
+ c.waitForPackets(1, time.After(time.Second), "Timeout waiting for completed packet")
+
+ // Complete another buffer.
+ second := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for second buffer to be posted"))
+ c.pushRxCompletion(second.Size, []queue.RxBuffer{second})
+ c.rxq.rx.Flush()
+ syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Check that no packet is received yet, as the worker is blocked trying
+ // to repost.
+ select {
+ case <-time.After(500 * time.Millisecond):
+ case <-c.packetCh:
+ t.Fatalf("Unexpected packet received")
+ }
+
+ // Flush tx queue, which will allow the first buffer to be reposted,
+ // and the second completion to be pulled.
+ c.rxq.tx.Flush()
+ syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Check that second packet completes.
+ c.waitForPackets(1, time.After(time.Second), "Timeout waiting for second completed packet")
+}
+
+// TestCloseWhileWaitingToPost closes the endpoint while it is waiting to
+// repost a buffer. Make sure it backs out.
+func TestCloseWhileWaitingToPost(t *testing.T) {
+ const bufferSize = 1500
+ c := newTestContext(t, 20000, bufferSize, localLinkAddr)
+ cleaned := false
+ defer func() {
+ if !cleaned {
+ c.cleanup()
+ }
+ }()
+
+ // Complete first posted buffer before flushing it from the tx pipe.
+ bi := queue.DecodeRxBufferHeader(pollPull(t, &c.rxq.tx, time.After(time.Second), "Timeout waiting for initial buffer to be posted"))
+ c.pushRxCompletion(bi.Size, []queue.RxBuffer{bi})
+ c.rxq.rx.Flush()
+ syscall.Write(c.rxCfg.EventFD, []byte{1, 0, 0, 0, 0, 0, 0, 0})
+
+ // Wait for packet to be indicated.
+ c.waitForPackets(1, time.After(time.Second), "Timeout waiting for completed packet")
+
+ // Cleanup and wait for worker to complete.
+ c.cleanup()
+ cleaned = true
+ c.ep.Wait()
+}
diff --git a/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go
new file mode 100644
index 000000000..f7e816a41
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/sharedmem_unsafe.go
@@ -0,0 +1,25 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sharedmem
+
+import (
+ "unsafe"
+)
+
+// sharedDataPointer converts the shared data slice into a pointer so that it
+// can be used in atomic operations.
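+// The shared data region comes from mmap(2) and is therefore page-aligned,
+// which satisfies the alignment required for 32-bit atomic access.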
+func sharedDataPointer(sharedData []byte) *uint32 {
+ return (*uint32)(unsafe.Pointer(&sharedData[0:4][0]))
+}
diff --git a/pkg/tcpip/link/sharedmem/tx.go b/pkg/tcpip/link/sharedmem/tx.go
new file mode 100644
index 000000000..6b8d7859d
--- /dev/null
+++ b/pkg/tcpip/link/sharedmem/tx.go
@@ -0,0 +1,272 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sharedmem
+
+import (
+ "math"
+ "syscall"
+
+ "gvisor.dev/gvisor/pkg/tcpip/link/sharedmem/queue"
+)
+
+const (
+ nilID = math.MaxUint64
+)
+
+// tx holds all state associated with a tx queue.
+type tx struct {
+ data []byte
+ q queue.Tx
+ ids idManager
+ bufs bufferManager
+}
+
+// init initializes all state needed by the tx queue based on the information
+// provided.
+//
+// The caller always retains ownership of all file descriptors passed in. The
+// queue implementation will duplicate any that it may need in the future.
+func (t *tx) init(bufferSize uint32, c *QueueConfig) error {
+ // Map in all buffers.
+ txPipe, err := getBuffer(c.TxPipeFD)
+ if err != nil {
+ return err
+ }
+
+ rxPipe, err := getBuffer(c.RxPipeFD)
+ if err != nil {
+ syscall.Munmap(txPipe)
+ return err
+ }
+
+ data, err := getBuffer(c.DataFD)
+ if err != nil {
+ syscall.Munmap(txPipe)
+ syscall.Munmap(rxPipe)
+ return err
+ }
+
+ // Initialize state based on buffers.
+ t.q.Init(txPipe, rxPipe)
+ t.ids.init()
+	t.bufs.init(0, len(data), int(bufferSize))
+ t.data = data
+
+ return nil
+}
+
+// cleanup releases all resources allocated during init(). It must only be
+// called if init() has previously succeeded.
+func (t *tx) cleanup() {
+ a, b := t.q.Bytes()
+ syscall.Munmap(a)
+ syscall.Munmap(b)
+ syscall.Munmap(t.data)
+}
+
+// transmit sends a packet made up of up to two buffers. Returns a boolean that
+// specifies whether the packet was successfully transmitted.
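+//
+// In this endpoint, a typically holds the prepended headers and b the payload
+// (see WritePacket); either slice may be empty.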
+func (t *tx) transmit(a, b []byte) bool {
+ // Pull completions from the tx queue and add their buffers back to the
+ // pool so that we can reuse them.
+ for {
+ id, ok := t.q.CompletedPacket()
+ if !ok {
+ break
+ }
+
+ if buf := t.ids.remove(id); buf != nil {
+ t.bufs.free(buf)
+ }
+ }
+
+ bSize := t.bufs.entrySize
+ total := uint32(len(a) + len(b))
+ bufCount := (total + bSize - 1) / bSize
+
+ // Allocate enough buffers to hold all the data.
+ var buf *queue.TxBuffer
+ for i := bufCount; i != 0; i-- {
+ b := t.bufs.alloc()
+ if b == nil {
+ // Failed to get all buffers. Return to the pool
+ // whatever we had managed to get.
+ if buf != nil {
+ t.bufs.free(buf)
+ }
+ return false
+ }
+ b.Next = buf
+ buf = b
+ }
+
+ // Copy data into allocated buffers.
+ nBuf := buf
+ var dBuf []byte
+ for _, data := range [][]byte{a, b} {
+ for len(data) > 0 {
+ if len(dBuf) == 0 {
+ dBuf = t.data[nBuf.Offset:][:nBuf.Size]
+ nBuf = nBuf.Next
+ }
+ n := copy(dBuf, data)
+ data = data[n:]
+ dBuf = dBuf[n:]
+ }
+ }
+
+ // Get an id for this packet and send it out.
+ id := t.ids.add(buf)
+ if !t.q.Enqueue(id, total, bufCount, buf) {
+ t.ids.remove(id)
+ t.bufs.free(buf)
+ return false
+ }
+
+ return true
+}
+
+// getBuffer returns a memory region mapped to the full contents of the given
+// file descriptor.
+func getBuffer(fd int) ([]byte, error) {
+ var s syscall.Stat_t
+ if err := syscall.Fstat(fd, &s); err != nil {
+ return nil, err
+ }
+
+ // Check that size doesn't overflow an int.
+ if s.Size > int64(^uint(0)>>1) {
+ return nil, syscall.EDOM
+ }
+
+ return syscall.Mmap(fd, 0, int(s.Size), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED|syscall.MAP_FILE)
+}
+
+// idDescriptor is used by idManager to either point to a tx buffer (in case
+// the ID is assigned) or to the next free element (if the id is not assigned).
+type idDescriptor struct {
+ buf *queue.TxBuffer
+ nextFree uint64
+}
+
+// idManager is a manager of tx buffer identifiers. It assigns unique IDs to
+// tx buffers that are added to it; the IDs can only be reused after they have
+// been removed.
+//
+// The ID assignments are stored so that the tx buffers can be retrieved from
+// the IDs previously assigned to them.
+type idManager struct {
+ // ids is a slice containing all tx buffers. The ID is the index into
+ // this slice.
+ ids []idDescriptor
+
+	// freeList is the index of the first free ID, or nilID if none are
+	// free.
+ freeList uint64
+}
+
+// init initializes the id manager.
+func (m *idManager) init() {
+ m.freeList = nilID
+}
+
+// add assigns an ID to the given tx buffer.
+func (m *idManager) add(b *queue.TxBuffer) uint64 {
+ if i := m.freeList; i != nilID {
+ // There is an id available in the free list, just use it.
+ m.ids[i].buf = b
+ m.freeList = m.ids[i].nextFree
+ return i
+ }
+
+ // We need to expand the id descriptor.
+ m.ids = append(m.ids, idDescriptor{buf: b})
+ return uint64(len(m.ids) - 1)
+}
+
+// remove retrieves the tx buffer associated with the given ID, and removes the
+// ID from the assigned table so that it can be reused in the future.
+func (m *idManager) remove(i uint64) *queue.TxBuffer {
+ if i >= uint64(len(m.ids)) {
+ return nil
+ }
+
+ desc := &m.ids[i]
+ b := desc.buf
+ if b == nil {
+ // The provided id is not currently assigned.
+ return nil
+ }
+
+ desc.buf = nil
+ desc.nextFree = m.freeList
+ m.freeList = i
+
+ return b
+}
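+
+// Illustrative use (a sketch): IDs are indices into ids, so both operations
+// are O(1):
+//
+//	id := m.add(buf)   // reuses a freed index if available, else appends
+//	b := m.remove(id)  // returns buf and pushes id onto the free list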
+
+// bufferManager manages a buffer region broken up into smaller, equally sized
+// buffers. Smaller buffers can be allocated and freed.
+type bufferManager struct {
+ freeList *queue.TxBuffer
+ curOffset uint64
+ limit uint64
+ entrySize uint32
+}
+
+// init initializes the buffer manager.
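+// The limit is rounded down to a whole number of entrySize buffers, so any
+// trailing remainder of the region is never handed out.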
+func (b *bufferManager) init(initialOffset, size, entrySize int) {
+ b.freeList = nil
+ b.curOffset = uint64(initialOffset)
+ b.limit = uint64(initialOffset + size/entrySize*entrySize)
+ b.entrySize = uint32(entrySize)
+}
+
+// alloc allocates a buffer from the manager, if one is available.
+func (b *bufferManager) alloc() *queue.TxBuffer {
+ if b.freeList != nil {
+ // There is a descriptor ready for reuse in the free list.
+ d := b.freeList
+ b.freeList = d.Next
+ d.Next = nil
+ return d
+ }
+
+ if b.curOffset < b.limit {
+ // There is room available in the never-used range, so create
+ // a new descriptor for it.
+ d := &queue.TxBuffer{
+ Offset: b.curOffset,
+ Size: b.entrySize,
+ }
+ b.curOffset += uint64(b.entrySize)
+ return d
+ }
+
+ return nil
+}
+
+// free returns all buffers in the list to the buffer manager so that they can
+// be reused.
+func (b *bufferManager) free(d *queue.TxBuffer) {
+ // Find the last buffer in the list.
+ last := d
+ for last.Next != nil {
+ last = last.Next
+ }
+
+ // Push list onto free list.
+ last.Next = b.freeList
+ b.freeList = d
+}
diff --git a/pkg/tcpip/link/sniffer/BUILD b/pkg/tcpip/link/sniffer/BUILD
new file mode 100644
index 000000000..7cbc305e7
--- /dev/null
+++ b/pkg/tcpip/link/sniffer/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "sniffer",
+ srcs = [
+ "pcap.go",
+ "sniffer.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/log",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/nested",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/sniffer/pcap.go b/pkg/tcpip/link/sniffer/pcap.go
new file mode 100644
index 000000000..c16c19647
--- /dev/null
+++ b/pkg/tcpip/link/sniffer/pcap.go
@@ -0,0 +1,66 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sniffer
+
+import "time"
+
+type pcapHeader struct {
+ // MagicNumber is the file magic number.
+ MagicNumber uint32
+
+ // VersionMajor is the major version number.
+ VersionMajor uint16
+
+ // VersionMinor is the minor version number.
+ VersionMinor uint16
+
+ // Thiszone is the GMT to local correction.
+ Thiszone int32
+
+ // Sigfigs is the accuracy of timestamps.
+ Sigfigs uint32
+
+ // Snaplen is the max length of captured packets, in octets.
+ Snaplen uint32
+
+ // Network is the data link type.
+ Network uint32
+}
+
+const pcapPacketHeaderLen = 16
+
+type pcapPacketHeader struct {
+ // Seconds is the timestamp seconds.
+ Seconds uint32
+
+ // Microseconds is the timestamp microseconds.
+ Microseconds uint32
+
+ // IncludedLength is the number of octets of packet saved in file.
+ IncludedLength uint32
+
+ // OriginalLength is the actual length of packet.
+ OriginalLength uint32
+}
+
+func newPCAPPacketHeader(incLen, orgLen uint32) pcapPacketHeader {
+ now := time.Now()
+ return pcapPacketHeader{
+ Seconds: uint32(now.Unix()),
+ Microseconds: uint32(now.Nanosecond() / 1000),
+ IncludedLength: incLen,
+ OriginalLength: orgLen,
+ }
+}
diff --git a/pkg/tcpip/link/sniffer/sniffer.go b/pkg/tcpip/link/sniffer/sniffer.go
new file mode 100644
index 000000000..d9cd4e83a
--- /dev/null
+++ b/pkg/tcpip/link/sniffer/sniffer.go
@@ -0,0 +1,394 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sniffer provides the implementation of data-link layer endpoints that
+// wrap another endpoint and log inbound and outbound packets.
+//
+// Sniffer endpoints can be used in the networking stack by calling New(lower)
+// to create a new endpoint, where lower is the endpoint being wrapped, and
+// then passing it as an argument to Stack.CreateNIC().
+package sniffer
+
+import (
+ "encoding/binary"
+ "fmt"
+ "io"
+ "sync/atomic"
+ "time"
+
+ "gvisor.dev/gvisor/pkg/log"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/nested"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// LogPackets is a flag used to enable or disable packet logging via the log
+// package. Valid values are 0 or 1.
+//
+// LogPackets must be accessed atomically.
+var LogPackets uint32 = 1
+
+// LogPacketsToPCAP is a flag used to enable or disable logging packets to a
+// pcap writer. Valid values are 0 or 1. A writer must have been specified when the
+// sniffer was created for this flag to have effect.
+//
+// LogPacketsToPCAP must be accessed atomically.
+var LogPacketsToPCAP uint32 = 1
+
+type endpoint struct {
+ nested.Endpoint
+ writer io.Writer
+ maxPCAPLen uint32
+}
+
+var _ stack.GSOEndpoint = (*endpoint)(nil)
+var _ stack.LinkEndpoint = (*endpoint)(nil)
+var _ stack.NetworkDispatcher = (*endpoint)(nil)
+
+// New creates a new sniffer link-layer endpoint. It wraps around another
+// endpoint and logs packets as they traverse the endpoint.
+func New(lower stack.LinkEndpoint) stack.LinkEndpoint {
+ sniffer := &endpoint{}
+ sniffer.Endpoint.Init(lower, sniffer)
+ return sniffer
+}
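+
+// Example use (a sketch; stk and lowerEndpoint are placeholders): wrapping a
+// lower link endpoint so that all traffic through it is logged:
+//
+//	ep := sniffer.New(lowerEndpoint)
+//	if err := stk.CreateNIC(1, ep); err != nil {
+//		// handle error
+//	}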
+
+func zoneOffset() (int32, error) {
+ loc, err := time.LoadLocation("Local")
+ if err != nil {
+ return 0, err
+ }
+ date := time.Date(0, 0, 0, 0, 0, 0, 0, loc)
+ _, offset := date.Zone()
+ return int32(offset), nil
+}
+
+func writePCAPHeader(w io.Writer, maxLen uint32) error {
+ offset, err := zoneOffset()
+ if err != nil {
+ return err
+ }
+ return binary.Write(w, binary.BigEndian, pcapHeader{
+ // From https://wiki.wireshark.org/Development/LibpcapFileFormat
+ MagicNumber: 0xa1b2c3d4,
+
+ VersionMajor: 2,
+ VersionMinor: 4,
+ Thiszone: offset,
+ Sigfigs: 0,
+ Snaplen: maxLen,
+ Network: 101, // LINKTYPE_RAW
+ })
+}
+
+// NewWithWriter creates a new sniffer link-layer endpoint. It wraps around
+// another endpoint and logs packets as they traverse the endpoint.
+//
+// Packets are logged to writer in the pcap format. A sniffer created with this
+// function will not emit packets using the standard log package.
+//
+// snapLen is the maximum amount of a packet to be saved. Packets with a length
+// less than or equal to snapLen will be saved in their entirety. Longer
+// packets will be truncated to snapLen.
+func NewWithWriter(lower stack.LinkEndpoint, writer io.Writer, snapLen uint32) (stack.LinkEndpoint, error) {
+ if err := writePCAPHeader(writer, snapLen); err != nil {
+ return nil, err
+ }
+ sniffer := &endpoint{
+ writer: writer,
+ maxPCAPLen: snapLen,
+ }
+ sniffer.Endpoint.Init(lower, sniffer)
+ return sniffer, nil
+}
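+
+// Example use (a sketch, error handling elided): capturing to a pcap file
+// readable by tools such as tcpdump or Wireshark:
+//
+//	f, _ := os.Create("capture.pcap")
+//	ep, err := sniffer.NewWithWriter(lower, f, 65535)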
+
+// DeliverNetworkPacket implements the stack.NetworkDispatcher interface. It is
+// called by the link-layer endpoint being wrapped when a packet arrives, and
+// logs the packet before forwarding to the actual dispatcher.
+func (e *endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ e.dumpPacket("recv", nil, protocol, pkt)
+ e.Endpoint.DeliverNetworkPacket(remote, local, protocol, pkt)
+}
+
+func (e *endpoint) dumpPacket(prefix string, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ writer := e.writer
+ if writer == nil && atomic.LoadUint32(&LogPackets) == 1 {
+ logPacket(prefix, protocol, pkt, gso)
+ }
+ if writer != nil && atomic.LoadUint32(&LogPacketsToPCAP) == 1 {
+ totalLength := pkt.Header.UsedLength() + pkt.Data.Size()
+ length := totalLength
+ if max := int(e.maxPCAPLen); length > max {
+ length = max
+ }
+ if err := binary.Write(writer, binary.BigEndian, newPCAPPacketHeader(uint32(length), uint32(totalLength))); err != nil {
+ panic(err)
+ }
+ write := func(b []byte) {
+ if len(b) > length {
+ b = b[:length]
+ }
+ for len(b) != 0 {
+ n, err := writer.Write(b)
+ if err != nil {
+ panic(err)
+ }
+ b = b[n:]
+ length -= n
+ }
+ }
+ write(pkt.Header.View())
+ for _, view := range pkt.Data.Views() {
+ if length == 0 {
+ break
+ }
+ write(view)
+ }
+ }
+}
+
+// WritePacket implements the stack.LinkEndpoint interface. It is called by
+// higher-level protocols to write packets; it just logs the packet and
+// forwards the request to the lower endpoint.
+func (e *endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ e.dumpPacket("send", gso, protocol, pkt)
+ return e.Endpoint.WritePacket(r, gso, protocol, pkt)
+}
+
+// WritePackets implements the stack.LinkEndpoint interface. It is called by
+// higher-level protocols to write packets; it just logs each packet and
+// forwards the request to the lower endpoint.
+func (e *endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ for pkt := pkts.Front(); pkt != nil; pkt = pkt.Next() {
+ e.dumpPacket("send", gso, protocol, pkt)
+ }
+ return e.Endpoint.WritePackets(r, gso, pkts, protocol)
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ e.dumpPacket("send", nil, 0, &stack.PacketBuffer{
+ Data: vv,
+ })
+ return e.Endpoint.WriteRawPacket(vv)
+}
+
+func logPacket(prefix string, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer, gso *stack.GSO) {
+ // Figure out the network layer info.
+ var transProto uint8
+ src := tcpip.Address("unknown")
+ dst := tcpip.Address("unknown")
+ id := 0
+ size := uint16(0)
+ var fragmentOffset uint16
+ var moreFragments bool
+
+ // Create a clone of pkt, including any headers if present. Avoid allocating
+ // backing memory for the clone.
+ views := [8]buffer.View{}
+ vv := buffer.NewVectorisedView(0, views[:0])
+ vv.AppendView(pkt.Header.View())
+ vv.Append(pkt.Data)
+
+ switch protocol {
+ case header.IPv4ProtocolNumber:
+ hdr, ok := vv.PullUp(header.IPv4MinimumSize)
+ if !ok {
+ return
+ }
+ ipv4 := header.IPv4(hdr)
+ fragmentOffset = ipv4.FragmentOffset()
+ moreFragments = ipv4.Flags()&header.IPv4FlagMoreFragments == header.IPv4FlagMoreFragments
+ src = ipv4.SourceAddress()
+ dst = ipv4.DestinationAddress()
+ transProto = ipv4.Protocol()
+ size = ipv4.TotalLength() - uint16(ipv4.HeaderLength())
+ vv.TrimFront(int(ipv4.HeaderLength()))
+ id = int(ipv4.ID())
+
+ case header.IPv6ProtocolNumber:
+ hdr, ok := vv.PullUp(header.IPv6MinimumSize)
+ if !ok {
+ return
+ }
+ ipv6 := header.IPv6(hdr)
+ src = ipv6.SourceAddress()
+ dst = ipv6.DestinationAddress()
+ transProto = ipv6.NextHeader()
+ size = ipv6.PayloadLength()
+ vv.TrimFront(header.IPv6MinimumSize)
+
+ case header.ARPProtocolNumber:
+ hdr, ok := vv.PullUp(header.ARPSize)
+ if !ok {
+ return
+ }
+ vv.TrimFront(header.ARPSize)
+ arp := header.ARP(hdr)
+ log.Infof(
+ "%s arp %s (%s) -> %s (%s) valid:%t",
+ prefix,
+ tcpip.Address(arp.ProtocolAddressSender()), tcpip.LinkAddress(arp.HardwareAddressSender()),
+ tcpip.Address(arp.ProtocolAddressTarget()), tcpip.LinkAddress(arp.HardwareAddressTarget()),
+ arp.IsValid(),
+ )
+ return
+ default:
+ log.Infof("%s unknown network protocol", prefix)
+ return
+ }
+
+ // Figure out the transport layer info.
+ transName := "unknown"
+ srcPort := uint16(0)
+ dstPort := uint16(0)
+ details := ""
+ switch tcpip.TransportProtocolNumber(transProto) {
+ case header.ICMPv4ProtocolNumber:
+ transName = "icmp"
+ hdr, ok := vv.PullUp(header.ICMPv4MinimumSize)
+ if !ok {
+ break
+ }
+ icmp := header.ICMPv4(hdr)
+ icmpType := "unknown"
+ if fragmentOffset == 0 {
+ switch icmp.Type() {
+ case header.ICMPv4EchoReply:
+ icmpType = "echo reply"
+ case header.ICMPv4DstUnreachable:
+ icmpType = "destination unreachable"
+ case header.ICMPv4SrcQuench:
+ icmpType = "source quench"
+ case header.ICMPv4Redirect:
+ icmpType = "redirect"
+ case header.ICMPv4Echo:
+ icmpType = "echo"
+ case header.ICMPv4TimeExceeded:
+ icmpType = "time exceeded"
+ case header.ICMPv4ParamProblem:
+ icmpType = "param problem"
+ case header.ICMPv4Timestamp:
+ icmpType = "timestamp"
+ case header.ICMPv4TimestampReply:
+ icmpType = "timestamp reply"
+ case header.ICMPv4InfoRequest:
+ icmpType = "info request"
+ case header.ICMPv4InfoReply:
+ icmpType = "info reply"
+ }
+ }
+ log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+ return
+
+ case header.ICMPv6ProtocolNumber:
+ transName = "icmp"
+ hdr, ok := vv.PullUp(header.ICMPv6MinimumSize)
+ if !ok {
+ break
+ }
+ icmp := header.ICMPv6(hdr)
+ icmpType := "unknown"
+ switch icmp.Type() {
+ case header.ICMPv6DstUnreachable:
+ icmpType = "destination unreachable"
+ case header.ICMPv6PacketTooBig:
+ icmpType = "packet too big"
+ case header.ICMPv6TimeExceeded:
+ icmpType = "time exceeded"
+ case header.ICMPv6ParamProblem:
+ icmpType = "param problem"
+ case header.ICMPv6EchoRequest:
+ icmpType = "echo request"
+ case header.ICMPv6EchoReply:
+ icmpType = "echo reply"
+ case header.ICMPv6RouterSolicit:
+ icmpType = "router solicit"
+ case header.ICMPv6RouterAdvert:
+ icmpType = "router advert"
+ case header.ICMPv6NeighborSolicit:
+ icmpType = "neighbor solicit"
+ case header.ICMPv6NeighborAdvert:
+ icmpType = "neighbor advert"
+ case header.ICMPv6RedirectMsg:
+ icmpType = "redirect message"
+ }
+ log.Infof("%s %s %s -> %s %s len:%d id:%04x code:%d", prefix, transName, src, dst, icmpType, size, id, icmp.Code())
+ return
+
+ case header.UDPProtocolNumber:
+ transName = "udp"
+ hdr, ok := vv.PullUp(header.UDPMinimumSize)
+ if !ok {
+ break
+ }
+ udp := header.UDP(hdr)
+ if fragmentOffset == 0 {
+ srcPort = udp.SourcePort()
+ dstPort = udp.DestinationPort()
+ details = fmt.Sprintf("xsum: 0x%x", udp.Checksum())
+ size -= header.UDPMinimumSize
+ }
+
+ case header.TCPProtocolNumber:
+ transName = "tcp"
+ hdr, ok := vv.PullUp(header.TCPMinimumSize)
+ if !ok {
+ break
+ }
+ tcp := header.TCP(hdr)
+ if fragmentOffset == 0 {
+ offset := int(tcp.DataOffset())
+ if offset < header.TCPMinimumSize {
+ details += fmt.Sprintf("invalid packet: tcp data offset too small %d", offset)
+ break
+ }
+ if offset > vv.Size() && !moreFragments {
+ details += fmt.Sprintf("invalid packet: tcp data offset %d larger than packet buffer length %d", offset, vv.Size())
+ break
+ }
+
+ srcPort = tcp.SourcePort()
+ dstPort = tcp.DestinationPort()
+ size -= uint16(offset)
+
+ // Initialize the TCP flags.
+ flags := tcp.Flags()
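+			// One letter per flag, LSB first: FIN, SYN, RST,
+			// PSH, ACK, URG. Bits that are clear are blanked
+			// out below.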
+ flagsStr := []byte("FSRPAU")
+ for i := range flagsStr {
+ if flags&(1<<uint(i)) == 0 {
+ flagsStr[i] = ' '
+ }
+ }
+ details = fmt.Sprintf("flags:0x%02x (%s) seqnum: %d ack: %d win: %d xsum:0x%x", flags, string(flagsStr), tcp.SequenceNumber(), tcp.AckNumber(), tcp.WindowSize(), tcp.Checksum())
+ if flags&header.TCPFlagSyn != 0 {
+ details += fmt.Sprintf(" options: %+v", header.ParseSynOptions(tcp.Options(), flags&header.TCPFlagAck != 0))
+ } else {
+ details += fmt.Sprintf(" options: %+v", tcp.ParsedOptions())
+ }
+ }
+
+ default:
+ log.Infof("%s %s -> %s unknown transport protocol: %d", prefix, src, dst, transProto)
+ return
+ }
+
+ if gso != nil {
+ details += fmt.Sprintf(" gso: %+v", gso)
+ }
+
+ log.Infof("%s %s %s:%d -> %s:%d len:%d id:%04x %s", prefix, transName, src, srcPort, dst, dstPort, size, id, details)
+}
diff --git a/pkg/tcpip/link/tun/BUILD b/pkg/tcpip/link/tun/BUILD
new file mode 100644
index 000000000..e0db6cf54
--- /dev/null
+++ b/pkg/tcpip/link/tun/BUILD
@@ -0,0 +1,25 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "tun",
+ srcs = [
+ "device.go",
+ "protocol.go",
+ "tun_unsafe.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/abi/linux",
+ "//pkg/refs",
+ "//pkg/sync",
+ "//pkg/syserror",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/header",
+ "//pkg/tcpip/link/channel",
+ "//pkg/tcpip/stack",
+ "//pkg/waiter",
+ ],
+)
diff --git a/pkg/tcpip/link/tun/device.go b/pkg/tcpip/link/tun/device.go
new file mode 100644
index 000000000..6bc9033d0
--- /dev/null
+++ b/pkg/tcpip/link/tun/device.go
@@ -0,0 +1,358 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+ "fmt"
+
+ "gvisor.dev/gvisor/pkg/abi/linux"
+ "gvisor.dev/gvisor/pkg/refs"
+ "gvisor.dev/gvisor/pkg/sync"
+ "gvisor.dev/gvisor/pkg/syserror"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/header"
+ "gvisor.dev/gvisor/pkg/tcpip/link/channel"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+ "gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+ // drivers/net/tun.c:tun_net_init()
+ defaultDevMtu = 1500
+
+	// Queue length for outbound packets arriving at the fd side for read.
+	// Overflow causes packet drops. gVisor implementation-specific.
+ defaultDevOutQueueLen = 1024
+)
+
+var zeroMAC [6]byte
+
+// Device is an opened /dev/net/tun device.
+//
+// +stateify savable
+type Device struct {
+ waiter.Queue
+
+ mu sync.RWMutex `state:"nosave"`
+ endpoint *tunEndpoint
+ notifyHandle *channel.NotificationHandle
+ flags uint16
+}
+
+// beforeSave is invoked by stateify.
+func (d *Device) beforeSave() {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+ // TODO(b/110961832): Restore the device to stack. At this moment, the stack
+ // is not savable.
+ if d.endpoint != nil {
+ panic("/dev/net/tun does not support save/restore when a device is associated with it.")
+ }
+}
+
+// Release implements fs.FileOperations.Release.
+func (d *Device) Release() {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ // Decrease refcount if there is an endpoint associated with this file.
+ if d.endpoint != nil {
+ d.endpoint.RemoveNotify(d.notifyHandle)
+ d.endpoint.DecRef()
+ d.endpoint = nil
+ }
+}
+
+// SetIff services TUNSETIFF ioctl(2) request.
+func (d *Device) SetIff(s *stack.Stack, name string, flags uint16) error {
+ d.mu.Lock()
+ defer d.mu.Unlock()
+
+ if d.endpoint != nil {
+ return syserror.EINVAL
+ }
+
+ // Input validation.
+ isTun := flags&linux.IFF_TUN != 0
+ isTap := flags&linux.IFF_TAP != 0
+ supportedFlags := uint16(linux.IFF_TUN | linux.IFF_TAP | linux.IFF_NO_PI)
+ if isTap && isTun || !isTap && !isTun || flags&^supportedFlags != 0 {
+ return syserror.EINVAL
+ }
+
+ prefix := "tun"
+ if isTap {
+ prefix = "tap"
+ }
+
+ linkCaps := stack.CapabilityNone
+ if isTap {
+ linkCaps |= stack.CapabilityResolutionRequired
+ }
+
+ endpoint, err := attachOrCreateNIC(s, name, prefix, linkCaps)
+ if err != nil {
+ return syserror.EINVAL
+ }
+
+ d.endpoint = endpoint
+ d.notifyHandle = d.endpoint.AddNotify(d)
+ d.flags = flags
+ return nil
+}
+
+func attachOrCreateNIC(s *stack.Stack, name, prefix string, linkCaps stack.LinkEndpointCapabilities) (*tunEndpoint, error) {
+ for {
+ // 1. Try to attach to an existing NIC.
+ if name != "" {
+ if nic, found := s.GetNICByName(name); found {
+ endpoint, ok := nic.LinkEndpoint().(*tunEndpoint)
+ if !ok {
+ // Not a NIC created by tun device.
+ return nil, syserror.EOPNOTSUPP
+ }
+ if !endpoint.TryIncRef() {
+ // Race detected: NIC got deleted in between.
+ continue
+ }
+ return endpoint, nil
+ }
+ }
+
+ // 2. Create a new NIC.
+ id := tcpip.NICID(s.UniqueID())
+ endpoint := &tunEndpoint{
+ Endpoint: channel.New(defaultDevOutQueueLen, defaultDevMtu, ""),
+ stack: s,
+ nicID: id,
+ name: name,
+ }
+ endpoint.Endpoint.LinkEPCapabilities = linkCaps
+ if endpoint.name == "" {
+ endpoint.name = fmt.Sprintf("%s%d", prefix, id)
+ }
+ err := s.CreateNICWithOptions(endpoint.nicID, endpoint, stack.NICOptions{
+ Name: endpoint.name,
+ })
+ switch err {
+ case nil:
+ return endpoint, nil
+ case tcpip.ErrDuplicateNICID:
+ // Race detected: A NIC has been created in between.
+ continue
+ default:
+ return nil, syserror.EINVAL
+ }
+ }
+}
+
+// Write injects one inbound packet into the network interface.
+func (d *Device) Write(data []byte) (int64, error) {
+ d.mu.RLock()
+ endpoint := d.endpoint
+ d.mu.RUnlock()
+ if endpoint == nil {
+ return 0, syserror.EBADFD
+ }
+ if !endpoint.IsAttached() {
+ return 0, syserror.EIO
+ }
+
+ dataLen := int64(len(data))
+
+ // Packet information.
+ var pktInfoHdr PacketInfoHeader
+ if !d.hasFlags(linux.IFF_NO_PI) {
+ if len(data) < PacketInfoHeaderSize {
+ // Ignore bad packet.
+ return dataLen, nil
+ }
+ pktInfoHdr = PacketInfoHeader(data[:PacketInfoHeaderSize])
+ data = data[PacketInfoHeaderSize:]
+ }
+
+ // Ethernet header (TAP only).
+ var ethHdr header.Ethernet
+ if d.hasFlags(linux.IFF_TAP) {
+ if len(data) < header.EthernetMinimumSize {
+ // Ignore bad packet.
+ return dataLen, nil
+ }
+ ethHdr = header.Ethernet(data[:header.EthernetMinimumSize])
+ data = data[header.EthernetMinimumSize:]
+ }
+
+ // Try to determine the network protocol number; defaults to zero.
+ var protocol tcpip.NetworkProtocolNumber
+ switch {
+ case pktInfoHdr != nil:
+ protocol = pktInfoHdr.Protocol()
+ case ethHdr != nil:
+ protocol = ethHdr.Type()
+ }
+
+ // Try to determine the remote link address; defaults to the zero MAC.
+ var remote tcpip.LinkAddress
+ switch {
+ case ethHdr != nil:
+ remote = ethHdr.SourceAddress()
+ default:
+ remote = tcpip.LinkAddress(zeroMAC[:])
+ }
+
+ pkt := &stack.PacketBuffer{
+ Data: buffer.View(data).ToVectorisedView(),
+ }
+ if ethHdr != nil {
+ pkt.LinkHeader = buffer.View(ethHdr)
+ }
+ endpoint.InjectLinkAddr(protocol, remote, pkt)
+ return dataLen, nil
+}
+
+// Read reads one outgoing packet from the network interface.
+func (d *Device) Read() ([]byte, error) {
+ d.mu.RLock()
+ endpoint := d.endpoint
+ d.mu.RUnlock()
+ if endpoint == nil {
+ return nil, syserror.EBADFD
+ }
+
+ for {
+ info, ok := endpoint.Read()
+ if !ok {
+ return nil, syserror.ErrWouldBlock
+ }
+
+ v, ok := d.encodePkt(&info)
+ if !ok {
+ // Ignore unsupported packet.
+ continue
+ }
+ return v, nil
+ }
+}
+
+// encodePkt encodes a packet for the fd side.
+func (d *Device) encodePkt(info *channel.PacketInfo) (buffer.View, bool) {
+ var vv buffer.VectorisedView
+
+ // Packet information.
+ if !d.hasFlags(linux.IFF_NO_PI) {
+ hdr := make(PacketInfoHeader, PacketInfoHeaderSize)
+ hdr.Encode(&PacketInfoFields{
+ Protocol: info.Proto,
+ })
+ vv.AppendView(buffer.View(hdr))
+ }
+
+ // If the packet does not already have a link-layer header and the route
+ // carries no remote link address, we cannot synthesize one. This is possibly
+ // a raw packet, which the tun device does not support at the moment.
+ if info.Pkt.LinkHeader == nil && info.Route.RemoteLinkAddress == "" {
+ return nil, false
+ }
+
+ // Ethernet header (TAP only).
+ if d.hasFlags(linux.IFF_TAP) {
+ // Add ethernet header if not provided.
+ if info.Pkt.LinkHeader == nil {
+ hdr := &header.EthernetFields{
+ SrcAddr: info.Route.LocalLinkAddress,
+ DstAddr: info.Route.RemoteLinkAddress,
+ Type: info.Proto,
+ }
+ if hdr.SrcAddr == "" {
+ hdr.SrcAddr = d.endpoint.LinkAddress()
+ }
+
+ eth := make(header.Ethernet, header.EthernetMinimumSize)
+ eth.Encode(hdr)
+ vv.AppendView(buffer.View(eth))
+ } else {
+ vv.AppendView(info.Pkt.LinkHeader)
+ }
+ }
+
+ // Append upper headers.
+ vv.AppendView(buffer.View(info.Pkt.Header.View()[len(info.Pkt.LinkHeader):]))
+ // Append data payload.
+ vv.Append(info.Pkt.Data)
+
+ return vv.ToView(), true
+}
+
+// Name returns the name of the attached network interface. Empty string if
+// unattached.
+func (d *Device) Name() string {
+ d.mu.RLock()
+ defer d.mu.RUnlock()
+ if d.endpoint != nil {
+ return d.endpoint.name
+ }
+ return ""
+}
+
+// Flags returns the flags set for d. Zero value if unset.
+func (d *Device) Flags() uint16 {
+ d.mu.RLock()
+ defer d.mu.RUnlock()
+ return d.flags
+}
+
+func (d *Device) hasFlags(flags uint16) bool {
+ return d.flags&flags == flags
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (d *Device) Readiness(mask waiter.EventMask) waiter.EventMask {
+ if mask&waiter.EventIn != 0 {
+ d.mu.RLock()
+ endpoint := d.endpoint
+ d.mu.RUnlock()
+ if endpoint != nil && endpoint.NumQueued() == 0 {
+ mask &= ^waiter.EventIn
+ }
+ }
+ return mask & (waiter.EventIn | waiter.EventOut)
+}
+
+// WriteNotify implements channel.Notification.WriteNotify.
+func (d *Device) WriteNotify() {
+ d.Notify(waiter.EventIn)
+}
+
+// tunEndpoint is the link endpoint for the NIC created by the tun device.
+//
+// It is ref-counted as multiple open files can attach to the same NIC.
+// The last owner is responsible for deleting the NIC.
+type tunEndpoint struct {
+ *channel.Endpoint
+
+ refs.AtomicRefCount
+
+ stack *stack.Stack
+ nicID tcpip.NICID
+ name string
+}
+
+// DecRef decrements the refcount of e, removing the NIC when it reaches 0.
+func (e *tunEndpoint) DecRef() {
+ e.DecRefWithDestructor(func() {
+ e.stack.RemoveNIC(e.nicID)
+ })
+}
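
Device.Write above expects the same byte layout Linux uses on /dev/net/tun: an optional 4-byte packet-information header (omitted when IFF_NO_PI is set), followed in TAP mode by an Ethernet frame. Below is a minimal sketch of assembling such a buffer for a TAP device with the PI header present; buildTAPFrame is a hypothetical helper, not part of this commit, and the payload is assumed to already be a serialized IPv4 packet.

package example

import (
	"gvisor.dev/gvisor/pkg/tcpip"
	"gvisor.dev/gvisor/pkg/tcpip/header"
	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
)

// buildTAPFrame assembles the layout Device.Write parses: a 4-byte
// packet-information header, then an Ethernet header, then the payload.
func buildTAPFrame(src, dst tcpip.LinkAddress, payload []byte) []byte {
	buf := make([]byte, 0, tun.PacketInfoHeaderSize+header.EthernetMinimumSize+len(payload))

	// Packet-information header, present because IFF_NO_PI is unset.
	pi := make(tun.PacketInfoHeader, tun.PacketInfoHeaderSize)
	pi.Encode(&tun.PacketInfoFields{Protocol: header.IPv4ProtocolNumber})
	buf = append(buf, pi...)

	// Ethernet header, required in TAP mode.
	eth := make(header.Ethernet, header.EthernetMinimumSize)
	eth.Encode(&header.EthernetFields{
		SrcAddr: src,
		DstAddr: dst,
		Type:    header.IPv4ProtocolNumber,
	})
	buf = append(buf, eth...)

	return append(buf, payload...)
}
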
diff --git a/pkg/tcpip/link/tun/protocol.go b/pkg/tcpip/link/tun/protocol.go
new file mode 100644
index 000000000..89d9d91a9
--- /dev/null
+++ b/pkg/tcpip/link/tun/protocol.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tun
+
+import (
+ "encoding/binary"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+)
+
+const (
+ // PacketInfoHeaderSize is the size of the packet information header.
+ PacketInfoHeaderSize = 4
+
+ offsetFlags = 0
+ offsetProtocol = 2
+)
+
+// PacketInfoFields contains the fields sent on the wire when the IFF_NO_PI
+// flag is not set.
+type PacketInfoFields struct {
+ Flags uint16
+ Protocol tcpip.NetworkProtocolNumber
+}
+
+// PacketInfoHeader is the wire representation of the packet information sent
+// when the IFF_NO_PI flag is not set.
+type PacketInfoHeader []byte
+
+// Encode encodes f into h.
+func (h PacketInfoHeader) Encode(f *PacketInfoFields) {
+ binary.BigEndian.PutUint16(h[offsetFlags:][:2], f.Flags)
+ binary.BigEndian.PutUint16(h[offsetProtocol:][:2], uint16(f.Protocol))
+}
+
+// Flags returns the flag field in h.
+func (h PacketInfoHeader) Flags() uint16 {
+ return binary.BigEndian.Uint16(h[offsetFlags:])
+}
+
+// Protocol returns the protocol field in h.
+func (h PacketInfoHeader) Protocol() tcpip.NetworkProtocolNumber {
+ return tcpip.NetworkProtocolNumber(binary.BigEndian.Uint16(h[offsetProtocol:]))
+}
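
As a quick illustration of the accessors above, a throwaway round-trip test (not part of this commit; the flag value is arbitrary):

package tun

import (
	"testing"

	"gvisor.dev/gvisor/pkg/tcpip"
)

// TestPacketInfoHeaderRoundTrip encodes a header and reads the fields back.
func TestPacketInfoHeaderRoundTrip(t *testing.T) {
	const proto = tcpip.NetworkProtocolNumber(0x0800) // IPv4 EtherType

	h := make(PacketInfoHeader, PacketInfoHeaderSize)
	h.Encode(&PacketInfoFields{Flags: 0x1, Protocol: proto})

	if got := h.Flags(); got != 0x1 {
		t.Errorf("Flags() = %#x, want 0x1", got)
	}
	if got := h.Protocol(); got != proto {
		t.Errorf("Protocol() = %#x, want %#x", got, proto)
	}
}
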
diff --git a/pkg/tcpip/link/tun/tun_unsafe.go b/pkg/tcpip/link/tun/tun_unsafe.go
new file mode 100644
index 000000000..09ca9b527
--- /dev/null
+++ b/pkg/tcpip/link/tun/tun_unsafe.go
@@ -0,0 +1,63 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build linux
+
+// Package tun contains methods to open TAP and TUN devices.
+package tun
+
+import (
+ "syscall"
+ "unsafe"
+)
+
+// Open opens the specified TUN device, sets it to non-blocking mode, and
+// returns its file descriptor.
+func Open(name string) (int, error) {
+ return open(name, syscall.IFF_TUN|syscall.IFF_NO_PI)
+}
+
+// OpenTAP opens the specified TAP device, sets it to non-blocking mode, and
+// returns its file descriptor.
+func OpenTAP(name string) (int, error) {
+ return open(name, syscall.IFF_TAP|syscall.IFF_NO_PI)
+}
+
+func open(name string, flags uint16) (int, error) {
+ fd, err := syscall.Open("/dev/net/tun", syscall.O_RDWR, 0)
+ if err != nil {
+ return -1, err
+ }
+
+ var ifr struct {
+ name [16]byte
+ flags uint16
+ _ [22]byte
+ }
+
+ copy(ifr.name[:], name)
+ ifr.flags = flags
+ _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), syscall.TUNSETIFF, uintptr(unsafe.Pointer(&ifr)))
+ if errno != 0 {
+ syscall.Close(fd)
+ return -1, errno
+ }
+
+ if err = syscall.SetNonblock(fd, true); err != nil {
+ syscall.Close(fd)
+ return -1, err
+ }
+
+ return fd, nil
+}
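
Since open() leaves the fd in non-blocking mode, a host-side reader sees EAGAIN whenever nothing is queued. A minimal usage sketch follows, assuming a preexisting device named "tap0"; a production reader would block in poll(2) rather than spin.

package main

import (
	"fmt"
	"syscall"

	"gvisor.dev/gvisor/pkg/tcpip/link/tun"
)

func main() {
	fd, err := tun.OpenTAP("tap0") // assumes tap0 already exists
	if err != nil {
		panic(err)
	}
	defer syscall.Close(fd)

	buf := make([]byte, 65536) // large enough for any single frame
	for {
		n, err := syscall.Read(fd, buf)
		if err == syscall.EAGAIN {
			continue // non-blocking fd, nothing queued yet
		}
		if err != nil {
			panic(err)
		}
		fmt.Printf("frame (%d bytes): % x\n", n, buf[:n])
	}
}
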
diff --git a/pkg/tcpip/link/waitable/BUILD b/pkg/tcpip/link/waitable/BUILD
new file mode 100644
index 000000000..0956d2c65
--- /dev/null
+++ b/pkg/tcpip/link/waitable/BUILD
@@ -0,0 +1,30 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+package(licenses = ["notice"])
+
+go_library(
+ name = "waitable",
+ srcs = [
+ "waitable.go",
+ ],
+ visibility = ["//visibility:public"],
+ deps = [
+ "//pkg/gate",
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/stack",
+ ],
+)
+
+go_test(
+ name = "waitable_test",
+ srcs = [
+ "waitable_test.go",
+ ],
+ library = ":waitable",
+ deps = [
+ "//pkg/tcpip",
+ "//pkg/tcpip/buffer",
+ "//pkg/tcpip/stack",
+ ],
+)
diff --git a/pkg/tcpip/link/waitable/waitable.go b/pkg/tcpip/link/waitable/waitable.go
new file mode 100644
index 000000000..949b3f2b2
--- /dev/null
+++ b/pkg/tcpip/link/waitable/waitable.go
@@ -0,0 +1,149 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package waitable provides the implementation of data-link layer endpoints
+// that wrap other endpoints, and can wait for inflight calls to WritePacket or
+// DeliverNetworkPacket to finish (and new ones to be prevented).
+//
+// Waitable endpoints can be used in the networking stack by calling New(lower)
+// to create a new endpoint, where lower is the endpoint being wrapped, and
+// then passing it as an argument to Stack.CreateNIC().
+package waitable
+
+import (
+ "gvisor.dev/gvisor/pkg/gate"
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+// Endpoint is a waitable link-layer endpoint.
+type Endpoint struct {
+ dispatchGate gate.Gate
+ dispatcher stack.NetworkDispatcher
+
+ writeGate gate.Gate
+ lower stack.LinkEndpoint
+}
+
+// New creates a new waitable link-layer endpoint. It wraps around another
+// endpoint and allows the caller to block new write/dispatch calls and wait for
+// the inflight ones to finish before returning.
+func New(lower stack.LinkEndpoint) *Endpoint {
+ return &Endpoint{
+ lower: lower,
+ }
+}
+
+// DeliverNetworkPacket implements stack.NetworkDispatcher.DeliverNetworkPacket.
+// It is called by the link-layer endpoint being wrapped when a packet arrives,
+// and only forwards to the actual dispatcher if Wait or WaitDispatch haven't
+// been called.
+func (e *Endpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ if !e.dispatchGate.Enter() {
+ return
+ }
+
+ e.dispatcher.DeliverNetworkPacket(remote, local, protocol, pkt)
+ e.dispatchGate.Leave()
+}
+
+// Attach implements stack.LinkEndpoint.Attach. It saves the dispatcher and
+// registers with the lower endpoint as its dispatcher so that "e" is called
+// for inbound packets.
+func (e *Endpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.dispatcher = dispatcher
+ e.lower.Attach(e)
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *Endpoint) IsAttached() bool {
+ return e.dispatcher != nil
+}
+
+// MTU implements stack.LinkEndpoint.MTU. It just forwards the request to the
+// lower endpoint.
+func (e *Endpoint) MTU() uint32 {
+ return e.lower.MTU()
+}
+
+// Capabilities implements stack.LinkEndpoint.Capabilities. It just forwards the
+// request to the lower endpoint.
+func (e *Endpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return e.lower.Capabilities()
+}
+
+// MaxHeaderLength implements stack.LinkEndpoint.MaxHeaderLength. It just
+// forwards the request to the lower endpoint.
+func (e *Endpoint) MaxHeaderLength() uint16 {
+ return e.lower.MaxHeaderLength()
+}
+
+// LinkAddress implements stack.LinkEndpoint.LinkAddress. It just forwards the
+// request to the lower endpoint.
+func (e *Endpoint) LinkAddress() tcpip.LinkAddress {
+ return e.lower.LinkAddress()
+}
+
+// WritePacket implements stack.LinkEndpoint.WritePacket. It is called by
+// higher-level protocols to write packets. It only forwards packets to the
+// lower endpoint if Wait or WaitWrite haven't been called.
+func (e *Endpoint) WritePacket(r *stack.Route, gso *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ if !e.writeGate.Enter() {
+ return nil
+ }
+
+ err := e.lower.WritePacket(r, gso, protocol, pkt)
+ e.writeGate.Leave()
+ return err
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets. It is called by
+// higher-level protocols to write packets. It only forwards packets to the
+// lower endpoint if Wait or WaitWrite haven't been called.
+func (e *Endpoint) WritePackets(r *stack.Route, gso *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ if !e.writeGate.Enter() {
+ return pkts.Len(), nil
+ }
+
+ n, err := e.lower.WritePackets(r, gso, pkts, protocol)
+ e.writeGate.Leave()
+ return n, err
+}
+
+// WriteRawPacket implements stack.LinkEndpoint.WriteRawPacket.
+func (e *Endpoint) WriteRawPacket(vv buffer.VectorisedView) *tcpip.Error {
+ if !e.writeGate.Enter() {
+ return nil
+ }
+
+ err := e.lower.WriteRawPacket(vv)
+ e.writeGate.Leave()
+ return err
+}
+
+// WaitWrite prevents new calls to WritePacket from reaching the lower endpoint,
+// and waits for inflight ones to finish before returning.
+func (e *Endpoint) WaitWrite() {
+ e.writeGate.Close()
+}
+
+// WaitDispatch prevents new calls to DeliverNetworkPacket from reaching the
+// actual dispatcher, and waits for inflight ones to finish before returning.
+func (e *Endpoint) WaitDispatch() {
+ e.dispatchGate.Close()
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (e *Endpoint) Wait() {}
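
A sketch of the intended shutdown sequence, with the loopback endpoint standing in for an arbitrary lower endpoint (the stack wiring is elided):

package main

import (
	"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
	"gvisor.dev/gvisor/pkg/tcpip/link/waitable"
)

func main() {
	// Wrap a lower endpoint; the wrapper is what gets passed to
	// Stack.CreateNIC.
	ep := waitable.New(loopback.New())

	// ... the endpoint is attached and carries traffic ...

	// At teardown, close both gates: new writes and dispatches are
	// dropped, and these calls return once inflight ones have drained.
	ep.WaitWrite()
	ep.WaitDispatch()
}
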
diff --git a/pkg/tcpip/link/waitable/waitable_test.go b/pkg/tcpip/link/waitable/waitable_test.go
new file mode 100644
index 000000000..63bf40562
--- /dev/null
+++ b/pkg/tcpip/link/waitable/waitable_test.go
@@ -0,0 +1,173 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package waitable
+
+import (
+ "testing"
+
+ "gvisor.dev/gvisor/pkg/tcpip"
+ "gvisor.dev/gvisor/pkg/tcpip/buffer"
+ "gvisor.dev/gvisor/pkg/tcpip/stack"
+)
+
+type countedEndpoint struct {
+ dispatchCount int
+ writeCount int
+ attachCount int
+
+ mtu uint32
+ capabilities stack.LinkEndpointCapabilities
+ hdrLen uint16
+ linkAddr tcpip.LinkAddress
+
+ dispatcher stack.NetworkDispatcher
+}
+
+func (e *countedEndpoint) DeliverNetworkPacket(remote, local tcpip.LinkAddress, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) {
+ e.dispatchCount++
+}
+
+func (e *countedEndpoint) Attach(dispatcher stack.NetworkDispatcher) {
+ e.attachCount++
+ e.dispatcher = dispatcher
+}
+
+// IsAttached implements stack.LinkEndpoint.IsAttached.
+func (e *countedEndpoint) IsAttached() bool {
+ return e.dispatcher != nil
+}
+
+func (e *countedEndpoint) MTU() uint32 {
+ return e.mtu
+}
+
+func (e *countedEndpoint) Capabilities() stack.LinkEndpointCapabilities {
+ return e.capabilities
+}
+
+func (e *countedEndpoint) MaxHeaderLength() uint16 {
+ return e.hdrLen
+}
+
+func (e *countedEndpoint) LinkAddress() tcpip.LinkAddress {
+ return e.linkAddr
+}
+
+func (e *countedEndpoint) WritePacket(r *stack.Route, _ *stack.GSO, protocol tcpip.NetworkProtocolNumber, pkt *stack.PacketBuffer) *tcpip.Error {
+ e.writeCount++
+ return nil
+}
+
+// WritePackets implements stack.LinkEndpoint.WritePackets.
+func (e *countedEndpoint) WritePackets(r *stack.Route, _ *stack.GSO, pkts stack.PacketBufferList, protocol tcpip.NetworkProtocolNumber) (int, *tcpip.Error) {
+ e.writeCount += pkts.Len()
+ return pkts.Len(), nil
+}
+
+func (e *countedEndpoint) WriteRawPacket(buffer.VectorisedView) *tcpip.Error {
+ e.writeCount++
+ return nil
+}
+
+// Wait implements stack.LinkEndpoint.Wait.
+func (*countedEndpoint) Wait() {}
+
+func TestWaitWrite(t *testing.T) {
+ ep := &countedEndpoint{}
+ wep := New(ep)
+
+ // Write and check that it goes through.
+ wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
+ if want := 1; ep.writeCount != want {
+ t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
+ }
+
+ // Wait on dispatches, then try to write. It must go through.
+ wep.WaitDispatch()
+ wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
+ if want := 2; ep.writeCount != want {
+ t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
+ }
+
+ // Wait on writes, then try to write. It must not go through.
+ wep.WaitWrite()
+ wep.WritePacket(nil, nil /* gso */, 0, &stack.PacketBuffer{})
+ if want := 2; ep.writeCount != want {
+ t.Fatalf("Unexpected writeCount: got=%v, want=%v", ep.writeCount, want)
+ }
+}
+
+func TestWaitDispatch(t *testing.T) {
+ ep := &countedEndpoint{}
+ wep := New(ep)
+
+ // Check that attach happens.
+ wep.Attach(ep)
+ if want := 1; ep.attachCount != want {
+ t.Fatalf("Unexpected attachCount: got=%v, want=%v", ep.attachCount, want)
+ }
+
+ // Dispatch and check that it goes through.
+ ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
+ if want := 1; ep.dispatchCount != want {
+ t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
+ }
+
+ // Wait on writes, then try to dispatch. It must go through.
+ wep.WaitWrite()
+ ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
+ if want := 2; ep.dispatchCount != want {
+ t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
+ }
+
+ // Wait on dispatches, then try to dispatch. It must not go through.
+ wep.WaitDispatch()
+ ep.dispatcher.DeliverNetworkPacket("", "", 0, &stack.PacketBuffer{})
+ if want := 2; ep.dispatchCount != want {
+ t.Fatalf("Unexpected dispatchCount: got=%v, want=%v", ep.dispatchCount, want)
+ }
+}
+
+func TestOtherMethods(t *testing.T) {
+ const (
+ mtu = 0xdead
+ capabilities = 0xbeef
+ hdrLen = 0x1234
+ linkAddr = "test address"
+ )
+ ep := &countedEndpoint{
+ mtu: mtu,
+ capabilities: capabilities,
+ hdrLen: hdrLen,
+ linkAddr: linkAddr,
+ }
+ wep := New(ep)
+
+ if v := wep.MTU(); v != mtu {
+ t.Fatalf("Unexpected mtu: got=%v, want=%v", v, mtu)
+ }
+
+ if v := wep.Capabilities(); v != capabilities {
+ t.Fatalf("Unexpected capabilities: got=%v, want=%v", v, capabilities)
+ }
+
+ if v := wep.MaxHeaderLength(); v != hdrLen {
+ t.Fatalf("Unexpected MaxHeaderLength: got=%v, want=%v", v, hdrLen)
+ }
+
+ if v := wep.LinkAddress(); v != linkAddr {
+ t.Fatalf("Unexpected LinkAddress: got=%q, want=%q", v, linkAddr)
+ }
+}