Merge branch 'master' into ip-forwarding

- Merges aleksej-paschenko's with HEAD - Adds vfs2 support for ip_forward
author: Ian Lewis <ianmlewis@gmail.com> 2020-08-17 21:44:31 -0400
committer: Ian Lewis <ianmlewis@gmail.com> 2020-08-17 21:44:31 -0400
commit: ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree: 0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/sentry/fsimpl
parent: 352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent: 6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)
132 files changed, 27759 insertions, 2482 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
new file mode 100644
index 000000000..93512c9b6
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -0,0 +1,44 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "devpts",
+    srcs = [
+        "devpts.go",
+        "line_discipline.go",
+        "master.go",
+        "queue.go",
+        "slave.go",
+        "terminal.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/unimpl",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "devpts_test",
+    size = "small",
+    srcs = ["devpts_test.go"],
+    library = ":devpts",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/contexttest",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
new file mode 100644
index 000000000..7169e91af
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -0,0 +1,233 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package devpts provides a filesystem implementation that behaves like
+// devpts.
+package devpts
+
+import (
+	"fmt"
+	"math"
+	"sort"
+	"strconv"
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the filesystem name.
+const Name = "devpts"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	// No data allowed.
+	if opts.Data != "" {
+		return nil, nil, syserror.EINVAL
+	}
+
+	fs, root, err := fstype.newFilesystem(vfsObj, creds)
+	if err != nil {
+		return nil, nil, err
+	}
+	return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// newFilesystem creates a new devpts filesystem with root directory and ptmx
+// master inode. It returns the filesystem and root Dentry.
+func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.Filesystem.VFSFilesystem().Init(vfsObj, fstype, fs)
+
+	// Construct the root directory. This is always inode id 1.
+	root := &rootInode{
+		slaves: make(map[uint32]*slaveInode),
+	}
+	root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
+	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	root.dentry.Init(root)
+
+	// Construct the pts master inode and dentry. Linux always uses inode
+	// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
+	master := &masterInode{
+		root: root,
+	}
+	master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
+	master.dentry.Init(master)
+
+	// Add the master as a child of the root.
+	links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
+		"ptmx": &master.dentry,
+	})
+	root.IncLinks(links)
+
+	return fs, &root.dentry, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release(ctx)
+}
+
+// rootInode is the root directory inode for the devpts mounts.
+type rootInode struct {
+	kernfs.AlwaysValid
+	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// master is the master pty inode. Immutable.
+	master *masterInode
+
+	// root is the root directory inode for this filesystem. Immutable.
+	root *rootInode
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// slaves maps pty ids to slave inodes.
+	slaves map[uint32]*slaveInode
+
+	// nextIdx is the next pty index to use. Must be accessed atomically.
+	//
+	// TODO(b/29356795): reuse indices when ptys are closed.
+	nextIdx uint32
+}
+
+var _ kernfs.Inode = (*rootInode)(nil)
+
+// allocateTerminal creates a new Terminal and installs a pts node for it.
+func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	if i.nextIdx == math.MaxUint32 {
+		return nil, syserror.ENOMEM
+	}
+	idx := i.nextIdx
+	i.nextIdx++
+
+	// Sanity check that slave with idx does not exist.
+	if _, ok := i.slaves[idx]; ok {
+		panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
+	}
+
+	// Create the new terminal and slave.
+	t := newTerminal(idx)
+	slave := &slaveInode{
+		root: i,
+		t:    t,
+	}
+	// Linux always uses pty index + 3 as the inode id. See
+	// fs/devpts/inode.c:devpts_pty_new().
+	slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+	slave.dentry.Init(slave)
+	i.slaves[idx] = slave
+
+	return t, nil
+}
+
+// masterClose is called when the master end of t is closed.
+func (i *rootInode) masterClose(t *Terminal) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+
+	// Sanity check that slave with idx exists.
+	if _, ok := i.slaves[t.n]; !ok {
+		panic(fmt.Sprintf("pty with index %d does not exist", t.n))
+	}
+	delete(i.slaves, t.n)
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	idx, err := strconv.ParseUint(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	if si, ok := i.slaves[uint32(idx)]; ok {
+		si.dentry.IncRef()
+		return si.dentry.VFSDentry(), nil
+
+	}
+	return nil, syserror.ENOENT
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	ids := make([]int, 0, len(i.slaves))
+	for id := range i.slaves {
+		ids = append(ids, int(id))
+	}
+	sort.Ints(ids)
+	for _, id := range ids[relOffset:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(id), 10),
+			Type:    linux.DT_CHR,
+			Ino:     i.slaves[uint32(id)].InodeAttrs.Ino(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
new file mode 100644
index 000000000..b7c149047
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -0,0 +1,56 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func TestSimpleMasterToSlave(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+	ctx := contexttest.Context(t)
+	inBytes := []byte("hello, tty\n")
+	src := usermem.BytesIOSequence(inBytes)
+	outBytes := make([]byte, 32)
+	dst := usermem.BytesIOSequence(outBytes)
+
+	// Write to the input queue.
+	nw, err := ld.inputQueueWrite(ctx, src)
+	if err != nil {
+		t.Fatalf("error writing to input queue: %v", err)
+	}
+	if nw != int64(len(inBytes)) {
+		t.Fatalf("wrote wrong length: got %d, want %d", nw, len(inBytes))
+	}
+
+	// Read from the input queue.
+	nr, err := ld.inputQueueRead(ctx, dst)
+	if err != nil {
+		t.Fatalf("error reading from input queue: %v", err)
+	}
+	if nr != int64(len(inBytes)) {
+		t.Fatalf("read wrong length: got %d, want %d", nr, len(inBytes))
+	}
+
+	outStr := string(outBytes[:nr])
+	inStr := string(inBytes)
+	if outStr != inStr {
+		t.Fatalf("written and read strings do not match: got %q, want %q", outStr, inStr)
+	}
+}
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
new file mode 100644
index 000000000..f7bc325d1
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -0,0 +1,445 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"bytes"
+	"unicode/utf8"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const (
+	// canonMaxBytes is the number of bytes that fit into a single line of
+	// terminal input in canonical mode. This corresponds to N_TTY_BUF_SIZE
+	// in include/linux/tty.h.
+	canonMaxBytes = 4096
+
+	// nonCanonMaxBytes is the maximum number of bytes that can be read at
+	// a time in noncanonical mode.
+	nonCanonMaxBytes = canonMaxBytes - 1
+
+	spacesPerTab = 8
+)
+
+// lineDiscipline dictates how input and output are handled between the
+// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
+// pages are good resources for how to affect the line discipline:
+//
+//   * termios(3)
+//   * tty_ioctl(4)
+//
+// This file corresponds most closely to drivers/tty/n_tty.c.
+//
+// lineDiscipline has a simple structure but supports a multitude of options
+// (see the above man pages). It consists of two queues of bytes: one from the
+// terminal master to slave (the input queue) and one from slave to master (the
+// output queue). When bytes are written to one end of the pty, the line
+// discipline reads the bytes, modifies them or takes special action if
+// required, and enqueues them to be read by the other end of the pty:
+//
+//       input from terminal    +-------------+   input to process (e.g. bash)
+//    +------------------------>| input queue |---------------------------+
+//    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
+//    |                                                                   |
+//    |                                                                   v
+// masterFD                                                            slaveFD
+//    ^                                                                   |
+//    |                                                                   |
+//    |   output to terminal   +--------------+    output from process    |
+//    +------------------------| output queue |<--------------------------+
+//        (outputQueueRead)    +--------------+    (outputQueueWrite)
+//
+// Lock order:
+//  termiosMu
+//    inQueue.mu
+//      outQueue.mu
+//
+// +stateify savable
+type lineDiscipline struct {
+	// sizeMu protects size.
+	sizeMu sync.Mutex `state:"nosave"`
+
+	// size is the terminal size (width and height).
+	size linux.WindowSize
+
+	// inQueue is the input queue of the terminal.
+	inQueue queue
+
+	// outQueue is the output queue of the terminal.
+	outQueue queue
+
+	// termiosMu protects termios.
+	termiosMu sync.RWMutex `state:"nosave"`
+
+	// termios is the terminal configuration used by the lineDiscipline.
+	termios linux.KernelTermios
+
+	// column is the location in a row of the cursor. This is important for
+	// handling certain special characters like backspace.
+	column int
+
+	// masterWaiter is used to wait on the master end of the TTY.
+	masterWaiter waiter.Queue `state:"zerovalue"`
+
+	// slaveWaiter is used to wait on the slave end of the TTY.
+	slaveWaiter waiter.Queue `state:"zerovalue"`
+}
+
+func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
+	ld := lineDiscipline{termios: termios}
+	ld.inQueue.transformer = &inputQueueTransformer{}
+	ld.outQueue.transformer = &outputQueueTransformer{}
+	return &ld
+}
+
+// getTermios gets the linux.Termios for the tty.
+func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	// We must copy a Termios struct, not KernelTermios.
+	t := l.termios.ToTermios()
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return 0, err
+}
+
+// setTermios sets a linux.Termios for the tty.
+func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	l.termiosMu.Lock()
+	defer l.termiosMu.Unlock()
+	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
+	// We must copy a Termios struct, not KernelTermios.
+	var t linux.Termios
+	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	l.termios.FromTermios(t)
+
+	// If canonical mode is turned off, move bytes from inQueue's wait
+	// buffer to its read buffer. Anything already in the read buffer is
+	// now readable.
+	if oldCanonEnabled && !l.termios.LEnabled(linux.ICANON) {
+		l.inQueue.mu.Lock()
+		l.inQueue.pushWaitBufLocked(l)
+		l.inQueue.readable = true
+		l.inQueue.mu.Unlock()
+		l.slaveWaiter.Notify(waiter.EventIn)
+	}
+
+	return 0, err
+}
+
+func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	l.sizeMu.Lock()
+	defer l.sizeMu.Unlock()
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+}
+
+func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	l.sizeMu.Lock()
+	defer l.sizeMu.Unlock()
+	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+}
+
+func (l *lineDiscipline) masterReadiness() waiter.EventMask {
+	// We don't have to lock a termios because the default master termios
+	// is immutable.
+	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
+}
+
+func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
+}
+
+func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(ctx, io, args)
+}
+
+func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, pushed, err := l.inQueue.read(ctx, dst, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.masterWaiter.Notify(waiter.EventOut)
+		if pushed {
+			l.slaveWaiter.Notify(waiter.EventIn)
+		}
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, err := l.inQueue.write(ctx, src, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.slaveWaiter.Notify(waiter.EventIn)
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(ctx, io, args)
+}
+
+func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, pushed, err := l.outQueue.read(ctx, dst, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.slaveWaiter.Notify(waiter.EventOut)
+		if pushed {
+			l.masterWaiter.Notify(waiter.EventIn)
+		}
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+func (l *lineDiscipline) outputQueueWrite(ctx context.Context, src usermem.IOSequence) (int64, error) {
+	l.termiosMu.RLock()
+	defer l.termiosMu.RUnlock()
+	n, err := l.outQueue.write(ctx, src, l)
+	if err != nil {
+		return 0, err
+	}
+	if n > 0 {
+		l.masterWaiter.Notify(waiter.EventIn)
+		return n, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+// transformer is a helper interface to make it easier to stateify queue.
+type transformer interface {
+	// transform functions require queue's mutex to be held.
+	transform(*lineDiscipline, *queue, []byte) int
+}
+
+// outputQueueTransformer implements transformer. It performs line discipline
+// transformations on the output queue.
+//
+// +stateify savable
+type outputQueueTransformer struct{}
+
+// transform does output processing for one end of the pty. See
+// drivers/tty/n_tty.c:do_output_char for an analogous kernel function.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (*outputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int {
+	// transformOutput is effectively always in noncanonical mode, as the
+	// master termios never has ICANON set.
+
+	if !l.termios.OEnabled(linux.OPOST) {
+		q.readBuf = append(q.readBuf, buf...)
+		if len(q.readBuf) > 0 {
+			q.readable = true
+		}
+		return len(buf)
+	}
+
+	var ret int
+	for len(buf) > 0 {
+		size := l.peek(buf)
+		cBytes := append([]byte{}, buf[:size]...)
+		ret += size
+		buf = buf[size:]
+		// We're guaranteed that cBytes has at least one element.
+		switch cBytes[0] {
+		case '\n':
+			if l.termios.OEnabled(linux.ONLRET) {
+				l.column = 0
+			}
+			if l.termios.OEnabled(linux.ONLCR) {
+				q.readBuf = append(q.readBuf, '\r', '\n')
+				continue
+			}
+		case '\r':
+			if l.termios.OEnabled(linux.ONOCR) && l.column == 0 {
+				continue
+			}
+			if l.termios.OEnabled(linux.OCRNL) {
+				cBytes[0] = '\n'
+				if l.termios.OEnabled(linux.ONLRET) {
+					l.column = 0
+				}
+				break
+			}
+			l.column = 0
+		case '\t':
+			spaces := spacesPerTab - l.column%spacesPerTab
+			if l.termios.OutputFlags&linux.TABDLY == linux.XTABS {
+				l.column += spaces
+				q.readBuf = append(q.readBuf, bytes.Repeat([]byte{' '}, spacesPerTab)...)
+				continue
+			}
+			l.column += spaces
+		case '\b':
+			if l.column > 0 {
+				l.column--
+			}
+		default:
+			l.column++
+		}
+		q.readBuf = append(q.readBuf, cBytes...)
+	}
+	if len(q.readBuf) > 0 {
+		q.readable = true
+	}
+	return ret
+}
+
+// inputQueueTransformer implements transformer. It performs line discipline
+// transformations on the input queue.
+//
+// +stateify savable
+type inputQueueTransformer struct{}
+
+// transform does input processing for one end of the pty. Characters read are
+// transformed according to flags set in the termios struct. See
+// drivers/tty/n_tty.c:n_tty_receive_char_special for an analogous kernel
+// function.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (*inputQueueTransformer) transform(l *lineDiscipline, q *queue, buf []byte) int {
+	// If there's a line waiting to be read in canonical mode, don't write
+	// anything else to the read buffer.
+	if l.termios.LEnabled(linux.ICANON) && q.readable {
+		return 0
+	}
+
+	maxBytes := nonCanonMaxBytes
+	if l.termios.LEnabled(linux.ICANON) {
+		maxBytes = canonMaxBytes
+	}
+
+	var ret int
+	for len(buf) > 0 && len(q.readBuf) < canonMaxBytes {
+		size := l.peek(buf)
+		cBytes := append([]byte{}, buf[:size]...)
+		// We're guaranteed that cBytes has at least one element.
+		switch cBytes[0] {
+		case '\r':
+			if l.termios.IEnabled(linux.IGNCR) {
+				buf = buf[size:]
+				ret += size
+				continue
+			}
+			if l.termios.IEnabled(linux.ICRNL) {
+				cBytes[0] = '\n'
+			}
+		case '\n':
+			if l.termios.IEnabled(linux.INLCR) {
+				cBytes[0] = '\r'
+			}
+		}
+
+		// In canonical mode, we discard non-terminating characters
+		// after the first 4095.
+		if l.shouldDiscard(q, cBytes) {
+			buf = buf[size:]
+			ret += size
+			continue
+		}
+
+		// Stop if the buffer would be overfilled.
+		if len(q.readBuf)+size > maxBytes {
+			break
+		}
+		buf = buf[size:]
+		ret += size
+
+		// If we get EOF, make the buffer available for reading.
+		if l.termios.LEnabled(linux.ICANON) && l.termios.IsEOF(cBytes[0]) {
+			q.readable = true
+			break
+		}
+
+		q.readBuf = append(q.readBuf, cBytes...)
+
+		// Anything written to the readBuf will have to be echoed.
+		if l.termios.LEnabled(linux.ECHO) {
+			l.outQueue.writeBytes(cBytes, l)
+			l.masterWaiter.Notify(waiter.EventIn)
+		}
+
+		// If we finish a line, make it available for reading.
+		if l.termios.LEnabled(linux.ICANON) && l.termios.IsTerminating(cBytes) {
+			q.readable = true
+			break
+		}
+	}
+
+	// In noncanonical mode, everything is readable.
+	if !l.termios.LEnabled(linux.ICANON) && len(q.readBuf) > 0 {
+		q.readable = true
+	}
+
+	return ret
+}
+
+// shouldDiscard returns whether c should be discarded. In canonical mode, if
+// too many bytes are enqueued, we keep reading input and discarding it until
+// we find a terminating character. Signal/echo processing still occurs.
+//
+// Precondition:
+// * l.termiosMu must be held for reading.
+// * q.mu must be held.
+func (l *lineDiscipline) shouldDiscard(q *queue, cBytes []byte) bool {
+	return l.termios.LEnabled(linux.ICANON) && len(q.readBuf)+len(cBytes) >= canonMaxBytes && !l.termios.IsTerminating(cBytes)
+}
+
+// peek returns the size in bytes of the next character to process. As long as
+// b isn't empty, peek returns a value of at least 1.
+func (l *lineDiscipline) peek(b []byte) int {
+	size := 1
+	// If UTF-8 support is enabled, runes might be multiple bytes.
+	if l.termios.IEnabled(linux.IUTF8) {
+		_, size = utf8.DecodeRune(b)
+	}
+	return size
+}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
new file mode 100644
index 000000000..3bb397f71
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -0,0 +1,237 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// masterInode is the inode for the master end of the Terminal.
+type masterInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// root is the devpts root inode.
+	root *rootInode
+}
+
+var _ kernfs.Inode = (*masterInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	t, err := mi.root.allocateTerminal(rp.Credentials())
+	if err != nil {
+		return nil, err
+	}
+
+	mi.IncRef()
+	fd := &masterFileDescription{
+		inode: mi,
+		t:     t,
+	}
+	fd.LockFD.Init(&mi.locks)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		mi.DecRef(ctx)
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (mi *masterInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := mi.InodeAttrs.Stat(ctx, vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.TTYAUX_MAJOR
+	statx.RdevMinor = linux.PTMX_MINOR
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+type masterFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	inode *masterInode
+	t     *Terminal
+}
+
+var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (mfd *masterFileDescription) Release(ctx context.Context) {
+	mfd.inode.root.masterClose(mfd.t)
+	mfd.inode.DecRef(ctx)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (mfd *masterFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	mfd.t.ld.masterWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (mfd *masterFileDescription) EventUnregister(e *waiter.Entry) {
+	mfd.t.ld.masterWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (mfd *masterFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return mfd.t.ld.masterReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (mfd *masterFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return mfd.t.ld.outputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return mfd.t.ld.inputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the output queue read buffer.
+		return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+	case linux.TCGETS:
+		// N.B. TCGETS on the master actually returns the configuration
+		// of the slave end.
+		return mfd.t.ld.getTermios(ctx, io, args)
+	case linux.TCSETS:
+		// N.B. TCSETS on the master actually affects the configuration
+		// of the slave end.
+		return mfd.t.ld.setTermios(ctx, io, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return mfd.t.ld.setTermios(ctx, io, args)
+	case linux.TIOCGPTN:
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+	case linux.TIOCSPTLCK:
+		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
+		return 0, nil
+	case linux.TIOCGWINSZ:
+		return 0, mfd.t.ld.windowSize(ctx, io, args)
+	case linux.TIOCSWINSZ:
+		return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (mfd *masterFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return mfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (mfd *masterFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := mfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return mfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (mfd *masterFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return mfd.Locks().LockPOSIX(ctx, &mfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (mfd *masterFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return mfd.Locks().UnlockPOSIX(ctx, &mfd.vfsfd, uid, start, length, whence)
+}
+
+// maybeEmitUnimplementedEvent emits unimplemented event if cmd is valid.
+func maybeEmitUnimplementedEvent(ctx context.Context, cmd uint32) {
+	switch cmd {
+	case linux.TCGETS,
+		linux.TCSETS,
+		linux.TCSETSW,
+		linux.TCSETSF,
+		linux.TIOCGWINSZ,
+		linux.TIOCSWINSZ,
+		linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+	}
+}
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
new file mode 100644
index 000000000..dffb4232c
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -0,0 +1,236 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// waitBufMaxBytes is the maximum size of a wait buffer. It is based on
+// TTYB_DEFAULT_MEM_LIMIT.
+const waitBufMaxBytes = 131072
+
+// queue represents one of the input or output queues between a pty master and
+// slave. Bytes written to a queue are added to the read buffer until it is
+// full, at which point they are written to the wait buffer. Bytes are
+// processed (i.e. undergo termios transformations) as they are added to the
+// read buffer. The read buffer is readable when its length is nonzero and
+// readable is true.
+//
+// +stateify savable
+type queue struct {
+	// mu protects everything in queue.
+	mu sync.Mutex `state:"nosave"`
+
+	// readBuf is buffer of data ready to be read when readable is true.
+	// This data has been processed.
+	readBuf []byte
+
+	// waitBuf contains data that can't fit into readBuf. It is put here
+	// until it can be loaded into the read buffer. waitBuf contains data
+	// that hasn't been processed.
+	waitBuf    [][]byte
+	waitBufLen uint64
+
+	// readable indicates whether the read buffer can be read from.  In
+	// canonical mode, there can be an unterminated line in the read buffer,
+	// so readable must be checked.
+	readable bool
+
+	// transform is the the queue's function for transforming bytes
+	// entering the queue. For example, transform might convert all '\r's
+	// entering the queue to '\n's.
+	transformer
+}
+
+// readReadiness returns whether q is ready to be read from.
+func (q *queue) readReadiness(t *linux.KernelTermios) waiter.EventMask {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if len(q.readBuf) > 0 && q.readable {
+		return waiter.EventIn
+	}
+	return waiter.EventMask(0)
+}
+
+// writeReadiness returns whether q is ready to be written to.
+func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	if q.waitBufLen < waitBufMaxBytes {
+		return waiter.EventOut
+	}
+	return waiter.EventMask(0)
+}
+
+// readableSize writes the number of readable bytes to userspace.
+func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+	var size int32
+	if q.readable {
+		size = int32(len(q.readBuf))
+	}
+
+	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return err
+
+}
+
+// read reads from q to userspace. It returns the number of bytes read as well
+// as whether the read caused more readable data to become available (whether
+// data was pushed from the wait buffer to the read buffer).
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	if !q.readable {
+		return 0, false, syserror.ErrWouldBlock
+	}
+
+	if dst.NumBytes() > canonMaxBytes {
+		dst = dst.TakeFirst(canonMaxBytes)
+	}
+
+	n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dst safemem.BlockSeq) (uint64, error) {
+		src := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(q.readBuf))
+		n, err := safemem.CopySeq(dst, src)
+		if err != nil {
+			return 0, err
+		}
+		q.readBuf = q.readBuf[n:]
+
+		// If we read everything, this queue is no longer readable.
+		if len(q.readBuf) == 0 {
+			q.readable = false
+		}
+
+		return n, nil
+	}))
+	if err != nil {
+		return 0, false, err
+	}
+
+	// Move data from the queue's wait buffer to its read buffer.
+	nPushed := q.pushWaitBufLocked(l)
+
+	return int64(n), nPushed > 0, nil
+}
+
+// write writes to q from userspace.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// Copy data into the wait buffer.
+	n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(src safemem.BlockSeq) (uint64, error) {
+		copyLen := src.NumBytes()
+		room := waitBufMaxBytes - q.waitBufLen
+		// If out of room, return EAGAIN.
+		if room == 0 && copyLen > 0 {
+			return 0, syserror.ErrWouldBlock
+		}
+		// Cap the size of the wait buffer.
+		if copyLen > room {
+			copyLen = room
+			src = src.TakeFirst64(room)
+		}
+		buf := make([]byte, copyLen)
+
+		// Copy the data into the wait buffer.
+		dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf))
+		n, err := safemem.CopySeq(dst, src)
+		if err != nil {
+			return 0, err
+		}
+		q.waitBufAppend(buf)
+
+		return n, nil
+	}))
+	if err != nil {
+		return 0, err
+	}
+
+	// Push data from the wait to the read buffer.
+	q.pushWaitBufLocked(l)
+
+	return n, nil
+}
+
+// writeBytes writes to q from b.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
+	q.mu.Lock()
+	defer q.mu.Unlock()
+
+	// Write to the wait buffer.
+	q.waitBufAppend(b)
+	q.pushWaitBufLocked(l)
+}
+
+// pushWaitBufLocked fills the queue's read buffer with data from the wait
+// buffer.
+//
+// Preconditions:
+// * l.termiosMu must be held for reading.
+// * q.mu must be locked.
+func (q *queue) pushWaitBufLocked(l *lineDiscipline) int {
+	if q.waitBufLen == 0 {
+		return 0
+	}
+
+	// Move data from the wait to the read buffer.
+	var total int
+	var i int
+	for i = 0; i < len(q.waitBuf); i++ {
+		n := q.transform(l, q, q.waitBuf[i])
+		total += n
+		if n != len(q.waitBuf[i]) {
+			// The read buffer filled up without consuming the
+			// entire buffer.
+			q.waitBuf[i] = q.waitBuf[i][n:]
+			break
+		}
+	}
+
+	// Update wait buffer based on consumed data.
+	q.waitBuf = q.waitBuf[i:]
+	q.waitBufLen -= uint64(total)
+
+	return total
+}
+
+// Precondition: q.mu must be locked.
+func (q *queue) waitBufAppend(b []byte) {
+	q.waitBuf = append(q.waitBuf, b)
+	q.waitBufLen += uint64(len(b))
+}
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
new file mode 100644
index 000000000..32e4e1908
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/slave.go
@@ -0,0 +1,197 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// slaveInode is the inode for the slave end of the Terminal.
+type slaveInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
+
+	// Keep a reference to this inode's dentry.
+	dentry kernfs.Dentry
+
+	// root is the devpts root inode.
+	root *rootInode
+
+	// t is the connected Terminal.
+	t *Terminal
+}
+
+var _ kernfs.Inode = (*slaveInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	si.IncRef()
+	fd := &slaveFileDescription{
+		inode: si,
+	}
+	fd.LockFD.Init(&si.locks)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		si.DecRef(ctx)
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (si *slaveInode) Valid(context.Context) bool {
+	// Return valid if the slave still exists.
+	si.root.mu.Lock()
+	defer si.root.mu.Unlock()
+	_, ok := si.root.slaves[si.t.n]
+	return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
+	statx.RdevMinor = si.t.n
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+type slaveFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	inode *slaveInode
+}
+
+var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (sfd *slaveFileDescription) Release(ctx context.Context) {
+	sfd.inode.DecRef(ctx)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
+	sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return sfd.inode.t.ld.slaveReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return sfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the input queue read buffer.
+		return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
+	case linux.TCGETS:
+		return sfd.inode.t.ld.getTermios(ctx, io, args)
+	case linux.TCSETS:
+		return sfd.inode.t.ld.setTermios(ctx, io, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return sfd.inode.t.ld.setTermios(ctx, io, args)
+	case linux.TIOCGPTN:
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+	case linux.TIOCGWINSZ:
+		return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
+	case linux.TIOCSWINSZ:
+		return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return sfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
new file mode 100644
index 000000000..7d2781c54
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -0,0 +1,120 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Terminal is a pseudoterminal.
+//
+// +stateify savable
+type Terminal struct {
+	// n is the terminal index. It is immutable.
+	n uint32
+
+	// ld is the line discipline of the terminal. It is immutable.
+	ld *lineDiscipline
+
+	// masterKTTY contains the controlling process of the master end of
+	// this terminal. This field is immutable.
+	masterKTTY *kernel.TTY
+
+	// slaveKTTY contains the controlling process of the slave end of this
+	// terminal. This field is immutable.
+	slaveKTTY *kernel.TTY
+}
+
+func newTerminal(n uint32) *Terminal {
+	termios := linux.DefaultSlaveTermios
+	t := Terminal{
+		n:          n,
+		ld:         newLineDiscipline(termios),
+		masterKTTY: &kernel.TTY{Index: n},
+		slaveKTTY:  &kernel.TTY{Index: n},
+	}
+	return &t
+}
+
+// setControllingTTY makes tm the controlling terminal of the calling thread
+// group.
+func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().SetControllingTTY(tm.tty(isMaster), args[2].Int())
+}
+
+// releaseControllingTTY removes tm as the controlling terminal of the calling
+// thread group.
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("releaseControllingTTY must be called from a task context")
+	}
+
+	return task.ThreadGroup().ReleaseControllingTTY(tm.tty(isMaster))
+}
+
+// foregroundProcessGroup gets the process group ID of tm's foreground process.
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("foregroundProcessGroup must be called from a task context")
+	}
+
+	ret, err := task.ThreadGroup().ForegroundProcessGroup(tm.tty(isMaster))
+	if err != nil {
+		return 0, err
+	}
+
+	// Write it out to *arg.
+	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
+		AddressSpaceActive: true,
+	})
+	return 0, err
+}
+
+// foregroundProcessGroup sets tm's foreground process.
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		panic("setForegroundProcessGroup must be called from a task context")
+	}
+
+	// Read in the process group ID.
+	var pgid int32
+	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
+		AddressSpaceActive: true,
+	}); err != nil {
+		return 0, err
+	}
+
+	ret, err := task.ThreadGroup().SetForegroundProcessGroup(tm.tty(isMaster), kernel.ProcessGroupID(pgid))
+	return uintptr(ret), err
+}
+
+func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
+	if isMaster {
+		return tm.masterKTTY
+	}
+	return tm.slaveKTTY
+}
diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD
new file mode 100644
index 000000000..aa0c2ad8c
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/BUILD
@@ -0,0 +1,33 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "devtmpfs",
+    srcs = ["devtmpfs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+    ],
+)
+
+go_test(
+    name = "devtmpfs_test",
+    size = "small",
+    srcs = ["devtmpfs_test.go"],
+    library = ":devtmpfs",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
new file mode 100644
index 000000000..2ed5fa8a9
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -0,0 +1,219 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package devtmpfs provides an implementation of /dev based on tmpfs,
+// analogous to Linux's devtmpfs.
+package devtmpfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Name is the default filesystem name.
+const Name = "devtmpfs"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct {
+	initOnce sync.Once
+	initErr  error
+
+	// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
+	// root is fs' root. fs and root are immutable.
+	fs   *vfs.Filesystem
+	root *vfs.Dentry
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (*FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fst.initOnce.Do(func() {
+		fs, root, err := tmpfs.FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "" /* source */, vfs.GetFilesystemOptions{
+			Data: "mode=0755", // opts from drivers/base/devtmpfs.c:devtmpfs_init()
+		})
+		if err != nil {
+			fst.initErr = err
+			return
+		}
+		fst.fs = fs
+		fst.root = root
+	})
+	if fst.initErr != nil {
+		return nil, nil, fst.initErr
+	}
+	fst.fs.IncRef()
+	fst.root.IncRef()
+	return fst.fs, fst.root, nil
+}
+
+// Accessor allows devices to create device special files in devtmpfs.
+type Accessor struct {
+	vfsObj *vfs.VirtualFilesystem
+	mntns  *vfs.MountNamespace
+	root   vfs.VirtualDentry
+	creds  *auth.Credentials
+}
+
+// NewAccessor returns an Accessor that supports creation of device special
+// files in the devtmpfs instance registered with name fsTypeName in vfsObj.
+func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, err
+	}
+	return &Accessor{
+		vfsObj: vfsObj,
+		mntns:  mntns,
+		root:   mntns.Root(),
+		creds:  creds,
+	}, nil
+}
+
+// Release must be called when a is no longer in use.
+func (a *Accessor) Release(ctx context.Context) {
+	a.root.DecRef(ctx)
+	a.mntns.DecRef(ctx)
+}
+
+// accessorContext implements context.Context by extending an existing
+// context.Context with an Accessor's values for VFS-relevant state.
+type accessorContext struct {
+	context.Context
+	a *Accessor
+}
+
+func (a *Accessor) wrapContext(ctx context.Context) *accessorContext {
+	return &accessorContext{
+		Context: ctx,
+		a:       a,
+	}
+}
+
+// Value implements context.Context.Value.
+func (ac *accessorContext) Value(key interface{}) interface{} {
+	switch key {
+	case vfs.CtxMountNamespace:
+		ac.a.mntns.IncRef()
+		return ac.a.mntns
+	case vfs.CtxRoot:
+		ac.a.root.IncRef()
+		return ac.a.root
+	default:
+		return ac.Context.Value(key)
+	}
+}
+
+func (a *Accessor) pathOperationAt(pathname string) *vfs.PathOperation {
+	return &vfs.PathOperation{
+		Root:  a.root,
+		Start: a.root,
+		Path:  fspath.Parse(pathname),
+	}
+}
+
+// CreateDeviceFile creates a device special file at the given pathname in the
+// devtmpfs instance accessed by the Accessor.
+func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind vfs.DeviceKind, major, minor uint32, perms uint16) error {
+	actx := a.wrapContext(ctx)
+
+	mode := (linux.FileMode)(perms)
+	switch kind {
+	case vfs.BlockDevice:
+		mode |= linux.S_IFBLK
+	case vfs.CharDevice:
+		mode |= linux.S_IFCHR
+	default:
+		panic(fmt.Sprintf("invalid vfs.DeviceKind: %v", kind))
+	}
+
+	// Create any parent directories. See
+	// devtmpfs.c:handle_create()=>path_create().
+	for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
+		pop := a.pathOperationAt(it.String())
+		if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
+			Mode: 0755,
+		}); err != nil {
+			return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
+		}
+	}
+
+	// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
+	// create, which it recognizes by storing a pointer to the kdevtmpfs struct
+	// thread in struct inode::i_private. Accessor doesn't yet support deletion
+	// of files at all, and probably won't as long as we don't need to support
+	// kernel modules, so this is moot for now.
+	return a.vfsObj.MknodAt(actx, a.creds, a.pathOperationAt(pathname), &vfs.MknodOptions{
+		Mode:     mode,
+		DevMajor: major,
+		DevMinor: minor,
+	})
+}
+
+// UserspaceInit creates symbolic links and mount points in the devtmpfs
+// instance accessed by the Accessor that are created by userspace in Linux. It
+// does not create mounts.
+func (a *Accessor) UserspaceInit(ctx context.Context) error {
+	actx := a.wrapContext(ctx)
+
+	// Initialize symlinks.
+	for _, symlink := range []struct {
+		source string
+		target string
+	}{
+		// systemd: src/shared/dev-setup.c:dev_setup()
+		{source: "fd", target: "/proc/self/fd"},
+		{source: "stdin", target: "/proc/self/fd/0"},
+		{source: "stdout", target: "/proc/self/fd/1"},
+		{source: "stderr", target: "/proc/self/fd/2"},
+		// /proc/kcore is not implemented.
+
+		// Linux implements /dev/ptmx as a device node, but advises
+		// container implementations to create /dev/ptmx as a symlink
+		// to pts/ptmx (Documentation/filesystems/devpts.txt). Systemd
+		// follows this advice (src/nspawn/nspawn.c:setup_pts()), while
+		// LXC tries to create a bind mount and falls back to a symlink
+		// (src/lxc/conf.c:lxc_setup_devpts()).
+		{source: "ptmx", target: "pts/ptmx"},
+	} {
+		if err := a.vfsObj.SymlinkAt(actx, a.creds, a.pathOperationAt(symlink.source), symlink.target); err != nil {
+			return fmt.Errorf("failed to create symlink %q => %q: %v", symlink.source, symlink.target, err)
+		}
+	}
+
+	// systemd: src/core/mount-setup.c:mount_table
+	for _, dir := range []string{
+		"shm",
+		"pts",
+	} {
+		if err := a.vfsObj.MkdirAt(actx, a.creds, a.pathOperationAt(dir), &vfs.MkdirOptions{
+			// systemd: src/core/mount-setup.c:mount_one()
+			Mode: 0755,
+		}); err != nil {
+			return fmt.Errorf("failed to create directory %q: %v", dir, err)
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
new file mode 100644
index 000000000..747867cca
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -0,0 +1,122 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devtmpfs
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func TestDevtmpfs(t *testing.T) {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+	// Register tmpfs just so that we can have a root filesystem that isn't
+	// devtmpfs.
+	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	vfsObj.MustRegisterFilesystemType("devtmpfs", &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	// Create a test mount namespace with devtmpfs mounted at "/dev".
+	const devPath = "/dev"
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("failed to create tmpfs root mount: %v", err)
+	}
+	defer mntns.DecRef(ctx)
+	root := mntns.Root()
+	defer root.DecRef(ctx)
+	devpop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(devPath),
+	}
+	if err := vfsObj.MkdirAt(ctx, creds, &devpop, &vfs.MkdirOptions{
+		Mode: 0755,
+	}); err != nil {
+		t.Fatalf("failed to create mount point: %v", err)
+	}
+	if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
+		t.Fatalf("failed to mount devtmpfs: %v", err)
+	}
+
+	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
+	if err != nil {
+		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
+	}
+	defer a.Release(ctx)
+
+	// Create "userspace-initialized" files using a devtmpfs.Accessor.
+	if err := a.UserspaceInit(ctx); err != nil {
+		t.Fatalf("failed to userspace-initialize devtmpfs: %v", err)
+	}
+	// Created files should be visible in the test mount namespace.
+	abspath := devPath + "/fd"
+	target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(abspath),
+	})
+	if want := "/proc/self/fd"; err != nil || target != want {
+		t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want)
+	}
+
+	// Create a dummy device special file using a devtmpfs.Accessor.
+	const (
+		pathInDev = "dummy"
+		kind      = vfs.CharDevice
+		major     = 12
+		minor     = 34
+		perms     = 0600
+		wantMode  = linux.S_IFCHR | perms
+	)
+	if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil {
+		t.Fatalf("failed to create device file: %v", err)
+	}
+	// The device special file should be visible in the test mount namespace.
+	abspath = devPath + "/" + pathInDev
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(abspath),
+	}, &vfs.StatOptions{
+		Mask: linux.STATX_TYPE | linux.STATX_MODE,
+	})
+	if err != nil {
+		t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+	}
+	if stat.Mode != wantMode {
+		t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+	}
+	if stat.RdevMajor != major {
+		t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major)
+	}
+	if stat.RdevMinor != minor {
+		t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor)
+	}
+}
diff --git a/pkg/sentry/fsimpl/eventfd/BUILD b/pkg/sentry/fsimpl/eventfd/BUILD
new file mode 100644
index 000000000..ea167d38c
--- /dev/null
+++ b/pkg/sentry/fsimpl/eventfd/BUILD
@@ -0,0 +1,33 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "eventfd",
+    srcs = ["eventfd.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fdnotifier",
+        "//pkg/log",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
+
+go_test(
+    name = "eventfd_test",
+    size = "small",
+    srcs = ["eventfd_test.go"],
+    library = ":eventfd",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/vfs",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
new file mode 100644
index 000000000..812171fa3
--- /dev/null
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -0,0 +1,285 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package eventfd implements event fds.
+package eventfd
+
+import (
+	"math"
+	"sync"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// EventFileDescription implements FileDescriptionImpl for file-based event
+// notification (eventfd). Eventfds are usually internal to the Sentry but in
+// certain situations they may be converted into a host-backed eventfd.
+type EventFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// queue is used to notify interested parties when the event object
+	// becomes readable or writable.
+	queue waiter.Queue `state:"zerovalue"`
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// val is the current value of the event counter.
+	val uint64
+
+	// semMode specifies whether the event is in "semaphore" mode.
+	semMode bool
+
+	// hostfd indicates whether this eventfd is passed through to the host.
+	hostfd int
+}
+
+var _ vfs.FileDescriptionImpl = (*EventFileDescription)(nil)
+
+// New creates a new event fd.
+func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, initVal uint64, semMode bool, flags uint32) (*vfs.FileDescription, error) {
+	vd := vfsObj.NewAnonVirtualDentry("[eventfd]")
+	defer vd.DecRef(ctx)
+	efd := &EventFileDescription{
+		val:     initVal,
+		semMode: semMode,
+		hostfd:  -1,
+	}
+	if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &efd.vfsfd, nil
+}
+
+// HostFD returns the host eventfd associated with this event.
+func (efd *EventFileDescription) HostFD() (int, error) {
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		return efd.hostfd, nil
+	}
+
+	flags := linux.EFD_NONBLOCK
+	if efd.semMode {
+		flags |= linux.EFD_SEMAPHORE
+	}
+
+	fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0)
+	if errno != 0 {
+		return -1, errno
+	}
+
+	if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil {
+		if closeErr := syscall.Close(int(fd)); closeErr != nil {
+			log.Warningf("close(%d) eventfd failed: %v", fd, closeErr)
+		}
+		return -1, err
+	}
+
+	efd.hostfd = int(fd)
+	return efd.hostfd, nil
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (efd *EventFileDescription) Release(context.Context) {
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		fdnotifier.RemoveFD(int32(efd.hostfd))
+		if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil {
+			log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr)
+		}
+		efd.hostfd = -1
+	}
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	if dst.NumBytes() < 8 {
+		return 0, syscall.EINVAL
+	}
+	if err := efd.read(ctx, dst); err != nil {
+		return 0, err
+	}
+	return 8, nil
+}
+
+// Write implements FileDescriptionImpl.Write.
+func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	if src.NumBytes() < 8 {
+		return 0, syscall.EINVAL
+	}
+	if err := efd.write(ctx, src); err != nil {
+		return 0, err
+	}
+	return 8, nil
+}
+
+// Preconditions: Must be called with efd.mu locked.
+func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error {
+	var buf [8]byte
+	if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil {
+		if err == syscall.EWOULDBLOCK {
+			return syserror.ErrWouldBlock
+		}
+		return err
+	}
+	_, err := dst.CopyOut(ctx, buf[:])
+	return err
+}
+
+func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error {
+	efd.mu.Lock()
+	if efd.hostfd >= 0 {
+		defer efd.mu.Unlock()
+		return efd.hostReadLocked(ctx, dst)
+	}
+
+	// We can't complete the read if the value is currently zero.
+	if efd.val == 0 {
+		efd.mu.Unlock()
+		return syserror.ErrWouldBlock
+	}
+
+	// Update the value based on the mode the event is operating in.
+	var val uint64
+	if efd.semMode {
+		val = 1
+		// Consistent with Linux, this is done even if writing to memory fails.
+		efd.val--
+	} else {
+		val = efd.val
+		efd.val = 0
+	}
+
+	efd.mu.Unlock()
+
+	// Notify writers. We do this even if we were already writable because
+	// it is possible that a writer is waiting to write the maximum value
+	// to the event.
+	efd.queue.Notify(waiter.EventOut)
+
+	var buf [8]byte
+	usermem.ByteOrder.PutUint64(buf[:], val)
+	_, err := dst.CopyOut(ctx, buf[:])
+	return err
+}
+
+// Preconditions: Must be called with efd.mu locked.
+func (efd *EventFileDescription) hostWriteLocked(val uint64) error {
+	var buf [8]byte
+	usermem.ByteOrder.PutUint64(buf[:], val)
+	_, err := syscall.Write(efd.hostfd, buf[:])
+	if err == syscall.EWOULDBLOCK {
+		return syserror.ErrWouldBlock
+	}
+	return err
+}
+
+func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error {
+	var buf [8]byte
+	if _, err := src.CopyIn(ctx, buf[:]); err != nil {
+		return err
+	}
+	val := usermem.ByteOrder.Uint64(buf[:])
+
+	return efd.Signal(val)
+}
+
+// Signal is an internal function to signal the event fd.
+func (efd *EventFileDescription) Signal(val uint64) error {
+	if val == math.MaxUint64 {
+		return syscall.EINVAL
+	}
+
+	efd.mu.Lock()
+
+	if efd.hostfd >= 0 {
+		defer efd.mu.Unlock()
+		return efd.hostWriteLocked(val)
+	}
+
+	// We only allow writes that won't cause the value to go over the max
+	// uint64 minus 1.
+	if val > math.MaxUint64-1-efd.val {
+		efd.mu.Unlock()
+		return syserror.ErrWouldBlock
+	}
+
+	efd.val += val
+	efd.mu.Unlock()
+
+	// Always trigger a notification.
+	efd.queue.Notify(waiter.EventIn)
+
+	return nil
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+
+	if efd.hostfd >= 0 {
+		return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask)
+	}
+
+	ready := waiter.EventMask(0)
+	if efd.val > 0 {
+		ready |= waiter.EventIn
+	}
+
+	if efd.val < math.MaxUint64-1 {
+		ready |= waiter.EventOut
+	}
+
+	return mask & ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) {
+	efd.queue.EventRegister(entry, mask)
+
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		fdnotifier.UpdateFD(int32(efd.hostfd))
+	}
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) {
+	efd.queue.EventUnregister(entry)
+
+	efd.mu.Lock()
+	defer efd.mu.Unlock()
+	if efd.hostfd >= 0 {
+		fdnotifier.UpdateFD(int32(efd.hostfd))
+	}
+}
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd_test.go b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
new file mode 100644
index 000000000..49916fa81
--- /dev/null
+++ b/pkg/sentry/fsimpl/eventfd/eventfd_test.go
@@ -0,0 +1,97 @@
+// Copyright 2018 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package eventfd
+
+import (
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func TestEventFD(t *testing.T) {
+	initVals := []uint64{
+		0,
+		// Using a non-zero initial value verifies that writing to an
+		// eventfd signals when the eventfd's counter was already
+		// non-zero.
+		343,
+	}
+
+	for _, initVal := range initVals {
+		ctx := contexttest.Context(t)
+		vfsObj := &vfs.VirtualFilesystem{}
+		if err := vfsObj.Init(ctx); err != nil {
+			t.Fatalf("VFS init: %v", err)
+		}
+
+		// Make a new eventfd that is writable.
+		eventfd, err := New(ctx, vfsObj, initVal, false, linux.O_RDWR)
+		if err != nil {
+			t.Fatalf("New() failed: %v", err)
+		}
+		defer eventfd.DecRef(ctx)
+
+		// Register a callback for a write event.
+		w, ch := waiter.NewChannelEntry(nil)
+		eventfd.EventRegister(&w, waiter.EventIn)
+		defer eventfd.EventUnregister(&w)
+
+		data := []byte("00000124")
+		// Create and submit a write request.
+		n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+		if err != nil {
+			t.Fatal(err)
+		}
+		if n != 8 {
+			t.Errorf("eventfd.write wrote %d bytes, not full int64", n)
+		}
+
+		// Check if the callback fired due to the write event.
+		select {
+		case <-ch:
+		default:
+			t.Errorf("Didn't get notified of EventIn after write")
+		}
+	}
+}
+
+func TestEventFDStat(t *testing.T) {
+	ctx := contexttest.Context(t)
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+
+	// Make a new eventfd that is writable.
+	eventfd, err := New(ctx, vfsObj, 0, false, linux.O_RDWR)
+	if err != nil {
+		t.Fatalf("New() failed: %v", err)
+	}
+	defer eventfd.DecRef(ctx)
+
+	statx, err := eventfd.Stat(ctx, vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS,
+	})
+	if err != nil {
+		t.Fatalf("eventfd.Stat failed: %v", err)
+	}
+	if statx.Size != 0 {
+		t.Errorf("eventfd size should be 0")
+	}
+}
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 7ccff8b0d..abc610ef3 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
@@ -16,6 +15,17 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "ext",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
 go_library(
     name = "ext",
     srcs = [
@@ -27,29 +37,33 @@ go_library(
         "extent_file.go",
         "file_description.go",
         "filesystem.go",
+        "fstree.go",
         "inode.go",
         "regular_file.go",
         "symlink.go",
         "utils.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/context",
         "//pkg/fd",
+        "//pkg/fspath",
         "//pkg/log",
+        "//pkg/safemem",
         "//pkg/sentry/arch",
-        "//pkg/sentry/context",
         "//pkg/sentry/fs",
+        "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
-        "//pkg/sentry/safemem",
+        "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/syscalls/linux",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
         "//pkg/waiter",
     ],
 )
@@ -69,19 +83,20 @@ go_test(
         "//pkg/sentry/fsimpl/ext:assets/tiny.ext3",
         "//pkg/sentry/fsimpl/ext:assets/tiny.ext4",
     ],
-    embed = [":ext"],
+    library = ":ext",
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
-        "//runsc/testutil",
-        "@com_github_google_go-cmp//cmp:go_default_library",
-        "@com_github_google_go-cmp//cmp/cmpopts:go_default_library",
+        "//pkg/test/testutil",
+        "//pkg/usermem",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+        "@com_github_google_go_cmp//cmp/cmpopts:go_default_library",
     ],
 )
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index bfc46dfa6..6c5a559fd 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -1,4 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_test")
 
 package(licenses = ["notice"])
 
@@ -7,8 +7,9 @@ go_test(
     size = "small",
     srcs = ["benchmark_test.go"],
     deps = [
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/ext",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/vfs",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 10a8083a0..8f7d5a9bb 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -15,6 +15,9 @@
 // These benchmarks emulate memfs benchmarks. Ext4 images must be created
 // before this benchmark is run using the `make_deep_ext4.sh` script at
 // /tmp/image-{depth}.ext4 for all the depths tested below.
+//
+// The benchmark itself cannot run the script because the script requires
+// sudo privileges to create the file system images.
 package benchmark_test
 
 import (
@@ -24,8 +27,9 @@ import (
 	"strings"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -48,9 +52,14 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		return nil, nil, nil, nil, err
+	}
+	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
@@ -59,7 +68,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	root := mntns.Root()
 
 	tearDown := func() {
-		root.DecRef()
+		root.DecRef(ctx)
 
 		if err := f.Close(); err != nil {
 			b.Fatalf("tearDown failed: %v", err)
@@ -81,7 +90,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+	if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	}); err != nil {
 		b.Fatalf("failed to mount tmpfs submount: %v", err)
 	}
 	return func() {
@@ -117,7 +130,7 @@ func BenchmarkVFS2Ext4fsStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -146,9 +159,9 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 			mountPointName := "/1/"
 			pop := vfs.PathOperation{
-				Root:     *root,
-				Start:    *root,
-				Pathname: mountPointName,
+				Root:  *root,
+				Start: *root,
+				Path:  fspath.Parse(mountPointName),
 			}
 
 			// Save the mount point for later use.
@@ -156,7 +169,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to walk to mount point: %v", err)
 			}
-			defer mountPoint.DecRef()
+			defer mountPoint.DecRef(ctx)
 
 			// Create extfs submount.
 			mountTearDown := mount(b, fmt.Sprintf("/tmp/image-%d.ext4", depth), vfsfs, &pop)
@@ -177,7 +190,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index cea89bcd9..8bb104ff0 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -58,15 +58,16 @@ var _ io.ReaderAt = (*blockMapFile)(nil)
 
 // newBlockMapFile is the blockMapFile constructor. It initializes the file to
 // physical blocks map with (at most) the first 12 (direct) blocks.
-func newBlockMapFile(regFile regularFile) (*blockMapFile, error) {
-	file := &blockMapFile{regFile: regFile}
+func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
+	file := &blockMapFile{}
 	file.regFile.impl = file
+	file.regFile.inode.init(args, &file.regFile)
 
 	for i := uint(0); i < 4; i++ {
-		file.coverage[i] = getCoverage(regFile.inode.blkSize, i)
+		file.coverage[i] = getCoverage(file.regFile.inode.blkSize, i)
 	}
 
-	blkMap := regFile.inode.diskInode.Data()
+	blkMap := file.regFile.inode.diskInode.Data()
 	binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
 	binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
 	binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
@@ -154,7 +155,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 			toRead = len(dst)
 		}
 
-		n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
+		n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
 		if n < toRead {
 			return n, syserror.EIO
 		}
@@ -174,7 +175,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 	curChildOff := relFileOff % childCov
 	for i := startIdx; i < endIdx; i++ {
 		var childPhyBlk uint32
-		err := readFromDisk(f.regFile.inode.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
+		err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
 		if err != nil {
 			return read, err
 		}
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 213aa3919..6fa84e7aa 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -85,18 +85,6 @@ func (n *blkNumGen) next() uint32 {
 // the inode covers and that is written to disk.
 func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	mockDisk := make([]byte, mockBMDiskSize)
-	regFile := regularFile{
-		inode: inode{
-			diskInode: &disklayout.InodeNew{
-				InodeOld: disklayout.InodeOld{
-					SizeLo: getMockBMFileFize(),
-				},
-			},
-			dev:     bytes.NewReader(mockDisk),
-			blkSize: uint64(mockBMBlkSize),
-		},
-	}
-
 	var fileData []byte
 	blkNums := newBlkNumGen()
 	var data []byte
@@ -123,9 +111,20 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
 	fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
 
-	copy(regFile.inode.diskInode.Data(), data)
+	args := inodeArgs{
+		fs: &filesystem{
+			dev: bytes.NewReader(mockDisk),
+		},
+		diskInode: &disklayout.InodeNew{
+			InodeOld: disklayout.InodeOld{
+				SizeLo: getMockBMFileFize(),
+			},
+		},
+		blkSize: uint64(mockBMBlkSize),
+	}
+	copy(args.diskInode.Data(), data)
 
-	mockFile, err := newBlockMapFile(regFile)
+	mockFile, err := newBlockMapFile(args)
 	if err != nil {
 		t.Fatalf("newBlockMapFile failed: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 054fb42b6..7a1b4219f 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -15,6 +15,7 @@
 package ext
 
 import (
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
@@ -22,6 +23,10 @@ import (
 type dentry struct {
 	vfsd vfs.Dentry
 
+	// Protected by filesystem.mu.
+	parent *dentry
+	name   string
+
 	// inode is the inode represented by this dentry. Multiple Dentries may
 	// share a single non-directory Inode (with hard links). inode is
 	// immutable.
@@ -41,16 +46,35 @@ func newDentry(in *inode) *dentry {
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef() {
 	d.inode.incRef()
 }
 
 // TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef() bool {
 	return d.inode.tryIncRef()
 }
 
 // DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
-	d.inode.decRef(vfsfs.Impl().(*filesystem))
+func (d *dentry) DecRef(ctx context.Context) {
+	// FIXME(b/134676337): filesystem.mu may not be locked as required by
+	// inode.decRef().
+	d.inode.decRef()
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) Watches() *vfs.Watches {
+	return nil
 }
+
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+//
+// TODO(b/134676337): Implement inotify.
+func (d *dentry) OnZeroWatches(context.Context) {}
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 91802dc1e..0fc01668d 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -15,16 +15,15 @@
 package ext
 
 import (
-	"sync"
-
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
-	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -32,6 +31,10 @@ import (
 type directory struct {
 	inode inode
 
+	// childCache maps filenames to dentries for children for which dentries
+	// have been instantiated. childCache is protected by filesystem.mu.
+	childCache map[string]*dentry
+
 	// mu serializes the changes to childList.
 	// Lock Order (outermost locks must be taken first):
 	//   directory.mu
@@ -51,13 +54,16 @@ type directory struct {
 	childMap map[string]*dirent
 }
 
-// newDirectroy is the directory constructor.
-func newDirectroy(inode inode, newDirent bool) (*directory, error) {
-	file := &directory{inode: inode, childMap: make(map[string]*dirent)}
-	file.inode.impl = file
+// newDirectory is the directory constructor.
+func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
+	file := &directory{
+		childCache: make(map[string]*dentry),
+		childMap:   make(map[string]*dirent),
+	}
+	file.inode.init(args, file)
 
 	// Initialize childList by reading dirents from the underlying file.
-	if inode.diskInode.Flags().Index {
+	if args.diskInode.Flags().Index {
 		// TODO(b/134676337): Support hash tree directories. Currently only the '.'
 		// and '..' entries are read in.
 
@@ -68,7 +74,7 @@ func newDirectroy(inode inode, newDirent bool) (*directory, error) {
 
 	// The dirents are organized in a linear array in the file data.
 	// Extract the file data and decode the dirents.
-	regFile, err := newRegularFile(inode)
+	regFile, err := newRegularFile(args)
 	if err != nil {
 		return nil, err
 	}
@@ -76,7 +82,7 @@ func newDirectroy(inode inode, newDirent bool) (*directory, error) {
 	// buf is used as scratch space for reading in dirents from disk and
 	// unmarshalling them into dirent structs.
 	buf := make([]byte, disklayout.DirentSize)
-	size := inode.diskInode.Size()
+	size := args.diskInode.Size()
 	for off, inc := uint64(0), uint64(0); off < size; off += inc {
 		toRead := size - off
 		if toRead > disklayout.DirentSize {
@@ -136,7 +142,7 @@ type directoryFD struct {
 var _ vfs.FileDescriptionImpl = (*directoryFD)(nil)
 
 // Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(ctx context.Context) {
 	if fd.iter == nil {
 		return
 	}
@@ -189,14 +195,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 				childType = fs.ToInodeType(childInode.diskInode.Mode().FileType())
 			}
 
-			if !cb.Handle(vfs.Dirent{
+			if err := cb.Handle(vfs.Dirent{
 				Name:    child.diskDirent.FileName(),
 				Type:    fs.ToDirentType(childType),
 				Ino:     uint64(child.diskDirent.Inode()),
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
@@ -301,8 +307,12 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 	return offset, nil
 }
 
-// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
-func (fd *directoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
-	// mmap(2) specifies that EACCESS should be returned for non-regular file fds.
-	return syserror.EACCES
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *directoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *directoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index fcfaf5c3e..9bd9c76c0 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -1,5 +1,4 @@
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_stateify:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 package(licenses = ["notice"])
 
@@ -23,7 +22,6 @@ go_library(
         "superblock_old.go",
         "test_utils.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout",
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -44,6 +42,6 @@ go_test(
         "inode_test.go",
         "superblock_test.go",
     ],
-    embed = [":disklayout"],
+    library = ":disklayout",
     deps = ["//pkg/sentry/kernel/time"],
 )
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 567523d32..4110649ab 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -29,8 +29,12 @@ package disklayout
 //       byte (i * sb.BlockSize()) to ((i+1) * sb.BlockSize()).
 
 const (
-	// ExtentStructsSize is the size of all the three extent on-disk structs.
-	ExtentStructsSize = 12
+	// ExtentHeaderSize is the size of the header of an extent tree node.
+	ExtentHeaderSize = 12
+
+	// ExtentEntrySize is the size of an entry in an extent tree node.
+	// This size is the same for both leaf and internal nodes.
+	ExtentEntrySize = 12
 
 	// ExtentMagic is the magic number which must be present in the header.
 	ExtentMagic = 0xf30a
@@ -57,7 +61,7 @@ type ExtentNode struct {
 	Entries []ExtentEntryPair
 }
 
-// ExtentEntry reprsents an extent tree node entry. The entry can either be
+// ExtentEntry represents an extent tree node entry. The entry can either be
 // an ExtentIdx or Extent itself. This exists to simplify navigation logic.
 type ExtentEntry interface {
 	// FileBlock returns the first file block number covered by this entry.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index b0fad9b71..8762b90db 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,7 @@ import (
 // TestExtentSize tests that the extent structs are of the correct
 // size.
 func TestExtentSize(t *testing.T) {
-	assertSize(t, ExtentHeader{}, ExtentStructsSize)
-	assertSize(t, ExtentIdx{}, ExtentStructsSize)
-	assertSize(t, Extent{}, ExtentStructsSize)
+	assertSize(t, ExtentHeader{}, ExtentHeaderSize)
+	assertSize(t, ExtentIdx{}, ExtentEntrySize)
+	assertSize(t, Extent{}, ExtentEntrySize)
 }
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index f10accafc..08ffc2834 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -21,15 +21,18 @@ import (
 	"io"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fd"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// Name is the name of this filesystem.
+const Name = "ext"
+
 // FilesystemType implements vfs.FilesystemType.
 type FilesystemType struct{}
 
@@ -40,14 +43,14 @@ var _ vfs.FilesystemType = (*FilesystemType)(nil)
 // Currently there are two ways of mounting an ext(2/3/4) fs:
 //   1. Specify a mount with our internal special MountType in the OCI spec.
 //   2. Expose the device to the container and mount it from application layer.
-func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
+func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) {
 	if opts.InternalData == nil {
 		// User mount call.
 		// TODO(b/134676337): Open the device specified by `source` and return that.
 		panic("unimplemented")
 	}
 
-	// NewFilesystem call originated from within the sentry.
+	// GetFilesystem call originated from within the sentry.
 	devFd, ok := opts.InternalData.(int)
 	if !ok {
 		return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
@@ -91,42 +94,61 @@ func isCompatible(sb disklayout.SuperBlock) bool {
 	return true
 }
 
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
 	// EACCESS should be returned according to mount(2). Filesystem independent
 	// flags (like readonly) are currently not available in pkg/sentry/vfs.
 
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
 	dev, err := getDeviceFd(source, opts)
 	if err != nil {
 		return nil, nil, err
 	}
 
-	fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
-	fs.vfsfs.Init(&fs)
+	fs := filesystem{
+		dev:        dev,
+		inodeCache: make(map[uint32]*inode),
+		devMinor:   devMinor,
+	}
+	fs.vfsfs.Init(vfsObj, &fsType, &fs)
 	fs.sb, err = readSuperBlock(dev)
 	if err != nil {
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, err
 	}
 
 	if fs.sb.Magic() != linux.EXT_SUPER_MAGIC {
 		// mount(2) specifies that EINVAL should be returned if the superblock is
 		// invalid.
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, syserror.EINVAL
 	}
 
 	// Refuse to mount if the filesystem is incompatible.
 	if !isCompatible(fs.sb) {
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, syserror.EINVAL
 	}
 
 	fs.bgs, err = readBlockGroups(dev, fs.sb)
 	if err != nil {
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, err
 	}
 
 	rootInode, err := fs.getOrCreateInodeLocked(disklayout.RootDirInode)
 	if err != nil {
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, err
 	}
 	rootInode.incRef()
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 1aa2bd6a4..2dbaee287 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -25,15 +25,15 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
-
-	"gvisor.dev/gvisor/runsc/testutil"
+	"gvisor.dev/gvisor/pkg/test/testutil"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const (
@@ -64,9 +64,14 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
@@ -75,7 +80,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	root := mntns.Root()
 
 	tearDown := func() {
-		root.DecRef()
+		root.DecRef(ctx)
 
 		if err := f.Close(); err != nil {
 			t.Fatalf("tearDown failed: %v", err)
@@ -140,62 +145,61 @@ func TestSeek(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
 				t.Fatalf("vfsfs.OpenAt failed: %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+			if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
 				t.Errorf("expected seek position 0, got %d and error %v", n, err)
 			}
 
-			stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+			stat, err := fd.Stat(ctx, vfs.StatOptions{})
 			if err != nil {
 				t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
 			}
 
 			// We should be able to seek beyond the end of file.
 			size := int64(stat.Size)
-			if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+			if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+			if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
 			}
 
 			// Make sure negative offsets work with SEEK_CUR.
-			if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+			if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
 			// Make sure SEEK_END works with regular files.
-			switch fd.Impl().(type) {
-			case *regularFileFD:
+			if _, ok := fd.Impl().(*regularFileFD); ok {
 				// Seek back to 0.
-				if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+				if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
 				}
 
 				// Seek forward beyond EOF.
-				if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+				if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 				}
 
 				// EINVAL should be returned if the resulting offset is negative.
-				if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+				if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
 					t.Errorf("expected error EINVAL but got %v", err)
 				}
 			}
@@ -360,7 +364,7 @@ func TestStatAt(t *testing.T) {
 
 			got, err := vfsfs.StatAt(ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.StatOptions{},
 			)
 			if err != nil {
@@ -430,7 +434,7 @@ func TestRead(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -456,7 +460,7 @@ func TestRead(t *testing.T) {
 			want := make([]byte, 1)
 			for {
 				n, err := f.Read(want)
-				fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+				fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
 
 				if diff := cmp.Diff(got, want); diff != "" {
 					t.Errorf("file data mismatch (-want +got):\n%s", diff)
@@ -464,7 +468,7 @@ func TestRead(t *testing.T) {
 
 				// Make sure there is no more file data left after getting EOF.
 				if n == 0 || err == io.EOF {
-					if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+					if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
 						t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
 					}
 
@@ -494,9 +498,9 @@ func newIterDirentCb() *iterDirentsCb {
 }
 
 // Handle implements vfs.IterDirentsCallback.Handle.
-func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) bool {
+func (cb *iterDirentsCb) Handle(dirent vfs.Dirent) error {
 	cb.dirents = append(cb.dirents, dirent)
-	return true
+	return nil
 }
 
 // TestIterDirents tests the FileDescriptionImpl.IterDirents functionality.
@@ -509,27 +513,27 @@ func TestIterDirents(t *testing.T) {
 	}
 
 	wantDirents := []vfs.Dirent{
-		vfs.Dirent{
+		{
 			Name: ".",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "..",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "lost+found",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "file.txt",
 			Type: linux.DT_REG,
 		},
-		vfs.Dirent{
+		{
 			Name: "bigfile.txt",
 			Type: linux.DT_REG,
 		},
-		vfs.Dirent{
+		{
 			Name: "symlink.txt",
 			Type: linux.DT_LNK,
 		},
@@ -566,7 +570,7 @@ func TestIterDirents(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -574,7 +578,7 @@ func TestIterDirents(t *testing.T) {
 			}
 
 			cb := &iterDirentsCb{}
-			if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+			if err = fd.IterDirents(ctx, cb); err != nil {
 				t.Fatalf("dir fd.IterDirents() failed: %v", err)
 			}
 
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 38b68a2d3..c36225a7c 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -38,9 +38,10 @@ var _ io.ReaderAt = (*extentFile)(nil)
 // newExtentFile is the extent file constructor. It reads the entire extent
 // tree into memory.
 // TODO(b/134676337): Build extent tree on demand to reduce memory usage.
-func newExtentFile(regFile regularFile) (*extentFile, error) {
-	file := &extentFile{regFile: regFile}
+func newExtentFile(args inodeArgs) (*extentFile, error) {
+	file := &extentFile{}
 	file.regFile.impl = file
+	file.regFile.inode.init(args, &file.regFile)
 	err := file.buildExtTree()
 	if err != nil {
 		return nil, err
@@ -57,7 +58,7 @@ func newExtentFile(regFile regularFile) (*extentFile, error) {
 func (f *extentFile) buildExtTree() error {
 	rootNodeData := f.regFile.inode.diskInode.Data()
 
-	binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
+	binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
 
 	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
 	if f.root.Header.NumEntries > 4 {
@@ -67,7 +68,7 @@ func (f *extentFile) buildExtTree() error {
 	}
 
 	f.root.Entries = make([]disklayout.ExtentEntryPair, f.root.Header.NumEntries)
-	for i, off := uint16(0), disklayout.ExtentStructsSize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+	for i, off := uint16(0), disklayout.ExtentEntrySize; i < f.root.Header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
 		var curEntry disklayout.ExtentEntry
 		if f.root.Header.Height == 0 {
 			// Leaf node.
@@ -76,7 +77,7 @@ func (f *extentFile) buildExtTree() error {
 			// Internal node.
 			curEntry = &disklayout.ExtentIdx{}
 		}
-		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentStructsSize], binary.LittleEndian, curEntry)
+		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
 		f.root.Entries[i].Entry = curEntry
 	}
 
@@ -99,13 +100,13 @@ func (f *extentFile) buildExtTree() error {
 func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) {
 	var header disklayout.ExtentHeader
 	off := entry.PhysicalBlock() * f.regFile.inode.blkSize
-	err := readFromDisk(f.regFile.inode.dev, int64(off), &header)
+	err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header)
 	if err != nil {
 		return nil, err
 	}
 
 	entries := make([]disklayout.ExtentEntryPair, header.NumEntries)
-	for i, off := uint16(0), off+disklayout.ExtentStructsSize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentStructsSize {
+	for i, off := uint16(0), off+disklayout.ExtentEntrySize; i < header.NumEntries; i, off = i+1, off+disklayout.ExtentEntrySize {
 		var curEntry disklayout.ExtentEntry
 		if header.Height == 0 {
 			// Leaf node.
@@ -115,7 +116,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla
 			curEntry = &disklayout.ExtentIdx{}
 		}
 
-		err := readFromDisk(f.regFile.inode.dev, int64(off), curEntry)
+		err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry)
 		if err != nil {
 			return nil, err
 		}
@@ -229,7 +230,7 @@ func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byt
 		toRead = len(dst)
 	}
 
-	n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], int64(readStart))
+	n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart))
 	if n < toRead {
 		return n, syserror.EIO
 	}
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index 42d0a484b..cd10d46ee 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -177,19 +177,19 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
 	t.Helper()
 
 	mockDisk := make([]byte, mockExtentBlkSize*10)
-	mockExtentFile := &extentFile{
-		regFile: regularFile{
-			inode: inode{
-				diskInode: &disklayout.InodeNew{
-					InodeOld: disklayout.InodeOld{
-						SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
-					},
-				},
-				blkSize: mockExtentBlkSize,
-				dev:     bytes.NewReader(mockDisk),
+	mockExtentFile := &extentFile{}
+	args := inodeArgs{
+		fs: &filesystem{
+			dev: bytes.NewReader(mockDisk),
+		},
+		diskInode: &disklayout.InodeNew{
+			InodeOld: disklayout.InodeOld{
+				SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
 			},
 		},
+		blkSize: mockExtentBlkSize,
 	}
+	mockExtentFile.regFile.inode.init(args, &mockExtentFile.regFile)
 
 	fileData := writeTree(&mockExtentFile.regFile.inode, mockDisk, node0, mockExtentBlkSize)
 
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 4d18b28cb..90b086468 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -16,7 +16,7 @@ package ext
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
@@ -26,33 +26,15 @@ import (
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
-
-	// flags is the same as vfs.OpenOptions.Flags which are passed to
-	// vfs.FilesystemImpl.OpenAt.
-	// TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
-	// fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
-	// Only close(2), fstat(2), fstatfs(2) should work.
-	flags uint32
+	vfs.LockFD
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
-}
-
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
+	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
 // Stat implements vfs.FileDescriptionImpl.Stat.
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 2d15e8aaf..c714ddf73 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -17,12 +17,15 @@ package ext
 import (
 	"errors"
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -61,6 +64,10 @@ type filesystem struct {
 	// bgs represents all the block group descriptors for the filesystem.
 	// Immutable after initialization.
 	bgs []disklayout.BlockGroup
+
+	// devMinor is this filesystem's device minor number. Immutable after
+	// initialization.
+	devMinor uint32
 }
 
 // Compiles only if filesystem implements vfs.FilesystemImpl.
@@ -77,7 +84,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
 //     - filesystem.mu must be locked (for writing if write param is true).
 //     - !rp.Done().
 //     - inode == vfsd.Impl().(*Dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
+func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
 	if !inode.isDir() {
 		return nil, nil, syserror.ENOTDIR
 	}
@@ -86,14 +93,33 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
 	}
 
 	for {
-		nextVFSD, err := rp.ResolveComponent(vfsd)
-		if err != nil {
-			return nil, nil, err
+		name := rp.Component()
+		if name == "." {
+			rp.Advance()
+			return vfsd, inode, nil
 		}
-		if nextVFSD == nil {
-			// Since the Dentry tree is not the sole source of truth for extfs, if it's
-			// not in the Dentry tree, it might need to be pulled from disk.
-			childDirent, ok := inode.impl.(*directory).childMap[rp.Component()]
+		d := vfsd.Impl().(*dentry)
+		if name == ".." {
+			isRoot, err := rp.CheckRoot(ctx, vfsd)
+			if err != nil {
+				return nil, nil, err
+			}
+			if isRoot || d.parent == nil {
+				rp.Advance()
+				return vfsd, inode, nil
+			}
+			if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+				return nil, nil, err
+			}
+			rp.Advance()
+			return &d.parent.vfsd, d.parent.inode, nil
+		}
+
+		dir := inode.impl.(*directory)
+		child, ok := dir.childCache[name]
+		if !ok {
+			// We may need to instantiate a new dentry for this child.
+			childDirent, ok := dir.childMap[name]
 			if !ok {
 				// The underlying inode does not exist on disk.
 				return nil, nil, syserror.ENOENT
@@ -112,21 +138,22 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
 			}
 			// incRef because this is being added to the dentry tree.
 			childInode.incRef()
-			child := newDentry(childInode)
-			vfsd.InsertChild(&child.vfsd, rp.Component())
-
-			// Continue as usual now that nextVFSD is not nil.
-			nextVFSD = &child.vfsd
+			child = newDentry(childInode)
+			child.parent = d
+			child.name = name
+			dir.childCache[name] = child
 		}
-		nextInode := nextVFSD.Impl().(*dentry).inode
-		if nextInode.isSymlink() && rp.ShouldFollowSymlink() {
-			if err := rp.HandleSymlink(inode.impl.(*symlink).target); err != nil {
+		if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+			return nil, nil, err
+		}
+		if child.inode.isSymlink() && rp.ShouldFollowSymlink() {
+			if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil {
 				return nil, nil, err
 			}
 			continue
 		}
 		rp.Advance()
-		return nextVFSD, nextInode, nil
+		return &child.vfsd, child.inode, nil
 	}
 }
 
@@ -140,12 +167,12 @@ func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write boo
 //
 // Preconditions:
 //     - filesystem.mu must be locked (for writing if write param is true).
-func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
 	for !rp.Done() {
 		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+		vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -169,12 +196,12 @@ func walkLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error)
 // Preconditions:
 //     - filesystem.mu must be locked (for writing if write param is true).
 //     - !rp.Done().
-func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
+func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
 	for !rp.Final() {
 		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode, write)
+		vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
 		if err != nil {
 			return nil, nil, err
 		}
@@ -189,7 +216,7 @@ func walkParentLocked(rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, e
 // the rp till the parent of the last component which should be an existing
 // directory. If parent is false then resolves rp entirely. Attemps to resolve
 // the path as far as it can with a read lock and upgrades the lock if needed.
-func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
+func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
 	var (
 		vfsd  *vfs.Dentry
 		inode *inode
@@ -200,9 +227,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in
 	// of disk. This reduces congestion (allows concurrent walks).
 	fs.mu.RLock()
 	if parent {
-		vfsd, inode, err = walkParentLocked(rp, false)
+		vfsd, inode, err = walkParentLocked(ctx, rp, false)
 	} else {
-		vfsd, inode, err = walkLocked(rp, false)
+		vfsd, inode, err = walkLocked(ctx, rp, false)
 	}
 	fs.mu.RUnlock()
 
@@ -211,9 +238,9 @@ func (fs *filesystem) walk(rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *in
 		// walk is fine as this is a read only filesystem.
 		fs.mu.Lock()
 		if parent {
-			vfsd, inode, err = walkParentLocked(rp, true)
+			vfsd, inode, err = walkParentLocked(ctx, rp, true)
 		} else {
-			vfsd, inode, err = walkLocked(rp, true)
+			vfsd, inode, err = walkLocked(ctx, rp, true)
 		}
 		fs.mu.Unlock()
 	}
@@ -254,9 +281,18 @@ func (fs *filesystem) statTo(stat *linux.Statfs) {
 	// TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
 }
 
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	_, inode, err := fs.walk(ctx, rp, false)
+	if err != nil {
+		return err
+	}
+	return inode.checkPermissions(rp.Credentials(), ats)
+}
+
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
-	vfsd, inode, err := fs.walk(rp, false)
+	vfsd, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return nil, err
 	}
@@ -274,9 +310,19 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	return vfsd, nil
 }
 
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	vfsd, inode, err := fs.walk(ctx, rp, true)
+	if err != nil {
+		return nil, err
+	}
+	inode.incRef()
+	return vfsd, nil
+}
+
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	vfsd, inode, err := fs.walk(rp, false)
+	vfsd, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return nil, err
 	}
@@ -285,12 +331,12 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
 		return nil, syserror.EROFS
 	}
-	return inode.open(rp, vfsd, opts.Flags)
+	return inode.open(rp, vfsd, &opts)
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
 func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
-	_, inode, err := fs.walk(rp, false)
+	_, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return "", err
 	}
@@ -303,7 +349,7 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
-	_, inode, err := fs.walk(rp, false)
+	_, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return linux.Statx{}, err
 	}
@@ -314,7 +360,7 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
 func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
-	if _, _, err := fs.walk(rp, false); err != nil {
+	if _, _, err := fs.walk(ctx, rp, false); err != nil {
 		return linux.Statfs{}, err
 	}
 
@@ -324,7 +370,9 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 }
 
 // Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {}
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+}
 
 // Sync implements vfs.FilesystemImpl.Sync.
 func (fs *filesystem) Sync(ctx context.Context) error {
@@ -342,7 +390,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return syserror.EEXIST
 	}
 
-	if _, _, err := fs.walk(rp, true); err != nil {
+	if _, _, err := fs.walk(ctx, rp, true); err != nil {
 		return err
 	}
 
@@ -355,7 +403,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return syserror.EEXIST
 	}
 
-	if _, _, err := fs.walk(rp, true); err != nil {
+	if _, _, err := fs.walk(ctx, rp, true); err != nil {
 		return err
 	}
 
@@ -368,7 +416,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return syserror.EEXIST
 	}
 
-	_, _, err := fs.walk(rp, true)
+	_, _, err := fs.walk(ctx, rp, true)
 	if err != nil {
 		return err
 	}
@@ -377,12 +425,12 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
 	if rp.Done() {
 		return syserror.ENOENT
 	}
 
-	_, _, err := fs.walk(rp, false)
+	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
 	}
@@ -392,7 +440,7 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vf
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
 func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	_, inode, err := fs.walk(rp, false)
+	_, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
 	}
@@ -406,7 +454,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
-	_, _, err := fs.walk(rp, false)
+	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
 	}
@@ -420,7 +468,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		return syserror.EEXIST
 	}
 
-	_, _, err := fs.walk(rp, true)
+	_, _, err := fs.walk(ctx, rp, true)
 	if err != nil {
 		return err
 	}
@@ -430,7 +478,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	_, inode, err := fs.walk(rp, false)
+	_, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
 	}
@@ -441,3 +489,60 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 
 	return syserror.EROFS
 }
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	_, inode, err := fs.walk(ctx, rp, false)
+	if err != nil {
+		return nil, err
+	}
+	if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+
+	// TODO(b/134676337): Support sockets.
+	return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	_, _, err := fs.walk(ctx, rp, false)
+	if err != nil {
+		return nil, err
+	}
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	_, _, err := fs.walk(ctx, rp, false)
+	if err != nil {
+		return "", err
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	_, _, err := fs.walk(ctx, rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	_, _, err := fs.walk(ctx, rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+}
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index e6c847a71..30636cf66 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -16,7 +16,6 @@ package ext
 
 import (
 	"fmt"
-	"io"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -42,19 +41,21 @@ type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory operations.
 	refs int64
 
+	// fs is the containing filesystem.
+	fs *filesystem
+
 	// inodeNum is the inode number of this inode on disk. This is used to
 	// identify inodes within the ext filesystem.
 	inodeNum uint32
 
-	// dev represents the underlying device. Same as filesystem.dev.
-	dev io.ReaderAt
-
 	// blkSize is the fs data block size. Same as filesystem.sb.BlockSize().
 	blkSize uint64
 
 	// diskInode gives us access to the inode struct on disk. Immutable.
 	diskInode disklayout.Inode
 
+	locks vfs.FileLocks
+
 	// This is immutable. The first field of the implementations must have inode
 	// as the first field to ensure temporality.
 	impl interface{}
@@ -81,10 +82,10 @@ func (in *inode) tryIncRef() bool {
 // decRef decrements the inode ref count and releases the inode resources if
 // the ref count hits 0.
 //
-// Precondition: Must have locked fs.mu.
-func (in *inode) decRef(fs *filesystem) {
+// Precondition: Must have locked filesystem.mu.
+func (in *inode) decRef() {
 	if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
-		delete(fs.inodeCache, in.inodeNum)
+		delete(in.fs.inodeCache, in.inodeNum)
 	} else if refs < 0 {
 		panic("ext.inode.decRef() called without holding a reference")
 	}
@@ -116,28 +117,28 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 	}
 
 	// Build the inode based on its type.
-	inode := inode{
+	args := inodeArgs{
+		fs:        fs,
 		inodeNum:  inodeNum,
-		dev:       fs.dev,
 		blkSize:   blkSize,
 		diskInode: diskInode,
 	}
 
 	switch diskInode.Mode().FileType() {
 	case linux.ModeSymlink:
-		f, err := newSymlink(inode)
+		f, err := newSymlink(args)
 		if err != nil {
 			return nil, err
 		}
 		return &f.inode, nil
 	case linux.ModeRegular:
-		f, err := newRegularFile(inode)
+		f, err := newRegularFile(args)
 		if err != nil {
 			return nil, err
 		}
 		return &f.inode, nil
 	case linux.ModeDirectory:
-		f, err := newDirectroy(inode, fs.sb.IncompatibleFeatures().DirentFileType)
+		f, err := newDirectory(args, fs.sb.IncompatibleFeatures().DirentFileType)
 		if err != nil {
 			return nil, err
 		}
@@ -148,17 +149,35 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 	}
 }
 
+type inodeArgs struct {
+	fs        *filesystem
+	inodeNum  uint32
+	blkSize   uint64
+	diskInode disklayout.Inode
+}
+
+func (in *inode) init(args inodeArgs, impl interface{}) {
+	in.fs = args.fs
+	in.inodeNum = args.inodeNum
+	in.blkSize = args.blkSize
+	in.diskInode = args.diskInode
+	in.impl = impl
+}
+
 // open creates and returns a file description for the dentry passed in.
-func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
+func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
 	if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
+	mnt := rp.Mount()
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.flags = flags
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		fd.LockFD.Init(&in.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably. This check is not necessary for a read
@@ -167,17 +186,19 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
-		fd.flags = flags
+		fd.LockFD.Init(&in.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
 		return &fd.vfsfd, nil
 	case *symlink:
-		if flags&linux.O_PATH == 0 {
+		if opts.Flags&linux.O_PATH == 0 {
 			// Can't open symlinks without O_PATH.
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		fd.flags = flags
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		fd.LockFD.Init(&in.locks)
+		fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
@@ -185,7 +206,7 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 }
 
 func (in *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
-	return vfs.GenericCheckPermissions(creds, ats, in.isDir(), uint16(in.diskInode.Mode()), in.diskInode.UID(), in.diskInode.GID())
+	return vfs.GenericCheckPermissions(creds, ats, in.diskInode.Mode(), in.diskInode.UID(), in.diskInode.GID())
 }
 
 // statTo writes the statx fields to the output parameter.
@@ -203,6 +224,8 @@ func (in *inode) statTo(stat *linux.Statx) {
 	stat.Atime = in.diskInode.AccessTime().StatxTimestamp()
 	stat.Ctime = in.diskInode.ChangeTime().StatxTimestamp()
 	stat.Mtime = in.diskInode.ModificationTime().StatxTimestamp()
+	stat.DevMajor = linux.UNNAMED_MAJOR
+	stat.DevMinor = in.fs.devMinor
 	// TODO(b/134676337): Set stat.Blocks which is the number of 512 byte blocks
 	// (including metadata blocks) required to represent this file.
 }
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index aec33e00a..e73e740d6 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -16,15 +16,16 @@ package ext
 
 import (
 	"io"
-	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // regularFile represents a regular file's inode. This too follows the
@@ -43,28 +44,19 @@ type regularFile struct {
 
 // newRegularFile is the regularFile constructor. It figures out what kind of
 // file this is and initializes the fileReader.
-func newRegularFile(inode inode) (*regularFile, error) {
-	regFile := regularFile{
-		inode: inode,
-	}
-
-	inodeFlags := inode.diskInode.Flags()
-
-	if inodeFlags.Extents {
-		file, err := newExtentFile(regFile)
+func newRegularFile(args inodeArgs) (*regularFile, error) {
+	if args.diskInode.Flags().Extents {
+		file, err := newExtentFile(args)
 		if err != nil {
 			return nil, err
 		}
-
-		file.regFile.inode.impl = &file.regFile
 		return &file.regFile, nil
 	}
 
-	file, err := newBlockMapFile(regFile)
+	file, err := newBlockMapFile(args)
 	if err != nil {
 		return nil, err
 	}
-	file.regFile.inode.impl = &file.regFile
 	return &file.regFile, nil
 }
 
@@ -77,6 +69,7 @@ func (in *inode) isRegular() bool {
 // vfs.FileDescriptionImpl.
 type regularFileFD struct {
 	fileDescription
+	vfs.LockFD
 
 	// off is the file offset. off is accessed using atomic memory operations.
 	off int64
@@ -86,7 +79,7 @@ type regularFileFD struct {
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {}
+func (fd *regularFileFD) Release(context.Context) {}
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
@@ -157,3 +150,13 @@ func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpt
 	// TODO(b/134676337): Implement mmap(2).
 	return syserror.ENODEV
 }
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index bdf8705c1..2fd0d1fa8 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -15,11 +15,11 @@
 package ext
 
 import (
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // symlink represents a symlink inode.
@@ -30,18 +30,17 @@ type symlink struct {
 
 // newSymlink is the symlink constructor. It reads out the symlink target from
 // the inode (however it might have been stored).
-func newSymlink(inode inode) (*symlink, error) {
-	var file *symlink
+func newSymlink(args inodeArgs) (*symlink, error) {
 	var link []byte
 
 	// If the symlink target is lesser than 60 bytes, its stores in inode.Data().
 	// Otherwise either extents or block maps will be used to store the link.
-	size := inode.diskInode.Size()
+	size := args.diskInode.Size()
 	if size < 60 {
-		link = inode.diskInode.Data()[:size]
+		link = args.diskInode.Data()[:size]
 	} else {
 		// Create a regular file out of this inode and read out the target.
-		regFile, err := newRegularFile(inode)
+		regFile, err := newRegularFile(args)
 		if err != nil {
 			return nil, err
 		}
@@ -52,8 +51,8 @@ func newSymlink(inode inode) (*symlink, error) {
 		}
 	}
 
-	file = &symlink{inode: inode, target: string(link)}
-	file.inode.impl = file
+	file := &symlink{target: string(link)}
+	file.inode.init(args, file)
 	return file, nil
 }
 
@@ -67,13 +66,14 @@ func (in *inode) isSymlink() bool {
 // O_PATH. For this reason most of the functions return EBADF.
 type symlinkFD struct {
 	fileDescription
+	vfs.NoLockFD
 }
 
 // Compiles only if symlinkFD implements vfs.FileDescriptionImpl.
 var _ vfs.FileDescriptionImpl = (*symlinkFD)(nil)
 
 // Release implements vfs.FileDescriptionImpl.Release.
-func (fd *symlinkFD) Release() {}
+func (fd *symlinkFD) Release(context.Context) {}
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *symlinkFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
new file mode 100644
index 000000000..999111deb
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -0,0 +1,63 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "request_list",
+    out = "request_list.go",
+    package = "fuse",
+    prefix = "request",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Request",
+        "Linker": "*Request",
+    },
+)
+
+go_library(
+    name = "fuse",
+    srcs = [
+        "connection.go",
+        "dev.go",
+        "fusefs.go",
+        "init.go",
+        "register.go",
+        "request_list.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/log",
+        "//pkg/sentry/fsimpl/devtmpfs",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "fuse_test",
+    size = "small",
+    srcs = ["dev_test.go"],
+    library = ":fuse",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "//tools/go_marshal/marshal",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
new file mode 100644
index 000000000..6df2728ab
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -0,0 +1,437 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"errors"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
+// Ordinary requests have even IDs, while interrupts IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+const (
+	// fuseDefaultMaxBackground is the default value for MaxBackground.
+	fuseDefaultMaxBackground = 12
+
+	// fuseDefaultCongestionThreshold is the default value for CongestionThreshold,
+	// and is 75% of the default maximum of MaxGround.
+	fuseDefaultCongestionThreshold = (fuseDefaultMaxBackground * 3 / 4)
+
+	// fuseDefaultMaxPagesPerReq is the default value for MaxPagesPerReq.
+	fuseDefaultMaxPagesPerReq = 32
+)
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+	requestEntry
+
+	id   linux.FUSEOpID
+	hdr  *linux.FUSEHeaderIn
+	data []byte
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+	opcode linux.FUSEOpcode
+	hdr    linux.FUSEHeaderOut
+	data   []byte
+}
+
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
+type connection struct {
+	fd *DeviceFD
+
+	// The following FUSE_INIT flags are currently unsupported by this implementation:
+	// - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
+	// - FUSE_EXPORT_SUPPORT
+	// - FUSE_HANDLE_KILLPRIV
+	// - FUSE_POSIX_LOCKS: requires POSIX locks
+	// - FUSE_FLOCK_LOCKS: requires POSIX locks
+	// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
+	// - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
+	// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
+	// - FUSE_ASYNC_DIO
+	// - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+
+	// initialized after receiving FUSE_INIT reply.
+	// Until it's set, suspend sending FUSE requests.
+	// Use SetInitialized() and IsInitialized() for atomic access.
+	initialized int32
+
+	// initializedChan is used to block requests before initialization.
+	initializedChan chan struct{}
+
+	// blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground).
+	// TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
+	blocked bool
+
+	// connected (connection established) when a new FUSE file system is created.
+	// Set to false when:
+	//   umount,
+	//   connection abort,
+	//   device release.
+	connected bool
+
+	// aborted via sysfs.
+	// TODO(gvisor.dev/issue/3185): abort all queued requests.
+	aborted bool
+
+	// connInitError if FUSE_INIT encountered error (major version mismatch).
+	// Only set in INIT.
+	connInitError bool
+
+	// connInitSuccess if FUSE_INIT is successful.
+	// Only set in INIT.
+	// Used for destory.
+	connInitSuccess bool
+
+	// TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
+
+	// NumberBackground is the number of requests in the background.
+	numBackground uint16
+
+	// congestionThreshold for NumBackground.
+	// Negotiated in FUSE_INIT.
+	congestionThreshold uint16
+
+	// maxBackground is the maximum number of NumBackground.
+	// Block connection when it is reached.
+	// Negotiated in FUSE_INIT.
+	maxBackground uint16
+
+	// numActiveBackground is the number of requests in background and has being marked as active.
+	numActiveBackground uint16
+
+	// numWating is the number of requests waiting for completion.
+	numWaiting uint32
+
+	// TODO(gvisor.dev/issue/3185): BgQueue
+	// some queue for background queued requests.
+
+	// bgLock protects:
+	// MaxBackground, CongestionThreshold, NumBackground,
+	// NumActiveBackground, BgQueue, Blocked.
+	bgLock sync.Mutex
+
+	// maxRead is the maximum size of a read buffer in in bytes.
+	maxRead uint32
+
+	// maxWrite is the maximum size of a write buffer in bytes.
+	// Negotiated in FUSE_INIT.
+	maxWrite uint32
+
+	// maxPages is the maximum number of pages for a single request to use.
+	// Negotiated in FUSE_INIT.
+	maxPages uint16
+
+	// minor version of the FUSE protocol.
+	// Negotiated and only set in INIT.
+	minor uint32
+
+	// asyncRead if read pages asynchronously.
+	// Negotiated and only set in INIT.
+	asyncRead bool
+
+	// abortErr is true if kernel need to return an unique read error after abort.
+	// Negotiated and only set in INIT.
+	abortErr bool
+
+	// writebackCache is true for write-back cache policy,
+	// false for write-through policy.
+	// Negotiated and only set in INIT.
+	writebackCache bool
+
+	// cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
+	// Negotiated and only set in INIT.
+	cacheSymlinks bool
+
+	// bigWrites if doing multi-page cached writes.
+	// Negotiated and only set in INIT.
+	bigWrites bool
+
+	// dontMask if filestestem does not apply umask to creation modes.
+	// Negotiated in INIT.
+	dontMask bool
+}
+
+// newFUSEConnection creates a FUSE connection to fd.
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+	// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
+	// mount a FUSE filesystem.
+	fuseFD := fd.Impl().(*DeviceFD)
+	fuseFD.mounted = true
+
+	// Create the writeBuf for the header to be stored in.
+	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+	fuseFD.writeBuf = make([]byte, hdrLen)
+	fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
+	fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+	fuseFD.writeCursor = 0
+
+	return &connection{
+		fd:                  fuseFD,
+		maxBackground:       fuseDefaultMaxBackground,
+		congestionThreshold: fuseDefaultCongestionThreshold,
+		maxPages:            fuseDefaultMaxPagesPerReq,
+		initializedChan:     make(chan struct{}),
+		connected:           true,
+	}, nil
+}
+
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+	// Unblock the requests sent before INIT.
+	close(conn.initializedChan)
+
+	// Close the channel first to avoid the non-atomic situation
+	// where conn.initialized is true but there are
+	// tasks being blocked on the channel.
+	// And it prevents the newer tasks from gaining
+	// unnecessary higher chance to be issued before the blocked one.
+
+	atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// IsInitialized atomically check if the connection is initialized.
+// pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+	return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+	conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+	hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+	hdr := linux.FUSEHeaderIn{
+		Len:    uint32(hdrLen + payload.SizeBytes()),
+		Opcode: opcode,
+		Unique: conn.fd.nextOpID,
+		NodeID: ino,
+		UID:    uint32(creds.EffectiveKUID),
+		GID:    uint32(creds.EffectiveKGID),
+		PID:    pid,
+	}
+
+	buf := make([]byte, hdr.Len)
+	hdr.MarshalUnsafe(buf[:hdrLen])
+	payload.MarshalUnsafe(buf[hdrLen:])
+
+	return &Request{
+		id:   hdr.Unique,
+		hdr:  &hdr,
+		data: buf,
+	}, nil
+}
+
+// Call makes a request to the server and blocks the invoking task until a
+// server responds with a response. Task should never be nil.
+// Requests will not be sent before the connection is initialized.
+// For async tasks, use CallAsync().
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
+	// Block requests sent before connection is initalized.
+	if !conn.Initialized() {
+		if err := t.Block(conn.initializedChan); err != nil {
+			return nil, err
+		}
+	}
+
+	return conn.call(t, r)
+}
+
+// CallAsync makes an async (aka background) request.
+// Those requests either do not expect a response (e.g. release) or
+// the response should be handled by others (e.g. init).
+// Return immediately unless the connection is blocked (before initialization).
+// Async call example: init, release, forget, aio, interrupt.
+// When the Request is FUSE_INIT, it will not be blocked before initialization.
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+	// Block requests sent before connection is initalized.
+	if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
+		if err := t.Block(conn.initializedChan); err != nil {
+			return err
+		}
+	}
+
+	// This should be the only place that invokes call() with a nil task.
+	_, err := conn.call(nil, r)
+	return err
+}
+
+// call makes a call without blocking checks.
+func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
+	if !conn.connected {
+		return nil, syserror.ENOTCONN
+	}
+
+	if conn.connInitError {
+		return nil, syserror.ECONNREFUSED
+	}
+
+	fut, err := conn.callFuture(t, r)
+	if err != nil {
+		return nil, err
+	}
+
+	return fut.resolve(t)
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+	errno := r.hdr.Error
+	if errno >= 0 {
+		return nil
+	}
+
+	sysErrNo := syscall.Errno(-errno)
+	return error(sysErrNo)
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+	hdrLen := r.hdr.SizeBytes()
+	haveDataLen := r.hdr.Len - uint32(hdrLen)
+	wantDataLen := uint32(m.SizeBytes())
+
+	if haveDataLen < wantDataLen {
+		return fmt.Errorf("payload too small. Minimum data lenth required: %d,  but got data length %d", wantDataLen, haveDataLen)
+	}
+
+	m.UnmarshalUnsafe(r.data[hdrLen:])
+	return nil
+}
+
+// callFuture makes a request to the server and returns a future response.
+// Call resolve() when the response needs to be fulfilled.
+func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+
+	// Is the queue full?
+	//
+	// We must busy wait here until the request can be queued. We don't
+	// block on the fd.fullQueueCh with a lock - so after being signalled,
+	// before we acquire the lock, it is possible that a barging task enters
+	// and queues a request. As a result, upon acquiring the lock we must
+	// again check if the room is available.
+	//
+	// This can potentially starve a request forever but this can only happen
+	// if there are always too many ongoing requests all the time. The
+	// supported maxActiveRequests setting should be really high to avoid this.
+	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+		if t == nil {
+			// Since there is no task that is waiting. We must error out.
+			return nil, errors.New("FUSE request queue full")
+		}
+
+		log.Infof("Blocking request %v from being queued. Too many active requests: %v",
+			r.id, conn.fd.numActiveRequests)
+		conn.fd.mu.Unlock()
+		err := t.Block(conn.fd.fullQueueCh)
+		conn.fd.mu.Lock()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return conn.callFutureLocked(t, r)
+}
+
+// callFutureLocked makes a request to the server and returns a future response.
+func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+	conn.fd.queue.PushBack(r)
+	conn.fd.numActiveRequests += 1
+	fut := newFutureResponse(r.hdr.Opcode)
+	conn.fd.completions[r.id] = fut
+
+	// Signal the readers that there is something to read.
+	conn.fd.waitQueue.Notify(waiter.EventIn)
+
+	return fut, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+	opcode linux.FUSEOpcode
+	ch     chan struct{}
+	hdr    *linux.FUSEHeaderOut
+	data   []byte
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
+	return &futureResponse{
+		opcode: opcode,
+		ch:     make(chan struct{}),
+	}
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+	// If there is no Task associated with this request  - then we don't try to resolve
+	// the response.  Instead, the task writing the response (proxy to the server) will
+	// process the response on our behalf.
+	if t == nil {
+		log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
+		return nil, nil
+	}
+
+	if err := t.Block(f.ch); err != nil {
+		return nil, err
+	}
+
+	return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+	return &Response{
+		opcode: f.opcode,
+		hdr:    *f.hdr,
+		data:   f.data,
+	}
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
new file mode 100644
index 000000000..e522ff9a0
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -0,0 +1,397 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+const fuseDevMinor = 229
+
+// fuseDevice implements vfs.Device for /dev/fuse.
+type fuseDevice struct{}
+
+// Open implements vfs.Device.Open.
+func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if !kernel.FUSEEnabled {
+		return nil, syserror.ENOENT
+	}
+
+	var fd DeviceFD
+	if err := fd.vfsfd.Init(&fd, opts.Flags, mnt, vfsd, &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+type DeviceFD struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
+	mounted bool
+
+	// nextOpID is used to create new requests.
+	nextOpID linux.FUSEOpID
+
+	// queue is the list of requests that need to be processed by the FUSE server.
+	queue requestList
+
+	// numActiveRequests is the number of requests made by the Sentry that has
+	// yet to be responded to.
+	numActiveRequests uint64
+
+	// completions is used to map a request to its response. A Writer will use this
+	// to notify the caller of a completed response.
+	completions map[linux.FUSEOpID]*futureResponse
+
+	writeCursor uint32
+
+	// writeBuf is the memory buffer used to copy in the FUSE out header from
+	// userspace.
+	writeBuf []byte
+
+	// writeCursorFR current FR being copied from server.
+	writeCursorFR *futureResponse
+
+	// mu protects all the queues, maps, buffers and cursors and nextOpID.
+	mu sync.Mutex
+
+	// waitQueue is used to notify interested parties when the device becomes
+	// readable or writable.
+	waitQueue waiter.Queue
+
+	// fullQueueCh is a channel used to synchronize the readers with the writers.
+	// Writers (inbound requests to the filesystem) block if there are too many
+	// unprocessed in-flight requests.
+	fullQueueCh chan struct{}
+
+	// fs is the FUSE filesystem that this FD is being used for.
+	fs *filesystem
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *DeviceFD) Release(context.Context) {
+	fd.fs.conn.connected = false
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	return 0, syserror.ENOSYS
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	// We require that any Read done on this filesystem have a sane minimum
+	// read buffer. It must have the capacity for the fixed parts of any request
+	// header (Linux uses the request header and the FUSEWriteIn header for this
+	// calculation) + the negotiated MaxWrite room for the data.
+	minBuffSize := linux.FUSE_MIN_READ_BUFFER
+	inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+	writeHdrLen := uint32((*linux.FUSEWriteIn)(nil).SizeBytes())
+	negotiatedMinBuffSize := inHdrLen + writeHdrLen + fd.fs.conn.maxWrite
+	if minBuffSize < negotiatedMinBuffSize {
+		minBuffSize = negotiatedMinBuffSize
+	}
+
+	// If the read buffer is too small, error out.
+	if dst.NumBytes() < int64(minBuffSize) {
+		return 0, syserror.EINVAL
+	}
+
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.readLocked(ctx, dst, opts)
+}
+
+// readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	if fd.queue.Empty() {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	var readCursor uint32
+	var bytesRead int64
+	for {
+		req := fd.queue.Front()
+		if dst.NumBytes() < int64(req.hdr.Len) {
+			// The request is too large. Cannot process it. All requests must be smaller than the
+			// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+			// handshake.
+			errno := -int32(syscall.EIO)
+			if req.hdr.Opcode == linux.FUSE_SETXATTR {
+				errno = -int32(syscall.E2BIG)
+			}
+
+			// Return the error to the calling task.
+			if err := fd.sendError(ctx, errno, req); err != nil {
+				return 0, err
+			}
+
+			// We're done with this request.
+			fd.queue.Remove(req)
+
+			// Restart the read as this request was invalid.
+			log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
+			return fd.readLocked(ctx, dst, opts)
+		}
+
+		n, err := dst.CopyOut(ctx, req.data[readCursor:])
+		if err != nil {
+			return 0, err
+		}
+		readCursor += uint32(n)
+		bytesRead += int64(n)
+
+		if readCursor >= req.hdr.Len {
+			// Fully done with this req, remove it from the queue.
+			fd.queue.Remove(req)
+			break
+		}
+	}
+
+	return bytesRead, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	return 0, syserror.ENOSYS
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.writeLocked(ctx, src, opts)
+}
+
+// writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
+func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	var cn, n int64
+	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+
+	for src.NumBytes() > 0 {
+		if fd.writeCursorFR != nil {
+			// Already have common header, and we're now copying the payload.
+			wantBytes := fd.writeCursorFR.hdr.Len
+
+			// Note that the FR data doesn't have the header. Copy it over if its necessary.
+			if fd.writeCursorFR.data == nil {
+				fd.writeCursorFR.data = make([]byte, wantBytes)
+			}
+
+			bytesCopied, err := src.CopyIn(ctx, fd.writeCursorFR.data[fd.writeCursor:wantBytes])
+			if err != nil {
+				return 0, err
+			}
+			src = src.DropFirst(bytesCopied)
+
+			cn = int64(bytesCopied)
+			n += cn
+			fd.writeCursor += uint32(cn)
+			if fd.writeCursor == wantBytes {
+				// Done reading this full response. Clean up and unblock the
+				// initiator.
+				break
+			}
+
+			// Check if we have more data in src.
+			continue
+		}
+
+		// Assert that the header isn't read into the writeBuf yet.
+		if fd.writeCursor >= hdrLen {
+			return 0, syserror.EINVAL
+		}
+
+		// We don't have the full common response header yet.
+		wantBytes := hdrLen - fd.writeCursor
+		bytesCopied, err := src.CopyIn(ctx, fd.writeBuf[fd.writeCursor:wantBytes])
+		if err != nil {
+			return 0, err
+		}
+		src = src.DropFirst(bytesCopied)
+
+		cn = int64(bytesCopied)
+		n += cn
+		fd.writeCursor += uint32(cn)
+		if fd.writeCursor == hdrLen {
+			// Have full header in the writeBuf. Use it to fetch the actual futureResponse
+			// from the device's completions map.
+			var hdr linux.FUSEHeaderOut
+			hdr.UnmarshalBytes(fd.writeBuf)
+
+			// We have the header now and so the writeBuf has served its purpose.
+			// We could reset it manually here but instead of doing that, at the
+			// end of the write, the writeCursor will be set to 0 thereby allowing
+			// the next request to overwrite whats in the buffer,
+
+			fut, ok := fd.completions[hdr.Unique]
+			if !ok {
+				// Server sent us a response for a request we never sent?
+				return 0, syserror.EINVAL
+			}
+
+			delete(fd.completions, hdr.Unique)
+
+			// Copy over the header into the future response. The rest of the payload
+			// will be copied over to the FR's data in the next iteration.
+			fut.hdr = &hdr
+			fd.writeCursorFR = fut
+
+			// Next iteration will now try read the complete request, if src has
+			// any data remaining. Otherwise we're done.
+		}
+	}
+
+	if fd.writeCursorFR != nil {
+		if err := fd.sendResponse(ctx, fd.writeCursorFR); err != nil {
+			return 0, err
+		}
+
+		// Ready the device for the next request.
+		fd.writeCursorFR = nil
+		fd.writeCursor = 0
+	}
+
+	return n, nil
+}
+
+// Readiness implements vfs.FileDescriptionImpl.Readiness.
+func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	var ready waiter.EventMask
+	ready |= waiter.EventOut // FD is always writable
+	if !fd.queue.Empty() {
+		// Have reqs available, FD is readable.
+		ready |= waiter.EventIn
+	}
+
+	return ready & mask
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *DeviceFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.waitQueue.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
+	fd.waitQueue.EventUnregister(e)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
+	if !fd.mounted {
+		return 0, syserror.EPERM
+	}
+
+	return 0, syserror.ENOSYS
+}
+
+// sendResponse sends a response to the waiting task (if any).
+func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
+	// See if the running task need to perform some action before returning.
+	// Since we just finished writing the future, we can be sure that
+	// getResponse generates a populated response.
+	if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil {
+		return err
+	}
+
+	// Signal that the queue is no longer full.
+	select {
+	case fd.fullQueueCh <- struct{}{}:
+	default:
+	}
+	fd.numActiveRequests -= 1
+
+	// Signal the task waiting on a response.
+	close(fut.ch)
+	return nil
+}
+
+// sendError sends an error response to the waiting task (if any).
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+	// Return the error to the calling task.
+	outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+	respHdr := linux.FUSEHeaderOut{
+		Len:    outHdrLen,
+		Error:  errno,
+		Unique: req.hdr.Unique,
+	}
+
+	fut, ok := fd.completions[respHdr.Unique]
+	if !ok {
+		// Server sent us a response for a request we never sent?
+		return syserror.EINVAL
+	}
+	delete(fd.completions, respHdr.Unique)
+
+	fut.hdr = &respHdr
+	if err := fd.sendResponse(ctx, fut); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// noReceiverAction has the calling kernel.Task do some action if its known that no
+// receiver is going to be waiting on the future channel. This is to be used by:
+// FUSE_INIT.
+func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
+	if r.opcode == linux.FUSE_INIT {
+		creds := auth.CredentialsFromContext(ctx)
+		rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
+		return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
new file mode 100644
index 000000000..1ffe7ccd2
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -0,0 +1,428 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"fmt"
+	"io"
+	"math/rand"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+	"gvisor.dev/gvisor/tools/go_marshal/marshal"
+)
+
+// echoTestOpcode is the Opcode used during testing. The server used in tests
+// will simply echo the payload back with the appropriate headers.
+const echoTestOpcode linux.FUSEOpcode = 1000
+
+type testPayload struct {
+	data uint32
+}
+
+// TestFUSECommunication tests that the communication layer between the Sentry and the
+// FUSE server daemon works as expected.
+func TestFUSECommunication(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	creds := auth.CredentialsFromContext(s.Ctx)
+
+	// Create test cases with different number of concurrent clients and servers.
+	testCases := []struct {
+		Name              string
+		NumClients        int
+		NumServers        int
+		MaxActiveRequests uint64
+	}{
+		{
+			Name:              "SingleClientSingleServer",
+			NumClients:        1,
+			NumServers:        1,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "SingleClientMultipleServers",
+			NumClients:        1,
+			NumServers:        10,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "MultipleClientsSingleServer",
+			NumClients:        10,
+			NumServers:        1,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "MultipleClientsMultipleServers",
+			NumClients:        10,
+			NumServers:        10,
+			MaxActiveRequests: maxActiveRequestsDefault,
+		},
+		{
+			Name:              "RequestCapacityFull",
+			NumClients:        10,
+			NumServers:        1,
+			MaxActiveRequests: 1,
+		},
+		{
+			Name:              "RequestCapacityContinuouslyFull",
+			NumClients:        100,
+			NumServers:        2,
+			MaxActiveRequests: 2,
+		},
+	}
+
+	for _, testCase := range testCases {
+		t.Run(testCase.Name, func(t *testing.T) {
+			conn, fd, err := newTestConnection(s, k, testCase.MaxActiveRequests)
+			if err != nil {
+				t.Fatalf("newTestConnection: %v", err)
+			}
+
+			clientsDone := make([]chan struct{}, testCase.NumClients)
+			serversDone := make([]chan struct{}, testCase.NumServers)
+			serversKill := make([]chan struct{}, testCase.NumServers)
+
+			// FUSE clients.
+			for i := 0; i < testCase.NumClients; i++ {
+				clientsDone[i] = make(chan struct{})
+				go func(i int) {
+					fuseClientRun(t, s, k, conn, creds, uint32(i), uint64(i), clientsDone[i])
+				}(i)
+			}
+
+			// FUSE servers.
+			for j := 0; j < testCase.NumServers; j++ {
+				serversDone[j] = make(chan struct{})
+				serversKill[j] = make(chan struct{}, 1) // The kill command shouldn't block.
+				go func(j int) {
+					fuseServerRun(t, s, k, fd, serversDone[j], serversKill[j])
+				}(j)
+			}
+
+			// Tear down.
+			//
+			// Make sure all the clients are done.
+			for i := 0; i < testCase.NumClients; i++ {
+				<-clientsDone[i]
+			}
+
+			// Kill any server that is potentially waiting.
+			for j := 0; j < testCase.NumServers; j++ {
+				serversKill[j] <- struct{}{}
+			}
+
+			// Make sure all the servers are done.
+			for j := 0; j < testCase.NumServers; j++ {
+				<-serversDone[j]
+			}
+		})
+	}
+}
+
+// CallTest makes a request to the server and blocks the invoking
+// goroutine until a server responds with a response. Doesn't block
+// a kernel.Task. Analogous to Connection.Call but used for testing.
+func CallTest(conn *connection, t *kernel.Task, r *Request, i uint32) (*Response, error) {
+	conn.fd.mu.Lock()
+
+	// Wait until we're certain that a new request can be processed.
+	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
+		conn.fd.mu.Unlock()
+		select {
+		case <-conn.fd.fullQueueCh:
+		}
+		conn.fd.mu.Lock()
+	}
+
+	fut, err := conn.callFutureLocked(t, r) // No task given.
+	conn.fd.mu.Unlock()
+
+	if err != nil {
+		return nil, err
+	}
+
+	// Resolve the response.
+	//
+	// Block without a task.
+	select {
+	case <-fut.ch:
+	}
+
+	// A response is ready. Resolve and return it.
+	return fut.getResponse(), nil
+}
+
+// ReadTest is analogous to vfs.FileDescription.Read and reads from the FUSE
+// device. However, it does so by - not blocking the task that is calling - and
+// instead just waits on a channel. The behaviour is essentially the same as
+// DeviceFD.Read except it guarantees that the task is not blocked.
+func ReadTest(serverTask *kernel.Task, fd *vfs.FileDescription, inIOseq usermem.IOSequence, killServer chan struct{}) (int64, bool, error) {
+	var err error
+	var n, total int64
+
+	dev := fd.Impl().(*DeviceFD)
+
+	// Register for notifications.
+	w, ch := waiter.NewChannelEntry(nil)
+	dev.EventRegister(&w, waiter.EventIn)
+	for {
+		// Issue the request and break out if it completes with anything other than
+		// "would block".
+		n, err = dev.Read(serverTask, inIOseq, vfs.ReadOptions{})
+		total += n
+		if err != syserror.ErrWouldBlock {
+			break
+		}
+
+		// Wait for a notification that we should retry.
+		// Emulate the blocking for when no requests are available
+		select {
+		case <-ch:
+		case <-killServer:
+			// Server killed by the main program.
+			return 0, true, nil
+		}
+	}
+
+	dev.EventUnregister(&w)
+	return total, false, err
+}
+
+// fuseClientRun emulates all the actions of a normal FUSE request. It creates
+// a header, a payload, calls the server, waits for the response, and processes
+// the response.
+func fuseClientRun(t *testing.T, s *testutil.System, k *kernel.Kernel, conn *connection, creds *auth.Credentials, pid uint32, inode uint64, clientDone chan struct{}) {
+	defer func() { clientDone <- struct{}{} }()
+
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	clientTask, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("fuse-client-%v", pid), tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Fatal(err)
+	}
+	testObj := &testPayload{
+		data: rand.Uint32(),
+	}
+
+	req, err := conn.NewRequest(creds, pid, inode, echoTestOpcode, testObj)
+	if err != nil {
+		t.Fatalf("NewRequest creation failed: %v", err)
+	}
+
+	// Queue up a request.
+	// Analogous to Call except it doesn't block on the task.
+	resp, err := CallTest(conn, clientTask, req, pid)
+	if err != nil {
+		t.Fatalf("CallTaskNonBlock failed: %v", err)
+	}
+
+	if err = resp.Error(); err != nil {
+		t.Fatalf("Server responded with an error: %v", err)
+	}
+
+	var respTestPayload testPayload
+	if err := resp.UnmarshalPayload(&respTestPayload); err != nil {
+		t.Fatalf("Unmarshalling payload error: %v", err)
+	}
+
+	if resp.hdr.Unique != req.hdr.Unique {
+		t.Fatalf("got response for another request. Expected response for req %v but got response for req %v",
+			req.hdr.Unique, resp.hdr.Unique)
+	}
+
+	if respTestPayload.data != testObj.data {
+		t.Fatalf("read incorrect data. Data expected: %v, but got %v", testObj.data, respTestPayload.data)
+	}
+
+}
+
+// fuseServerRun creates a task and emulates all the actions of a simple FUSE server
+// that simply reads a request and echos the same struct back as a response using the
+// appropriate headers.
+func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.FileDescription, serverDone, killServer chan struct{}) {
+	defer func() { serverDone <- struct{}{} }()
+
+	// Create the tasks that the server will be using.
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	var readPayload testPayload
+
+	serverTask, err := testutil.CreateTask(s.Ctx, "fuse-server", tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Read the request.
+	for {
+		inHdrLen := uint32((*linux.FUSEHeaderIn)(nil).SizeBytes())
+		payloadLen := uint32(readPayload.SizeBytes())
+
+		// The raed buffer must meet some certain size criteria.
+		buffSize := inHdrLen + payloadLen
+		if buffSize < linux.FUSE_MIN_READ_BUFFER {
+			buffSize = linux.FUSE_MIN_READ_BUFFER
+		}
+		inBuf := make([]byte, buffSize)
+		inIOseq := usermem.BytesIOSequence(inBuf)
+
+		n, serverKilled, err := ReadTest(serverTask, fd, inIOseq, killServer)
+		if err != nil {
+			t.Fatalf("Read failed :%v", err)
+		}
+
+		// Server should shut down. No new requests are going to be made.
+		if serverKilled {
+			break
+		}
+
+		if n <= 0 {
+			t.Fatalf("Read read no bytes")
+		}
+
+		var readFUSEHeaderIn linux.FUSEHeaderIn
+		readFUSEHeaderIn.UnmarshalUnsafe(inBuf[:inHdrLen])
+		readPayload.UnmarshalUnsafe(inBuf[inHdrLen : inHdrLen+payloadLen])
+
+		if readFUSEHeaderIn.Opcode != echoTestOpcode {
+			t.Fatalf("read incorrect data. Header: %v, Payload: %v", readFUSEHeaderIn, readPayload)
+		}
+
+		// Write the response.
+		outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
+		outBuf := make([]byte, outHdrLen+payloadLen)
+		outHeader := linux.FUSEHeaderOut{
+			Len:    outHdrLen + payloadLen,
+			Error:  0,
+			Unique: readFUSEHeaderIn.Unique,
+		}
+
+		// Echo the payload back.
+		outHeader.MarshalUnsafe(outBuf[:outHdrLen])
+		readPayload.MarshalUnsafe(outBuf[outHdrLen:])
+		outIOseq := usermem.BytesIOSequence(outBuf)
+
+		n, err = fd.Write(s.Ctx, outIOseq, vfs.WriteOptions{})
+		if err != nil {
+			t.Fatalf("Write failed :%v", err)
+		}
+	}
+}
+
+func setup(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Error creating kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList:  true,
+		AllowUserMount: true,
+	})
+
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a fuse connection that the sentry can communicate with
+// and the FD for the server to communicate with.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+	vfsObj := &vfs.VirtualFilesystem{}
+	fuseDev := &DeviceFD{}
+
+	if err := vfsObj.Init(system.Ctx); err != nil {
+		return nil, nil, err
+	}
+
+	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer vd.DecRef(system.Ctx)
+	if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, nil, err
+	}
+
+	fsopts := filesystemOptions{
+		maxActiveRequests: maxActiveRequests,
+	}
+	fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return fs.conn, &fuseDev.vfsfd, nil
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+	return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+	usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+	*t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+	return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+	t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+	t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
+	panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+	panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
new file mode 100644
index 000000000..83c24ec25
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -0,0 +1,324 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package fuse implements fusefs.
+package fuse
+
+import (
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "fuse"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+type filesystemOptions struct {
+	// userID specifies the numeric uid of the mount owner.
+	// This option should not be specified by the filesystem owner.
+	// It is set by libfuse (or, if libfuse is not used, must be set
+	// by the filesystem itself). For more information, see man page
+	// for fuse(8)
+	userID uint32
+
+	// groupID specifies the numeric gid of the mount owner.
+	// This option should not be specified by the filesystem owner.
+	// It is set by libfuse (or, if libfuse is not used, must be set
+	// by the filesystem itself). For more information, see man page
+	// for fuse(8)
+	groupID uint32
+
+	// rootMode specifies the the file mode of the filesystem's root.
+	rootMode linux.FileMode
+
+	// maxActiveRequests specifies the maximum number of active requests that can
+	// exist at any time. Any further requests will block when trying to
+	// Call the server.
+	maxActiveRequests uint64
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+	devMinor uint32
+
+	// conn is used for communication between the FUSE server
+	// daemon and the sentry fusefs.
+	conn *connection
+
+	// opts is the options the fusefs is initialized with.
+	opts *filesystemOptions
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	var fsopts filesystemOptions
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	deviceDescriptorStr, ok := mopts["fd"]
+	if !ok {
+		log.Warningf("%s.GetFilesystem: communication file descriptor N (obtained by opening /dev/fuse) must be specified as 'fd=N'", fsType.Name())
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "fd")
+
+	deviceDescriptor, err := strconv.ParseInt(deviceDescriptorStr, 10 /* base */, 32 /* bitSize */)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("%s.GetFilesystem: couldn't get kernel task from context", fsType.Name())
+		return nil, nil, syserror.EINVAL
+	}
+	fuseFd := kernelTask.GetFileVFS2(int32(deviceDescriptor))
+
+	// Parse and set all the other supported FUSE mount options.
+	// TODO(gVisor.dev/issue/3229): Expand the supported mount options.
+	if userIDStr, ok := mopts["user_id"]; ok {
+		delete(mopts, "user_id")
+		userID, err := strconv.ParseUint(userIDStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid user_id: user_id=%s", fsType.Name(), userIDStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.userID = uint32(userID)
+	}
+
+	if groupIDStr, ok := mopts["group_id"]; ok {
+		delete(mopts, "group_id")
+		groupID, err := strconv.ParseUint(groupIDStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid group_id: group_id=%s", fsType.Name(), groupIDStr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.groupID = uint32(groupID)
+	}
+
+	rootMode := linux.FileMode(0777)
+	modeStr, ok := mopts["rootmode"]
+	if ok {
+		delete(mopts, "rootmode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid mode: %q", fsType.Name(), modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		rootMode = linux.FileMode(mode)
+	}
+	fsopts.rootMode = rootMode
+
+	// Set the maxInFlightRequests option.
+	fsopts.maxActiveRequests = maxActiveRequestsDefault
+
+	// Check for unparsed options.
+	if len(mopts) != 0 {
+		log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Create a new FUSE filesystem.
+	fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+	if err != nil {
+		log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
+		return nil, nil, err
+	}
+
+	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
+
+	// Send a FUSE_INIT request to the FUSE daemon server before returning.
+	// This call is not blocking.
+	if err := fs.conn.InitSend(creds, uint32(kernelTask.ThreadID())); err != nil {
+		log.Warningf("%s.InitSend: failed with error: %v", fsType.Name(), err)
+		return nil, nil, err
+	}
+
+	// root is the fusefs root directory.
+	root := fs.newInode(creds, fsopts.rootMode)
+
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// NewFUSEFilesystem creates a new FUSE filesystem.
+func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+	fs := &filesystem{
+		devMinor: devMinor,
+		opts:     opts,
+	}
+
+	conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+	if err != nil {
+		log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
+		return nil, syserror.EINVAL
+	}
+
+	fs.conn = conn
+	fuseFD := device.Impl().(*DeviceFD)
+	fuseFD.fs = fs
+
+	return fs, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release(ctx)
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+	i := &inode{}
+	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	i.dentry.Init(i)
+
+	return &i.dentry
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The
+// opts.Sync attribute is ignored since the synchronization is handled by the
+// FUSE server.
+func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
+	var stat linux.Statx
+	stat.Blksize = attr.BlkSize
+	stat.DevMajor, stat.DevMinor = linux.UNNAMED_MAJOR, devMinor
+
+	rdevMajor, rdevMinor := linux.DecodeDeviceID(attr.Rdev)
+	stat.RdevMajor, stat.RdevMinor = uint32(rdevMajor), rdevMinor
+
+	if mask&linux.STATX_MODE != 0 {
+		stat.Mode = uint16(attr.Mode)
+	}
+	if mask&linux.STATX_NLINK != 0 {
+		stat.Nlink = attr.Nlink
+	}
+	if mask&linux.STATX_UID != 0 {
+		stat.UID = attr.UID
+	}
+	if mask&linux.STATX_GID != 0 {
+		stat.GID = attr.GID
+	}
+	if mask&linux.STATX_ATIME != 0 {
+		stat.Atime = linux.StatxTimestamp{
+			Sec:  int64(attr.Atime),
+			Nsec: attr.AtimeNsec,
+		}
+	}
+	if mask&linux.STATX_MTIME != 0 {
+		stat.Mtime = linux.StatxTimestamp{
+			Sec:  int64(attr.Mtime),
+			Nsec: attr.MtimeNsec,
+		}
+	}
+	if mask&linux.STATX_CTIME != 0 {
+		stat.Ctime = linux.StatxTimestamp{
+			Sec:  int64(attr.Ctime),
+			Nsec: attr.CtimeNsec,
+		}
+	}
+	if mask&linux.STATX_INO != 0 {
+		stat.Ino = attr.Ino
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		stat.Size = attr.Size
+	}
+	if mask&linux.STATX_BLOCKS != 0 {
+		stat.Blocks = attr.Blocks
+	}
+	return stat
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	fusefs := fs.Impl().(*filesystem)
+	conn := fusefs.conn
+	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+	if task == nil {
+		log.Warningf("couldn't get kernel task from context")
+		return linux.Statx{}, syserror.EINVAL
+	}
+
+	var in linux.FUSEGetAttrIn
+	// We don't set any attribute in the request, because in VFS2 fstat(2) will
+	// finally be translated into vfs.FilesystemImpl.StatAt() (see
+	// pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow
+	// as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2.
+	req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	res, err := conn.Call(task, req)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if err := res.Error(); err != nil {
+		return linux.Statx{}, err
+	}
+
+	var out linux.FUSEGetAttrOut
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return linux.Statx{}, err
+	}
+
+	// Set all metadata into kernfs.InodeAttrs.
+	if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+		Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor),
+	}); err != nil {
+		return linux.Statx{}, err
+	}
+
+	return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/init.go
new file mode 100644
index 000000000..779c2bd3f
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/init.go
@@ -0,0 +1,166 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// consts used by FUSE_INIT negotiation.
+const (
+	// fuseMaxMaxPages is the maximum value for MaxPages received in InitOut.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMaxMaxPages = 256
+
+	// Maximum value for the time granularity for file time stamps, 1s.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMaxTimeGranNs = 1000000000
+
+	// Minimum value for MaxWrite.
+	// Follow the same behavior as unix fuse implementation.
+	fuseMinMaxWrite = 4096
+
+	// Temporary default value for max readahead, 128kb.
+	fuseDefaultMaxReadahead = 131072
+
+	// The FUSE_INIT_IN flags sent to the daemon.
+	// TODO(gvisor.dev/issue/3199): complete the flags.
+	fuseDefaultInitFlags = linux.FUSE_MAX_PAGES
+)
+
+// Adjustable maximums for Connection's cogestion control parameters.
+// Used as the upperbound of the config values.
+// Currently we do not support adjustment to them.
+var (
+	MaxUserBackgroundRequest   uint16 = fuseDefaultMaxBackground
+	MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
+)
+
+// InitSend sends a FUSE_INIT request.
+func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
+	in := linux.FUSEInitIn{
+		Major: linux.FUSE_KERNEL_VERSION,
+		Minor: linux.FUSE_KERNEL_MINOR_VERSION,
+		// TODO(gvisor.dev/issue/3196): find appropriate way to calculate this
+		MaxReadahead: fuseDefaultMaxReadahead,
+		Flags:        fuseDefaultInitFlags,
+	}
+
+	req, err := conn.NewRequest(creds, pid, 0, linux.FUSE_INIT, &in)
+	if err != nil {
+		return err
+	}
+
+	// Since there is no task to block on and FUSE_INIT is the request
+	// to unblock other requests, use nil.
+	return conn.CallAsync(nil, req)
+}
+
+// InitRecv receives a FUSE_INIT reply and process it.
+func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
+	if err := res.Error(); err != nil {
+		return err
+	}
+
+	var out linux.FUSEInitOut
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return err
+	}
+
+	return conn.initProcessReply(&out, hasSysAdminCap)
+}
+
+// Process the FUSE_INIT reply from the FUSE server.
+func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+	// No support for old major fuse versions.
+	if out.Major != linux.FUSE_KERNEL_VERSION {
+		conn.connInitError = true
+
+		// Set the connection as initialized and unblock the blocked requests
+		// (i.e. return error for them).
+		conn.SetInitialized()
+
+		return nil
+	}
+
+	// Start processing the reply.
+	conn.connInitSuccess = true
+	conn.minor = out.Minor
+
+	// No support for limits before minor version 13.
+	if out.Minor >= 13 {
+		conn.bgLock.Lock()
+
+		if out.MaxBackground > 0 {
+			conn.maxBackground = out.MaxBackground
+
+			if !hasSysAdminCap &&
+				conn.maxBackground > MaxUserBackgroundRequest {
+				conn.maxBackground = MaxUserBackgroundRequest
+			}
+		}
+
+		if out.CongestionThreshold > 0 {
+			conn.congestionThreshold = out.CongestionThreshold
+
+			if !hasSysAdminCap &&
+				conn.congestionThreshold > MaxUserCongestionThreshold {
+				conn.congestionThreshold = MaxUserCongestionThreshold
+			}
+		}
+
+		conn.bgLock.Unlock()
+	}
+
+	// No support for the following flags before minor version 6.
+	if out.Minor >= 6 {
+		conn.asyncRead = out.Flags&linux.FUSE_ASYNC_READ != 0
+		conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
+		conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
+		conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
+		conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
+		conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
+
+		// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
+
+		if out.Flags&linux.FUSE_MAX_PAGES != 0 {
+			maxPages := out.MaxPages
+			if maxPages < 1 {
+				maxPages = 1
+			}
+			if maxPages > fuseMaxMaxPages {
+				maxPages = fuseMaxMaxPages
+			}
+			conn.maxPages = maxPages
+		}
+	}
+
+	// No support for negotiating MaxWrite before minor version 5.
+	if out.Minor >= 5 {
+		conn.maxWrite = out.MaxWrite
+	} else {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+	if conn.maxWrite < fuseMinMaxWrite {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+
+	// Set connection as initialized and unblock the requests
+	// issued before init.
+	conn.SetInitialized()
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/register.go b/pkg/sentry/fsimpl/fuse/register.go
new file mode 100644
index 000000000..b5b581152
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/register.go
@@ -0,0 +1,42 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Register registers the FUSE device with vfsObj.
+func Register(vfsObj *vfs.VirtualFilesystem) error {
+	if err := vfsObj.RegisterDevice(vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, fuseDevice{}, &vfs.RegisterDeviceOptions{
+		GroupName: "misc",
+	}); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// CreateDevtmpfsFile creates a device special file in devtmpfs.
+func CreateDevtmpfsFile(ctx context.Context, dev *devtmpfs.Accessor) error {
+	if err := dev.CreateDeviceFile(ctx, "fuse", vfs.CharDevice, linux.MISC_MAJOR, fuseDevMinor, 0666 /* mode */); err != nil {
+		return err
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
new file mode 100644
index 000000000..16787116f
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -0,0 +1,90 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "gofer",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dentry",
+        "Linker": "*dentry",
+    },
+)
+
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "gofer",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
+go_library(
+    name = "gofer",
+    srcs = [
+        "dentry_list.go",
+        "directory.go",
+        "filesystem.go",
+        "fstree.go",
+        "gofer.go",
+        "handle.go",
+        "host_named_pipe.go",
+        "p9file.go",
+        "regular_file.go",
+        "socket.go",
+        "special_file.go",
+        "symlink.go",
+        "time.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fd",
+        "//pkg/fdnotifier",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/p9",
+        "//pkg/safemem",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsimpl/host",
+        "//pkg/sentry/hostfd",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/syserr",
+        "//pkg/syserror",
+        "//pkg/unet",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
+
+go_test(
+    name = "gofer_test",
+    srcs = ["gofer_test.go"],
+    library = ":gofer",
+    deps = [
+        "//pkg/p9",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/pgalloc",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
new file mode 100644
index 000000000..2a8011eb4
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -0,0 +1,306 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isDir() bool {
+	return d.fileType() == linux.S_IFDIR
+}
+
+// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
+// d.isDir(). child must be a newly-created dentry that has never had a parent.
+func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
+	d.IncRef() // reference held by child on its parent
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*dentry)
+	}
+	d.children[name] = child
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) cacheNegativeLookupLocked(name string) {
+	// Don't cache negative lookups if InteropModeShared is in effect (since
+	// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
+	// case the only files in the directory are those for which a dentry exists
+	// in d.children). Instead, just delete any previously-cached dentry.
+	if d.fs.opts.interop == InteropModeShared || d.isSynthetic() {
+		delete(d.children, name)
+		return
+	}
+	if d.children == nil {
+		d.children = make(map[string]*dentry)
+	}
+	d.children[name] = nil
+}
+
+type createSyntheticOpts struct {
+	name string
+	mode linux.FileMode
+	kuid auth.KUID
+	kgid auth.KGID
+
+	// The endpoint for a synthetic socket. endpoint should be nil if the file
+	// being created is not a socket.
+	endpoint transport.BoundEndpoint
+
+	// pipe should be nil if the file being created is not a pipe.
+	pipe *pipe.VFSPipe
+}
+
+// createSyntheticChildLocked creates a synthetic file with the given name
+// in d.
+//
+// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
+// a child with the given name.
+func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
+	d2 := &dentry{
+		refs:      1, // held by d
+		fs:        d.fs,
+		ino:       d.fs.nextSyntheticIno(),
+		mode:      uint32(opts.mode),
+		uid:       uint32(opts.kuid),
+		gid:       uint32(opts.kgid),
+		blockSize: usermem.PageSize, // arbitrary
+		hostFD:    -1,
+		nlink:     uint32(2),
+	}
+	switch opts.mode.FileType() {
+	case linux.S_IFDIR:
+		// Nothing else needs to be done.
+	case linux.S_IFSOCK:
+		d2.endpoint = opts.endpoint
+	case linux.S_IFIFO:
+		d2.pipe = opts.pipe
+	default:
+		panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
+	}
+	d2.pf.dentry = d2
+	d2.vfsd.Init(d2)
+
+	d.cacheNewChildLocked(d2, opts.name)
+	d.syntheticChildren++
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	mu      sync.Mutex
+	off     int64
+	dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release(context.Context) {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	d := fd.dentry()
+	if fd.dirents == nil {
+		ds, err := d.getDirents(ctx)
+		if err != nil {
+			return err
+		}
+		fd.dirents = ds
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+	if d.cachedMetadataAuthoritative() {
+		d.touchAtime(fd.vfsfd.Mount())
+	}
+
+	for fd.off < int64(len(fd.dirents)) {
+		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+			return err
+		}
+		fd.off++
+	}
+	return nil
+}
+
+// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
+	// presence of concurrent mutation of an iterated directory, so
+	// implementations may duplicate or omit entries in this case, which
+	// violates POSIX semantics. Thus we read all directory entries while
+	// holding d.dirMu to exclude directory mutations. (Note that it is
+	// impossible for the client to exclude concurrent mutation from other
+	// remote filesystem users. Since there is no way to detect if the server
+	// has incorrectly omitted directory entries, we simply assume that the
+	// server is well-behaved under InteropModeShared.) This is inconsistent
+	// with Linux (which appears to assume that directory fids have the correct
+	// semantics, and translates struct file_operations::readdir calls directly
+	// to readdir RPCs), but is consistent with VFS1.
+
+	// filesystem.renameMu is needed for d.parent, and must be locked before
+	// dentry.dirMu.
+	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+	if d.dirents != nil {
+		return d.dirents, nil
+	}
+
+	// It's not clear if 9P2000.L's readdir is expected to return "." and "..",
+	// so we generate them here.
+	parent := genericParentOrSelf(d)
+	dirents := []vfs.Dirent{
+		{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     uint64(d.ino),
+			NextOff: 1,
+		},
+		{
+			Name:    "..",
+			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
+			Ino:     uint64(parent.ino),
+			NextOff: 2,
+		},
+	}
+	var realChildren map[string]struct{}
+	if !d.isSynthetic() {
+		if d.syntheticChildren != 0 && d.fs.opts.interop == InteropModeShared {
+			// Record the set of children d actually has so that we don't emit
+			// duplicate entries for synthetic children.
+			realChildren = make(map[string]struct{})
+		}
+		off := uint64(0)
+		const count = 64 * 1024 // for consistency with the vfs1 client
+		d.handleMu.RLock()
+		if d.readFile.isNil() {
+			// This should not be possible because a readable handle should
+			// have been opened when the calling directoryFD was opened.
+			d.handleMu.RUnlock()
+			panic("gofer.dentry.getDirents called without a readable handle")
+		}
+		for {
+			p9ds, err := d.readFile.readdir(ctx, off, count)
+			if err != nil {
+				d.handleMu.RUnlock()
+				return nil, err
+			}
+			if len(p9ds) == 0 {
+				d.handleMu.RUnlock()
+				break
+			}
+			for _, p9d := range p9ds {
+				if p9d.Name == "." || p9d.Name == ".." {
+					continue
+				}
+				dirent := vfs.Dirent{
+					Name:    p9d.Name,
+					Ino:     uint64(inoFromPath(p9d.QID.Path)),
+					NextOff: int64(len(dirents) + 1),
+				}
+				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
+				// DMSOCKET.
+				switch p9d.Type {
+				case p9.TypeSymlink:
+					dirent.Type = linux.DT_LNK
+				case p9.TypeDir:
+					dirent.Type = linux.DT_DIR
+				default:
+					dirent.Type = linux.DT_REG
+				}
+				dirents = append(dirents, dirent)
+				if realChildren != nil {
+					realChildren[p9d.Name] = struct{}{}
+				}
+			}
+			off = p9ds[len(p9ds)-1].Offset
+		}
+	}
+	// Emit entries for synthetic children.
+	if d.syntheticChildren != 0 {
+		for _, child := range d.children {
+			if child == nil || !child.isSynthetic() {
+				continue
+			}
+			if _, ok := realChildren[child.name]; ok {
+				continue
+			}
+			dirents = append(dirents, vfs.Dirent{
+				Name:    child.name,
+				Type:    uint8(atomic.LoadUint32(&child.mode) >> 12),
+				Ino:     uint64(child.ino),
+				NextOff: int64(len(dirents) + 1),
+			})
+		}
+	}
+	// Cache dirents for future directoryFDs if permitted.
+	if d.cachedMetadataAuthoritative() {
+		d.dirents = dirents
+	}
+	return dirents, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		if offset == 0 {
+			// Ensure that the next call to fd.IterDirents() calls
+			// fd.dentry().getDirents().
+			fd.dirents = nil
+		}
+		fd.off = offset
+		return fd.off, nil
+	case linux.SEEK_CUR:
+		offset += fd.off
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		// Don't clear fd.dirents in this case, even if offset == 0.
+		fd.off = offset
+		return fd.off, nil
+	default:
+		return 0, syserror.EINVAL
+	}
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+	return fd.dentry().syncRemoteFile(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
new file mode 100644
index 000000000..a3903db33
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -0,0 +1,1550 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// Snapshot current syncable dentries and special files.
+	fs.syncMu.Lock()
+	ds := make([]*dentry, 0, len(fs.syncableDentries))
+	for d := range fs.syncableDentries {
+		d.IncRef()
+		ds = append(ds, d)
+	}
+	sffds := make([]*specialFileFD, 0, len(fs.specialFileFDs))
+	for sffd := range fs.specialFileFDs {
+		sffd.vfsfd.IncRef()
+		sffds = append(sffds, sffd)
+	}
+	fs.syncMu.Unlock()
+
+	// Return the first error we encounter, but sync everything we can
+	// regardless.
+	var retErr error
+
+	// Sync regular files.
+	for _, d := range ds {
+		err := d.syncCachedFile(ctx)
+		d.DecRef(ctx)
+		if err != nil && retErr == nil {
+			retErr = err
+		}
+	}
+
+	// Sync special files, which may be writable but do not use dentry shared
+	// handles (so they won't be synced by the above).
+	for _, sffd := range sffds {
+		err := sffd.Sync(ctx)
+		sffd.vfsfd.DecRef(ctx)
+		if err != nil && retErr == nil {
+			retErr = err
+		}
+	}
+
+	return retErr
+}
+
+// maxFilenameLen is the maximum length of a filename. This is dictated by 9P's
+// encoding of strings, which uses 2 bytes for the length prefix.
+const maxFilenameLen = (1 << 16) - 1
+
+// dentrySlicePool is a pool of *[]*dentry used to store dentries for which
+// dentry.checkCachingLocked() must be called. The pool holds pointers to
+// slices because Go lacks generics, so sync.Pool operates on interface{}, so
+// every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy
+// of the slice header on the heap.
+var dentrySlicePool = sync.Pool{
+	New: func() interface{} {
+		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+		return &ds
+	},
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil {
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	// Allow dentries to be GC'd.
+	for i := range *ds {
+		(*ds)[i] = nil
+	}
+	*ds = (*ds)[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may become cached as a result of the traversal are appended
+// to *ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
+// must be up to date.
+//
+// Postconditions: The returned dentry's cached metadata is up to date.
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		// We must assume that d.parent is correct, because if d has been moved
+		// elsewhere in the remote filesystem so that its parent has changed,
+		// we have no way of determining its new parent's location in the
+		// filesystem.
+		//
+		// Call rp.CheckMount() before updating d.parent's metadata, since if
+		// we traverse to another mount then d.parent's metadata is irrelevant.
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		if d != d.parent && !d.cachedMetadataAuthoritative() {
+			if err := d.parent.updateFromGetattr(ctx); err != nil {
+				return nil, err
+			}
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	child, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), d, name, ds)
+	if err != nil {
+		return nil, err
+	}
+	if child == nil {
+		return nil, syserror.ENOENT
+	}
+	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// getChildLocked returns a dentry representing the child of parent with the
+// given name. If no such child exists, getChildLocked returns (nil, nil).
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// parent.isDir(). name is not "." or "..".
+//
+// Postconditions: If getChildLocked returns a non-nil dentry, its cached
+// metadata is up to date.
+func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	if len(name) > maxFilenameLen {
+		return nil, syserror.ENAMETOOLONG
+	}
+	child, ok := parent.children[name]
+	if (ok && fs.opts.interop != InteropModeShared) || parent.isSynthetic() {
+		// Whether child is nil or not, it is cached information that is
+		// assumed to be correct.
+		return child, nil
+	}
+	// We either don't have cached information or need to verify that it's
+	// still correct, either of which requires a remote lookup. Check if this
+	// name is valid before performing the lookup.
+	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
+}
+
+// Preconditions: As for getChildLocked. !parent.isSynthetic().
+func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
+	if child != nil {
+		// Need to lock child.metadataMu because we might be updating child
+		// metadata. We need to hold the lock *before* getting metadata from the
+		// server and release it after updating local metadata.
+		child.metadataMu.Lock()
+	}
+	qid, file, attrMask, attr, err := parent.file.walkGetAttrOne(ctx, name)
+	if err != nil && err != syserror.ENOENT {
+		if child != nil {
+			child.metadataMu.Unlock()
+		}
+		return nil, err
+	}
+	if child != nil {
+		if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+			// The file at this path hasn't changed. Just update cached metadata.
+			file.close(ctx)
+			child.updateFromP9AttrsLocked(attrMask, &attr)
+			child.metadataMu.Unlock()
+			return child, nil
+		}
+		child.metadataMu.Unlock()
+		if file.isNil() && child.isSynthetic() {
+			// We have a synthetic file, and no remote file has arisen to
+			// replace it.
+			return child, nil
+		}
+		// The file at this path has changed or no longer exists. Mark the
+		// dentry invalidated, and re-evaluate its caching status (i.e. if it
+		// has 0 references, drop it). Wait to update parent.children until we
+		// know what to replace the existing dentry with (i.e. one of the
+		// returns below), to avoid a redundant map access.
+		vfsObj.InvalidateDentry(ctx, &child.vfsd)
+		if child.isSynthetic() {
+			// Normally we don't mark invalidated dentries as deleted since
+			// they may still exist (but at a different path), and also for
+			// consistency with Linux. However, synthetic files are guaranteed
+			// to become unreachable if their dentries are invalidated, so
+			// treat their invalidation as deletion.
+			child.setDeleted()
+			parent.syntheticChildren--
+			child.decRefLocked()
+			parent.dirents = nil
+		}
+		*ds = appendDentry(*ds, child)
+	}
+	if file.isNil() {
+		// No file exists at this path now. Cache the negative lookup if
+		// allowed.
+		parent.cacheNegativeLookupLocked(name)
+		return nil, nil
+	}
+	// Create a new dentry representing the file.
+	child, err = fs.newDentry(ctx, file, qid, attrMask, &attr)
+	if err != nil {
+		file.close(ctx)
+		delete(parent.children, name)
+		return nil, err
+	}
+	parent.cacheNewChildLocked(child, name)
+	// For now, child has 0 references, so our caller should call
+	// child.checkCachingLocked().
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done(). If
+// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
+// date.
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	if !d.cachedMetadataAuthoritative() {
+		// Get updated metadata for rp.Start() as required by fs.stepLocked().
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	for !rp.Done() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// createInRemoteDir (if the parent directory is a real remote directory) or
+// createInSyntheticDir (if the parent directory is synthetic) to do so.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	if !start.cachedMetadataAuthoritative() {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	if len(name) > maxFilenameLen {
+		return syserror.ENAMETOOLONG
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
+	if parent.isDeleted() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	if parent.isSynthetic() {
+		if child := parent.children[name]; child != nil {
+			return syserror.EEXIST
+		}
+		if createInSyntheticDir == nil {
+			return syserror.EPERM
+		}
+		if err := createInSyntheticDir(parent, name); err != nil {
+			return err
+		}
+		parent.touchCMtime()
+		parent.dirents = nil
+		ev := linux.IN_CREATE
+		if dir {
+			ev |= linux.IN_ISDIR
+		}
+		parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+		return nil
+	}
+	if fs.opts.interop == InteropModeShared {
+		if child := parent.children[name]; child != nil && child.isSynthetic() {
+			return syserror.EEXIST
+		}
+		// The existence of a non-synthetic dentry at name would be inconclusive
+		// because the file it represents may have been deleted from the remote
+		// filesystem, so we would need to make an RPC to revalidate the dentry.
+		// Just attempt the file creation RPC instead. If a file does exist, the
+		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
+		// stale dentry exists, the dentry will fail revalidation next time it's
+		// used.
+		if err := createInRemoteDir(parent, name); err != nil {
+			return err
+		}
+		ev := linux.IN_CREATE
+		if dir {
+			ev |= linux.IN_ISDIR
+		}
+		parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+		return nil
+	}
+	if child := parent.children[name]; child != nil {
+		return syserror.EEXIST
+	}
+	// No cached dentry exists; however, there might still be an existing file
+	// at name. As above, we attempt the file creation RPC anyway.
+	if err := createInRemoteDir(parent, name); err != nil {
+		return err
+	}
+	if child, ok := parent.children[name]; ok && child == nil {
+		// Delete the now-stale negative dentry.
+		delete(parent.children, name)
+	}
+	parent.touchCMtime()
+	parent.dirents = nil
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+	return nil
+}
+
+// Preconditions: !rp.Done().
+func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	if !start.cachedMetadataAuthoritative() {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	name := rp.Component()
+	if dir {
+		if name == "." {
+			return syserror.EINVAL
+		}
+		if name == ".." {
+			return syserror.ENOTEMPTY
+		}
+	} else {
+		if name == "." || name == ".." {
+			return syserror.EISDIR
+		}
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	child, ok := parent.children[name]
+	if ok && child == nil {
+		return syserror.ENOENT
+	}
+
+	sticky := atomic.LoadUint32(&parent.mode)&linux.ModeSticky != 0
+	if sticky {
+		if !ok {
+			// If the sticky bit is set, we need to retrieve the child to determine
+			// whether removing it is allowed.
+			child, err = fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+			if err != nil {
+				return err
+			}
+		} else if child != nil && !child.cachedMetadataAuthoritative() {
+			// Make sure the dentry representing the file at name is up to date
+			// before examining its metadata.
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+			if err != nil {
+				return err
+			}
+		}
+		if err := parent.mayDelete(rp.Credentials(), child); err != nil {
+			return err
+		}
+	}
+
+	// If a child dentry exists, prepare to delete it. This should fail if it is
+	// a mount point. We detect mount points by speculatively calling
+	// PrepareDeleteDentry, which fails if child is a mount point. However, we
+	// may need to revalidate the file in this case to make sure that it has not
+	// been deleted or replaced on the remote fs, in which case the mount point
+	// will have disappeared. If calling PrepareDeleteDentry fails again on the
+	// up-to-date dentry, we can be sure that it is a mount point.
+	//
+	// Also note that if child is nil, then it can't be a mount point.
+	if child != nil {
+		// Hold child.dirMu so we can check child.children and
+		// child.syntheticChildren. We don't access these fields until a bit later,
+		// but locking child.dirMu after calling vfs.PrepareDeleteDentry() would
+		// create an inconsistent lock ordering between dentry.dirMu and
+		// vfs.Dentry.mu (in the VFS lock order, it would make dentry.dirMu both "a
+		// FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
+		// PrepareDeleteDentry and CommitDeleteDentry). To avoid this, lock
+		// child.dirMu before calling PrepareDeleteDentry.
+		child.dirMu.Lock()
+		defer child.dirMu.Unlock()
+		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+			// We can skip revalidation in several cases:
+			// - We are not in InteropModeShared
+			// - The parent directory is synthetic, in which case the child must also
+			//   be synthetic
+			// - We already updated the child during the sticky bit check above
+			if parent.cachedMetadataAuthoritative() || sticky {
+				return err
+			}
+			child, err = fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, &ds)
+			if err != nil {
+				return err
+			}
+			if child != nil {
+				if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+					return err
+				}
+			}
+		}
+	}
+	flags := uint32(0)
+	// If a dentry exists, use it for best-effort checks on its deletability.
+	if dir {
+		if child != nil {
+			// child must be an empty directory.
+			if child.syntheticChildren != 0 {
+				// This is definitely not an empty directory, irrespective of
+				// fs.opts.interop.
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+				return syserror.ENOTEMPTY
+			}
+			// If InteropModeShared is in effect and the first call to
+			// PrepareDeleteDentry above succeeded, then child wasn't
+			// revalidated (so we can't expect its file type to be correct) and
+			// individually revalidating its children (to confirm that they
+			// still exist) would be a waste of time.
+			if child.cachedMetadataAuthoritative() {
+				if !child.isDir() {
+					vfsObj.AbortDeleteDentry(&child.vfsd)
+					return syserror.ENOTDIR
+				}
+				for _, grandchild := range child.children {
+					if grandchild != nil {
+						vfsObj.AbortDeleteDentry(&child.vfsd)
+						return syserror.ENOTEMPTY
+					}
+				}
+			}
+		}
+		flags = linux.AT_REMOVEDIR
+	} else {
+		// child must be a non-directory file.
+		if child != nil && child.isDir() {
+			vfsObj.AbortDeleteDentry(&child.vfsd)
+			return syserror.EISDIR
+		}
+		if rp.MustBeDir() {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
+			return syserror.ENOTDIR
+		}
+	}
+	if parent.isSynthetic() {
+		if child == nil {
+			return syserror.ENOENT
+		}
+	} else if child == nil || !child.isSynthetic() {
+		err = parent.file.unlinkAt(ctx, name, flags)
+		if err != nil {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
+			return err
+		}
+	}
+
+	// Generate inotify events for rmdir or unlink.
+	if dir {
+		parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+	} else {
+		var cw *vfs.Watches
+		if child != nil {
+			cw = &child.watches
+		}
+		vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
+	}
+
+	if child != nil {
+		vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+		child.setDeleted()
+		if child.isSynthetic() {
+			parent.syntheticChildren--
+			child.decRefLocked()
+		}
+		ds = appendDentry(ds, child)
+	}
+	parent.cacheNegativeLookupLocked(name)
+	if parent.cachedMetadataAuthoritative() {
+		parent.dirents = nil
+		parent.touchCMtime()
+		if dir {
+			parent.decLinks()
+		}
+	}
+	return nil
+}
+
+// renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls
+// dentry.checkCachingLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil {
+		return
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock()
+		for _, d := range **ds {
+			d.checkCachingLocked(ctx)
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) {
+	if *ds == nil {
+		fs.renameMu.Unlock()
+		return
+	}
+	for _, d := range **ds {
+		d.checkCachingLocked(ctx)
+	}
+	fs.renameMu.Unlock()
+	putDentrySlice(*ds)
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.checkPermissions(creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	if !start.cachedMetadataAuthoritative() {
+		// Get updated metadata for start as required by
+		// fs.walkParentDirLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		d := vd.Dentry().Impl().(*dentry)
+		if d.isDir() {
+			return syserror.EPERM
+		}
+		gid := auth.KGID(atomic.LoadUint32(&d.gid))
+		uid := auth.KUID(atomic.LoadUint32(&d.uid))
+		mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+		if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
+			return err
+		}
+		if d.nlink == 0 {
+			return syserror.ENOENT
+		}
+		if d.nlink == math.MaxUint32 {
+			return syserror.EMLINK
+		}
+		if err := parent.file.link(ctx, d.file, childName); err != nil {
+			return err
+		}
+
+		// Success!
+		atomic.AddUint32(&d.nlink, 1)
+		return nil
+	}, nil)
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	creds := rp.Credentials()
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
+			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+				return err
+			}
+			ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name: name,
+				mode: linux.S_IFDIR | opts.Mode,
+				kuid: creds.EffectiveKUID,
+				kgid: creds.EffectiveKGID,
+			})
+		}
+		if fs.opts.interop != InteropModeShared {
+			parent.incLinks()
+		}
+		return nil
+	}, func(parent *dentry, name string) error {
+		if !opts.ForSyntheticMountpoint {
+			// Can't create non-synthetic files in synthetic directories.
+			return syserror.EPERM
+		}
+		parent.createSyntheticChildLocked(&createSyntheticOpts{
+			name: name,
+			mode: linux.S_IFDIR | opts.Mode,
+			kuid: creds.EffectiveKUID,
+			kgid: creds.EffectiveKGID,
+		})
+		parent.incLinks()
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		// If the gofer does not allow creating a socket or pipe, create a
+		// synthetic one, i.e. one that is kept entirely in memory.
+		if err == syserror.EPERM {
+			switch opts.Mode.FileType() {
+			case linux.S_IFSOCK:
+				parent.createSyntheticChildLocked(&createSyntheticOpts{
+					name:     name,
+					mode:     opts.Mode,
+					kuid:     creds.EffectiveKUID,
+					kgid:     creds.EffectiveKGID,
+					endpoint: opts.Endpoint,
+				})
+				return nil
+			case linux.S_IFIFO:
+				parent.createSyntheticChildLocked(&createSyntheticOpts{
+					name: name,
+					mode: opts.Mode,
+					kuid: creds.EffectiveKUID,
+					kgid: creds.EffectiveKGID,
+					pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+				})
+				return nil
+			}
+		}
+		return err
+	}, nil)
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Reject O_TMPFILE, which is not supported; supporting it correctly in the
+	// presence of other remote filesystem users requires remote filesystem
+	// support, and it isn't clear that there's any way to implement this in
+	// 9P.
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		return nil, syserror.EOPNOTSUPP
+	}
+	mayCreate := opts.Flags&linux.O_CREAT != 0
+	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if !start.cachedMetadataAuthoritative() {
+		// Get updated metadata for start as required by fs.stepLocked().
+		if err := start.updateFromGetattr(ctx); err != nil {
+			return nil, err
+		}
+	}
+	if rp.Done() {
+		// Reject attempts to open mount root directory with O_CREAT.
+		if mayCreate && rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.openLocked(ctx, rp, &opts)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if mayCreate && rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	// Determine whether or not we need to create a file.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+	if err == syserror.ENOENT && mayCreate {
+		if parent.isSynthetic() {
+			parent.dirMu.Unlock()
+			return nil, syserror.EPERM
+		}
+		fd, err := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds)
+		parent.dirMu.Unlock()
+		return fd, err
+	}
+	parent.dirMu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	// Open existing child or follow symlink.
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent
+		goto afterTrailingSymlink
+	}
+	if rp.MustBeDir() && !child.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+
+	trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG
+	if trunc {
+		// Lock metadataMu *while* we open a regular file with O_TRUNC because
+		// open(2) will change the file size on server.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+	}
+
+	var vfd *vfs.FileDescription
+	var err error
+	mnt := rp.Mount()
+	switch d.fileType() {
+	case linux.S_IFREG:
+		if !d.fs.opts.regularFilesUseSpecialFileFD {
+			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, ats&vfs.MayWrite != 0, trunc); err != nil {
+				return nil, err
+			}
+			fd := &regularFileFD{}
+			fd.LockFD.Init(&d.locks)
+			if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+				AllowDirectIO: true,
+			}); err != nil {
+				return nil, err
+			}
+			vfd = &fd.vfsfd
+		}
+	case linux.S_IFDIR:
+		// Can't open directories with O_CREAT.
+		if opts.Flags&linux.O_CREAT != 0 {
+			return nil, syserror.EISDIR
+		}
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		if opts.Flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		if !d.isSynthetic() {
+			if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil {
+				return nil, err
+			}
+		}
+		fd := &directoryFD{}
+		fd.LockFD.Init(&d.locks)
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case linux.S_IFLNK:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	case linux.S_IFSOCK:
+		if d.isSynthetic() {
+			return nil, syserror.ENXIO
+		}
+		if d.fs.iopts.OpenSocketsByConnecting {
+			return d.connectSocketLocked(ctx, opts)
+		}
+	case linux.S_IFIFO:
+		if d.isSynthetic() {
+			return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks)
+		}
+	}
+
+	if vfd == nil {
+		if vfd, err = d.openSpecialFileLocked(ctx, mnt, opts); err != nil {
+			return nil, err
+		}
+	}
+
+	if trunc {
+		// If no errors occured so far then update file size in memory. This
+		// step is required even if !d.cachedMetadataAuthoritative() because
+		// d.mappings has to be updated.
+		// d.metadataMu has already been acquired if trunc == true.
+		d.updateFileSizeLocked(0)
+
+		if d.cachedMetadataAuthoritative() {
+			d.touchCMtimeLocked()
+		}
+	}
+	return vfd, err
+}
+
+func (d *dentry) connectSocketLocked(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if opts.Flags&linux.O_DIRECT != 0 {
+		return nil, syserror.EINVAL
+	}
+	fdObj, err := d.file.connect(ctx, p9.AnonymousSocket)
+	if err != nil {
+		return nil, err
+	}
+	fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fdObj.FD(), &host.NewFDOptions{
+		HaveFlags: true,
+		Flags:     opts.Flags,
+	})
+	if err != nil {
+		fdObj.Close()
+		return nil, err
+	}
+	fdObj.Release()
+	return fd, nil
+}
+
+func (d *dentry) openSpecialFileLocked(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if opts.Flags&linux.O_DIRECT != 0 {
+		return nil, syserror.EINVAL
+	}
+	// We assume that the server silently inserts O_NONBLOCK in the open flags
+	// for all named pipes (because all existing gofers do this).
+	//
+	// NOTE(b/133875563): This makes named pipe opens racy, because the
+	// mechanisms for translating nonblocking to blocking opens can only detect
+	// the instantaneous presence of a peer holding the other end of the pipe
+	// open, not whether the pipe was *previously* opened by a peer that has
+	// since closed its end.
+	isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
+retry:
+	h, err := openHandle(ctx, d.file, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
+	if err != nil {
+		if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && err == syserror.ENXIO {
+			// An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
+			// with ENXIO if opening the same named pipe with O_WRONLY would
+			// block because there are no readers of the pipe.
+			if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+				return nil, err
+			}
+			goto retry
+		}
+		return nil, err
+	}
+	if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
+		if err := blockUntilNonblockingPipeHasWriter(ctx, h.fd); err != nil {
+			h.close(ctx)
+			return nil, err
+		}
+	}
+	fd, err := newSpecialFileFD(h, mnt, d, &d.locks, opts.Flags)
+	if err != nil {
+		h.close(ctx)
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
+// !d.isSynthetic().
+func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	if d.isDeleted() {
+		return nil, syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return nil, err
+	}
+	defer mnt.EndWrite()
+
+	// 9P2000.L's lcreate takes a fid representing the parent directory, and
+	// converts it into an open fid representing the created file, so we need
+	// to duplicate the directory fid first.
+	_, dirfile, err := d.file.walk(ctx, nil)
+	if err != nil {
+		return nil, err
+	}
+	creds := rp.Credentials()
+	name := rp.Component()
+	// We only want the access mode for creating the file.
+	createFlags := p9.OpenFlags(opts.Flags) & p9.OpenFlagsModeMask
+	fdobj, openFile, createQID, _, err := dirfile.create(ctx, name, createFlags, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+	if err != nil {
+		dirfile.close(ctx)
+		return nil, err
+	}
+	// Then we need to walk to the file we just created to get a non-open fid
+	// representing it, and to get its metadata. This must use d.file since, as
+	// explained above, dirfile was invalidated by dirfile.Create().
+	_, nonOpenFile, attrMask, attr, err := d.file.walkGetAttrOne(ctx, name)
+	if err != nil {
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, err
+	}
+
+	// Construct the new dentry.
+	child, err := d.fs.newDentry(ctx, nonOpenFile, createQID, attrMask, &attr)
+	if err != nil {
+		nonOpenFile.close(ctx)
+		openFile.close(ctx)
+		if fdobj != nil {
+			fdobj.Close()
+		}
+		return nil, err
+	}
+	*ds = appendDentry(*ds, child)
+	// Incorporate the fid that was opened by lcreate.
+	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
+	if useRegularFileFD {
+		child.handleMu.Lock()
+		if vfs.MayReadFileWithOpenFlags(opts.Flags) {
+			child.readFile = openFile
+			if fdobj != nil {
+				child.hostFD = int32(fdobj.Release())
+			}
+		} else if fdobj != nil {
+			// Can't use fdobj if it's not readable.
+			fdobj.Close()
+		}
+		if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
+			child.writeFile = openFile
+		}
+		child.handleMu.Unlock()
+	}
+	// Insert the dentry into the tree.
+	d.cacheNewChildLocked(child, name)
+	if d.cachedMetadataAuthoritative() {
+		d.touchCMtime()
+		d.dirents = nil
+	}
+
+	// Finally, construct a file description representing the created file.
+	var childVFSFD *vfs.FileDescription
+	if useRegularFileFD {
+		fd := &regularFileFD{}
+		fd.LockFD.Init(&child.locks)
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &child.vfsd, &vfs.FileDescriptionOptions{
+			AllowDirectIO: true,
+		}); err != nil {
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	} else {
+		h := handle{
+			file: openFile,
+			fd:   -1,
+		}
+		if fdobj != nil {
+			h.fd = int32(fdobj.Release())
+		}
+		fd, err := newSpecialFileFD(h, mnt, child, &d.locks, opts.Flags)
+		if err != nil {
+			h.close(ctx)
+			return nil, err
+		}
+		childVFSFD = &fd.vfsfd
+	}
+	d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
+	return childVFSFD, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	if !d.isSymlink() {
+		return "", syserror.EINVAL
+	}
+	return d.readlink(ctx, rp.Mount())
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// Requires 9P support.
+		return syserror.EINVAL
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.Lock()
+	defer fs.renameMuUnlockAndCheckCaching(ctx, &ds)
+	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if !oldParent.cachedMetadataAuthoritative() {
+		if err := oldParent.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	creds := rp.Credentials()
+	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	vfsObj := rp.VirtualFilesystem()
+	// We need a dentry representing the renamed file since, if it's a
+	// directory, we need to check for write permission on it.
+	oldParent.dirMu.Lock()
+	defer oldParent.dirMu.Unlock()
+	renamed, err := fs.getChildLocked(ctx, vfsObj, oldParent, oldName, &ds)
+	if err != nil {
+		return err
+	}
+	if renamed == nil {
+		return syserror.ENOENT
+	}
+	if err := oldParent.mayDelete(creds, renamed); err != nil {
+		return err
+	}
+	if renamed.isDir() {
+		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if oldParent != newParent {
+		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+			return err
+		}
+		newParent.dirMu.Lock()
+		defer newParent.dirMu.Unlock()
+	}
+	if newParent.isDeleted() {
+		return syserror.ENOENT
+	}
+	replaced, err := fs.getChildLocked(ctx, rp.VirtualFilesystem(), newParent, newName, &ds)
+	if err != nil {
+		return err
+	}
+	var replacedVFSD *vfs.Dentry
+	if replaced != nil {
+		replacedVFSD = &replaced.vfsd
+		if replaced.isDir() {
+			if !renamed.isDir() {
+				return syserror.EISDIR
+			}
+		} else {
+			if rp.MustBeDir() || renamed.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	}
+
+	if oldParent == newParent && oldName == newName {
+		return nil
+	}
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+		return err
+	}
+
+	// Update the remote filesystem.
+	if !renamed.isSynthetic() {
+		if err := renamed.file.rename(ctx, newParent.file, newName); err != nil {
+			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+			return err
+		}
+	} else if replaced != nil && !replaced.isSynthetic() {
+		// We are replacing an existing real file with a synthetic one, so we
+		// need to unlink the former.
+		flags := uint32(0)
+		if replaced.isDir() {
+			flags = linux.AT_REMOVEDIR
+		}
+		if err := newParent.file.unlinkAt(ctx, newName, flags); err != nil {
+			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+			return err
+		}
+	}
+
+	// Update the dentry tree.
+	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+	if replaced != nil {
+		replaced.setDeleted()
+		if replaced.isSynthetic() {
+			newParent.syntheticChildren--
+			replaced.decRefLocked()
+		}
+		ds = appendDentry(ds, replaced)
+	}
+	oldParent.cacheNegativeLookupLocked(oldName)
+	// We don't use newParent.cacheNewChildLocked() since we don't want to mess
+	// with reference counts and queue oldParent for checkCachingLocked if the
+	// parent isn't actually changing.
+	if oldParent != newParent {
+		ds = appendDentry(ds, oldParent)
+		newParent.IncRef()
+		if renamed.isSynthetic() {
+			oldParent.syntheticChildren--
+			newParent.syntheticChildren++
+		}
+	}
+	renamed.parent = newParent
+	renamed.name = newName
+	if newParent.children == nil {
+		newParent.children = make(map[string]*dentry)
+	}
+	newParent.children[newName] = renamed
+
+	// Update metadata.
+	if renamed.cachedMetadataAuthoritative() {
+		renamed.touchCtime()
+	}
+	if oldParent.cachedMetadataAuthoritative() {
+		oldParent.dirents = nil
+		oldParent.touchCMtime()
+		if renamed.isDir() {
+			oldParent.decLinks()
+		}
+	}
+	if newParent.cachedMetadataAuthoritative() {
+		newParent.dirents = nil
+		newParent.touchCMtime()
+		if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
+			// Increase the link count if we did not replace another directory.
+			newParent.incLinks()
+		}
+	}
+	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, true /* dir */)
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+		return err
+	}
+	if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+	}
+	return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	// Since walking updates metadata for all traversed dentries under
+	// InteropModeShared, including the returned one, we can return cached
+	// metadata here regardless of fs.opts.interop.
+	var stat linux.Statx
+	d.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// If d is synthetic, invoke statfs on the first ancestor of d that isn't.
+	for d.isSynthetic() {
+		d = d.parent
+	}
+	fsstat, err := d.file.statFS(ctx)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	nameLen := uint64(fsstat.NameLength)
+	if nameLen > maxFilenameLen {
+		nameLen = maxFilenameLen
+	}
+	return linux.Statfs{
+		// This is primarily for distinguishing a gofer file system in
+		// tests. Testing is important, so instead of defining
+		// something completely random, use a standard value.
+		Type:            linux.V9FS_MAGIC,
+		BlockSize:       int64(fsstat.BlockSize),
+		Blocks:          fsstat.Blocks,
+		BlocksFree:      fsstat.BlocksFree,
+		BlocksAvailable: fsstat.BlocksAvailable,
+		Files:           fsstat.Files,
+		FilesFree:       fsstat.FilesFree,
+		NameLength:      nameLen,
+	}, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+		creds := rp.Credentials()
+		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
+		return err
+	}, nil)
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	return fs.unlinkAt(ctx, rp, false /* dir */)
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	if d.isSocket() {
+		if !d.isSynthetic() {
+			d.IncRef()
+			return &endpoint{
+				dentry: d,
+				file:   d.file.file,
+				path:   opts.Addr,
+			}, nil
+		}
+		return d.endpoint, nil
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	return d.listxattr(ctx, rp.Credentials(), size)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	return d.getxattr(ctx, rp.Credentials(), &opts)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+		return err
+	}
+	if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+		return err
+	}
+	if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
+		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+		return err
+	}
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+}
+
+func (fs *filesystem) nextSyntheticIno() inodeNumber {
+	return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
new file mode 100644
index 000000000..63e589859
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -0,0 +1,1708 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gofer provides a filesystem implementation that is backed by a 9p
+// server, interchangably referred to as "gofers" throughout this package.
+//
+// Lock order:
+//   regularFileFD/directoryFD.mu
+//     filesystem.renameMu
+//       dentry.dirMu
+//         filesystem.syncMu
+//         dentry.metadataMu
+//           *** "memmap.Mappable locks" below this point
+//           dentry.mapsMu
+//             *** "memmap.Mappable locks taken by Translate" below this point
+//             dentry.handleMu
+//               dentry.dataMu
+//
+// Locking dentry.dirMu in multiple dentries requires that either ancestor
+// dentries are locked before descendant dentries, or that filesystem.renameMu
+// is locked for writing.
+package gofer
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Name is the default filesystem name.
+const Name = "9p"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// mfp is used to allocate memory that caches regular file contents. mfp is
+	// immutable.
+	mfp pgalloc.MemoryFileProvider
+
+	// Immutable options.
+	opts  filesystemOptions
+	iopts InternalFilesystemOptions
+
+	// client is the client used by this filesystem. client is immutable.
+	client *p9.Client
+
+	// clock is a realtime clock used to set timestamps in file operations.
+	clock ktime.Clock
+
+	// devMinor is the filesystem's minor device number. devMinor is immutable.
+	devMinor uint32
+
+	// renameMu serves two purposes:
+	//
+	// - It synchronizes path resolution with renaming initiated by this
+	// client.
+	//
+	// - It is held by path resolution to ensure that reachable dentries remain
+	// valid. A dentry is reachable by path resolution if it has a non-zero
+	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
+	// is reachable from its children), or if it is a child dentry (such that
+	// it is reachable from its parent).
+	renameMu sync.RWMutex
+
+	// cachedDentries contains all dentries with 0 references. (Due to race
+	// conditions, it may also contain dentries with non-zero references.)
+	// cachedDentriesLen is the number of dentries in cachedDentries. These
+	// fields are protected by renameMu.
+	cachedDentries    dentryList
+	cachedDentriesLen uint64
+
+	// syncableDentries contains all dentries in this filesystem for which
+	// !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
+	// These fields are protected by syncMu.
+	syncMu           sync.Mutex
+	syncableDentries map[*dentry]struct{}
+	specialFileFDs   map[*specialFileFD]struct{}
+
+	// syntheticSeq stores a counter to used to generate unique inodeNumber for
+	// synthetic dentries.
+	syntheticSeq uint64
+}
+
+// inodeNumber represents inode number reported in Dirent.Ino. For regular
+// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
+// have have their inodeNumber generated sequentially, with the MSB reserved to
+// prevent conflicts with regular dentries.
+type inodeNumber uint64
+
+// Reserve MSB for synthetic mounts.
+const syntheticInoMask = uint64(1) << 63
+
+func inoFromPath(path uint64) inodeNumber {
+	if path&syntheticInoMask != 0 {
+		log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
+	}
+	return inodeNumber(path &^ syntheticInoMask)
+}
+
+type filesystemOptions struct {
+	// "Standard" 9P options.
+	fd      int
+	aname   string
+	interop InteropMode // derived from the "cache" mount option
+	dfltuid auth.KUID
+	dfltgid auth.KGID
+	msize   uint32
+	version string
+
+	// maxCachedDentries is the maximum number of dentries with 0 references
+	// retained by the client.
+	maxCachedDentries uint64
+
+	// If forcePageCache is true, host FDs may not be used for application
+	// memory mappings even if available; instead, the client must perform its
+	// own caching of regular file pages. This is primarily useful for testing.
+	forcePageCache bool
+
+	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
+	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
+	// makes memory accounting behavior more consistent between cases where
+	// host FDs are / are not available, but may increase the frequency of
+	// sentry-handled page faults on files for which a host FD is available.
+	limitHostFDTranslation bool
+
+	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
+	// filesystem may not be coherent with writable host FDs opened later, so
+	// all uses of the former must be replaced by uses of the latter. This is
+	// usually only the case when the remote filesystem is a Linux overlayfs
+	// mount. (Prior to Linux 4.18, patch series centered on commit
+	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
+	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
+	// series, only memory mappings are incoherent.)
+	overlayfsStaleRead bool
+
+	// If regularFilesUseSpecialFileFD is true, application FDs representing
+	// regular files will use distinct file handles for each FD, in the same
+	// way that application FDs representing "special files" such as sockets
+	// do. Note that this disables client caching and mmap for regular files.
+	regularFilesUseSpecialFileFD bool
+}
+
+// InteropMode controls the client's interaction with other remote filesystem
+// users.
+type InteropMode uint32
+
+const (
+	// InteropModeExclusive is appropriate when the filesystem client is the
+	// only user of the remote filesystem.
+	//
+	// - The client may cache arbitrary filesystem state (file data, metadata,
+	// filesystem structure, etc.).
+	//
+	// - Client changes to filesystem state may be sent to the remote
+	// filesystem asynchronously, except when server permission checks are
+	// necessary.
+	//
+	// - File timestamps are based on client clocks. This ensures that users of
+	// the client observe timestamps that are coherent with their own clocks
+	// and consistent with Linux's semantics (in particular, it is not always
+	// possible for clients to set arbitrary atimes and mtimes depending on the
+	// remote filesystem implementation, and never possible for clients to set
+	// arbitrary ctimes.) If a dentry containing a client-defined atime or
+	// mtime is evicted from cache, client timestamps will be sent to the
+	// remote filesystem on a best-effort basis to attempt to ensure that
+	// timestamps will be preserved when another dentry representing the same
+	// file is instantiated.
+	InteropModeExclusive InteropMode = iota
+
+	// InteropModeWritethrough is appropriate when there are read-only users of
+	// the remote filesystem that expect to observe changes made by the
+	// filesystem client.
+	//
+	// - The client may cache arbitrary filesystem state.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on client clocks. As a corollary, access
+	// timestamp changes from other remote filesystem users will not be visible
+	// to the client.
+	InteropModeWritethrough
+
+	// InteropModeShared is appropriate when there are users of the remote
+	// filesystem that may mutate its state other than the client.
+	//
+	// - The client must verify ("revalidate") cached filesystem state before
+	// using it.
+	//
+	// - Client changes to filesystem state must be sent to the remote
+	// filesystem synchronously.
+	//
+	// - File timestamps are based on server clocks. This is necessary to
+	// ensure that timestamp changes are synchronized between remote filesystem
+	// users.
+	//
+	// Note that the correctness of InteropModeShared depends on the server
+	// correctly implementing 9P fids (i.e. each fid immutably represents a
+	// single filesystem object), even in the presence of remote filesystem
+	// mutations from other users. If this is violated, the behavior of the
+	// client is undefined.
+	InteropModeShared
+)
+
+// InternalFilesystemOptions may be passed as
+// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+type InternalFilesystemOptions struct {
+	// If LeakConnection is true, do not close the connection to the server
+	// when the Filesystem is released. This is necessary for deployments in
+	// which servers can handle only a single client and report failure if that
+	// client disconnects.
+	LeakConnection bool
+
+	// If OpenSocketsByConnecting is true, silently translate attempts to open
+	// files identifying as sockets to connect RPCs.
+	OpenSocketsByConnecting bool
+}
+
+// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
+// UIDs and GIDs used for files that do not provide a specific owner or group
+// respectively.
+const (
+	// uint32(-2) doesn't work in Go.
+	_V9FS_DEFUID = auth.KUID(4294967294)
+	_V9FS_DEFGID = auth.KGID(4294967294)
+)
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+	if mfp == nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
+		return nil, nil, syserror.EINVAL
+	}
+
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	var fsopts filesystemOptions
+
+	// Check that the transport is "fd".
+	trans, ok := mopts["trans"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "trans")
+	if trans != "fd" {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Check that read and write FDs are provided and identical.
+	rfdstr, ok := mopts["rfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "rfdno")
+	rfd, err := strconv.Atoi(rfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	wfdstr, ok := mopts["wfdno"]
+	if !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>")
+		return nil, nil, syserror.EINVAL
+	}
+	delete(mopts, "wfdno")
+	wfd, err := strconv.Atoi(wfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
+		return nil, nil, syserror.EINVAL
+	}
+	if rfd != wfd {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+		return nil, nil, syserror.EINVAL
+	}
+	fsopts.fd = rfd
+
+	// Get the attach name.
+	fsopts.aname = "/"
+	if aname, ok := mopts["aname"]; ok {
+		delete(mopts, "aname")
+		fsopts.aname = aname
+	}
+
+	// Parse the cache policy. For historical reasons, this defaults to the
+	// least generally-applicable option, InteropModeExclusive.
+	fsopts.interop = InteropModeExclusive
+	if cache, ok := mopts["cache"]; ok {
+		delete(mopts, "cache")
+		switch cache {
+		case "fscache":
+			fsopts.interop = InteropModeExclusive
+		case "fscache_writethrough":
+			fsopts.interop = InteropModeWritethrough
+		case "none":
+			fsopts.regularFilesUseSpecialFileFD = true
+			fallthrough
+		case "remote_revalidating":
+			fsopts.interop = InteropModeShared
+		default:
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: cache=%s", cache)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
+	// Parse the default UID and GID.
+	fsopts.dfltuid = _V9FS_DEFUID
+	if dfltuidstr, ok := mopts["dfltuid"]; ok {
+		delete(mopts, "dfltuid")
+		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltuid=%s", dfltuidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
+		// in the caller's user namespace, but goferfs isn't
+		// application-mountable.
+		fsopts.dfltuid = auth.KUID(dfltuid)
+	}
+	fsopts.dfltgid = _V9FS_DEFGID
+	if dfltgidstr, ok := mopts["dfltgid"]; ok {
+		delete(mopts, "dfltgid")
+		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: dfltgid=%s", dfltgidstr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.dfltgid = auth.KGID(dfltgid)
+	}
+
+	// Parse the 9P message size.
+	fsopts.msize = 1024 * 1024 // 1M, tested to give good enough performance up to 64M
+	if msizestr, ok := mopts["msize"]; ok {
+		delete(mopts, "msize")
+		msize, err := strconv.ParseUint(msizestr, 10, 32)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid message size: msize=%s", msizestr)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.msize = uint32(msize)
+	}
+
+	// Parse the 9P protocol version.
+	fsopts.version = p9.HighestVersionString()
+	if version, ok := mopts["version"]; ok {
+		delete(mopts, "version")
+		fsopts.version = version
+	}
+
+	// Parse the dentry cache limit.
+	fsopts.maxCachedDentries = 1000
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err := strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+		fsopts.maxCachedDentries = maxCachedDentries
+	}
+
+	// Handle simple flags.
+	if _, ok := mopts["force_page_cache"]; ok {
+		delete(mopts, "force_page_cache")
+		fsopts.forcePageCache = true
+	}
+	if _, ok := mopts["limit_host_fd_translation"]; ok {
+		delete(mopts, "limit_host_fd_translation")
+		fsopts.limitHostFDTranslation = true
+	}
+	if _, ok := mopts["overlayfs_stale_read"]; ok {
+		delete(mopts, "overlayfs_stale_read")
+		fsopts.overlayfsStaleRead = true
+	}
+	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
+	// "cache=none".
+
+	// Check for unparsed options.
+	if len(mopts) != 0 {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Handle internal options.
+	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+	if opts.InternalData != nil && !ok {
+		ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData)
+		return nil, nil, syserror.EINVAL
+	}
+	// If !ok, iopts being the zero value is correct.
+
+	// Establish a connection with the server.
+	conn, err := unet.NewSocket(fsopts.fd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	// Perform version negotiation with the server.
+	ctx.UninterruptibleSleepStart(false)
+	client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		conn.Close()
+		return nil, nil, err
+	}
+	// Ownership of conn has been transferred to client.
+
+	// Perform attach to obtain the filesystem root.
+	ctx.UninterruptibleSleepStart(false)
+	attached, err := client.Attach(fsopts.aname)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		client.Close()
+		return nil, nil, err
+	}
+	attachFile := p9file{attached}
+	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+	if err != nil {
+		attachFile.close(ctx)
+		client.Close()
+		return nil, nil, err
+	}
+
+	// Construct the filesystem object.
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		attachFile.close(ctx)
+		client.Close()
+		return nil, nil, err
+	}
+	fs := &filesystem{
+		mfp:              mfp,
+		opts:             fsopts,
+		iopts:            iopts,
+		client:           client,
+		clock:            ktime.RealtimeClockFromContext(ctx),
+		devMinor:         devMinor,
+		syncableDentries: make(map[*dentry]struct{}),
+		specialFileFDs:   make(map[*specialFileFD]struct{}),
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry.
+	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
+	if err != nil {
+		attachFile.close(ctx)
+		fs.vfsfs.DecRef(ctx)
+		return nil, nil, err
+	}
+	// Set the root's reference count to 2. One reference is returned to the
+	// caller, and the other is deliberately leaked to prevent the root from
+	// being "cached" and subsequently evicted. Its resources will still be
+	// cleaned up by fs.Release().
+	root.refs = 2
+
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	mf := fs.mfp.MemoryFile()
+
+	fs.syncMu.Lock()
+	for d := range fs.syncableDentries {
+		d.handleMu.Lock()
+		d.dataMu.Lock()
+		if h := d.writeHandleLocked(); h.isOpen() {
+			// Write dirty cached data to the remote file.
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil {
+				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
+			}
+			// TODO(jamieliu): Do we need to flushf/fsync d?
+		}
+		// Discard cached pages.
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+		d.dataMu.Unlock()
+		// Close the host fd if one exists.
+		if d.hostFD >= 0 {
+			syscall.Close(int(d.hostFD))
+			d.hostFD = -1
+		}
+		d.handleMu.Unlock()
+	}
+	// There can't be any specialFileFDs still using fs, since each such
+	// FileDescription would hold a reference on a Mount holding a reference on
+	// fs.
+	fs.syncMu.Unlock()
+
+	if !fs.iopts.LeakConnection {
+		// Close the connection to the server. This implicitly clunks all fids.
+		fs.client.Close()
+	}
+
+	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// refs is the reference count. Each dentry holds a reference on its
+	// parent, even if disowned. An additional reference is held on all
+	// synthetic dentries until they are unlinked or invalidated. When refs
+	// reaches 0, the dentry may be added to the cache or destroyed. If refs ==
+	// -1, the dentry has already been destroyed. refs is accessed using atomic
+	// memory operations.
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// parent is this dentry's parent directory. Each dentry holds a reference
+	// on its parent. If this dentry is a filesystem root, parent is nil.
+	// parent is protected by filesystem.renameMu.
+	parent *dentry
+
+	// name is the name of this dentry in its parent. If this dentry is a
+	// filesystem root, name is the empty string. name is protected by
+	// filesystem.renameMu.
+	name string
+
+	// file is the unopened p9.File that backs this dentry. file is immutable.
+	//
+	// If file.isNil(), this dentry represents a synthetic file, i.e. a file
+	// that does not exist on the remote filesystem. As of this writing, the
+	// only files that can be synthetic are sockets, pipes, and directories.
+	file p9file
+
+	// If deleted is non-zero, the file represented by this dentry has been
+	// deleted. deleted is accessed using atomic memory operations.
+	deleted uint32
+
+	// If cached is true, dentryEntry links dentry into
+	// filesystem.cachedDentries. cached and dentryEntry are protected by
+	// filesystem.renameMu.
+	cached bool
+	dentryEntry
+
+	dirMu sync.Mutex
+
+	// If this dentry represents a directory, children contains:
+	//
+	// - Mappings of child filenames to dentries representing those children.
+	//
+	// - Mappings of child filenames that are known not to exist to nil
+	// dentries (only if InteropModeShared is not in effect and the directory
+	// is not synthetic).
+	//
+	// children is protected by dirMu.
+	children map[string]*dentry
+
+	// If this dentry represents a directory, syntheticChildren is the number
+	// of child dentries for which dentry.isSynthetic() == true.
+	// syntheticChildren is protected by dirMu.
+	syntheticChildren int
+
+	// If this dentry represents a directory,
+	// dentry.cachedMetadataAuthoritative() == true, and dirents is not nil, it
+	// is a cache of all entries in the directory, in the order they were
+	// returned by the server. dirents is protected by dirMu.
+	dirents []vfs.Dirent
+
+	// Cached metadata; protected by metadataMu.
+	// To access:
+	//   - In situations where consistency is not required (like stat), these
+	//     can be accessed using atomic operations only (without locking).
+	//   - Lock metadataMu and can access without atomic operations.
+	// To mutate:
+	//   - Lock metadataMu and use atomic operations to update because we might
+	//     have atomic readers that don't hold the lock.
+	metadataMu sync.Mutex
+	ino        inodeNumber // immutable
+	mode       uint32      // type is immutable, perms are mutable
+	uid        uint32      // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32      // auth.KGID, but ...
+	blockSize  uint32      // 0 if unknown
+	// Timestamps, all nsecs from the Unix epoch.
+	atime int64
+	mtime int64
+	ctime int64
+	btime int64
+	// File size, which differs from other metadata in two ways:
+	//
+	// - We make a best-effort attempt to keep it up to date even if
+	// !dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
+	//
+	// - size is protected by both metadataMu and dataMu (i.e. both must be
+	// locked to mutate it; locking either is sufficient to access it).
+	size uint64
+	// If this dentry does not represent a synthetic file, deleted is 0, and
+	// atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
+	// remote file's timestamps, which should be updated when this dentry is
+	// evicted.
+	atimeDirty uint32
+	mtimeDirty uint32
+
+	// nlink counts the number of hard links to this dentry. It's updated and
+	// accessed using atomic operations. It's not protected by metadataMu like the
+	// other metadata fields.
+	nlink uint32
+
+	mapsMu sync.Mutex
+
+	// If this dentry represents a regular file, mappings tracks mappings of
+	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// - If this dentry represents a regular file or directory, readFile is the
+	// p9.File used for reads by all regularFileFDs/directoryFDs representing
+	// this dentry.
+	//
+	// - If this dentry represents a regular file, writeFile is the p9.File
+	// used for writes by all regularFileFDs representing this dentry.
+	//
+	// - If this dentry represents a regular file, hostFD is the host FD used
+	// for memory mappings and I/O (when applicable) in preference to readFile
+	// and writeFile. hostFD is always readable; if !writeFile.isNil(), it must
+	// also be writable. If hostFD is -1, no such host FD is available.
+	//
+	// These fields are protected by handleMu.
+	//
+	// readFile and writeFile may or may not represent the same p9.File. Once
+	// either p9.File transitions from closed (isNil() == true) to open
+	// (isNil() == false), it may be mutated with handleMu locked, but cannot
+	// be closed until the dentry is destroyed.
+	handleMu  sync.RWMutex
+	readFile  p9file
+	writeFile p9file
+	hostFD    int32
+
+	dataMu sync.RWMutex
+
+	// If this dentry represents a regular file that is client-cached, cache
+	// maps offsets into the cached file to offsets into
+	// filesystem.mfp.MemoryFile() that store the file's data. cache is
+	// protected by dataMu.
+	cache fsutil.FileRangeSet
+
+	// If this dentry represents a regular file that is client-cached, dirty
+	// tracks dirty segments in cache. dirty is protected by dataMu.
+	dirty fsutil.DirtySet
+
+	// pf implements platform.File for mappings of hostFD.
+	pf dentryPlatformFile
+
+	// If this dentry represents a symbolic link, InteropModeShared is not in
+	// effect, and haveTarget is true, target is the symlink target. haveTarget
+	// and target are protected by dataMu.
+	haveTarget bool
+	target     string
+
+	// If this dentry represents a synthetic socket file, endpoint is the
+	// transport endpoint bound to this file.
+	endpoint transport.BoundEndpoint
+
+	// If this dentry represents a synthetic named pipe, pipe is the pipe
+	// endpoint bound to this file.
+	pipe *pipe.VFSPipe
+
+	locks vfs.FileLocks
+
+	// Inotify watches for this dentry.
+	watches vfs.Watches
+}
+
+// dentryAttrMask returns a p9.AttrMask enabling all attributes used by the
+// gofer client.
+func dentryAttrMask() p9.AttrMask {
+	return p9.AttrMask{
+		Mode:  true,
+		UID:   true,
+		GID:   true,
+		ATime: true,
+		MTime: true,
+		CTime: true,
+		Size:  true,
+		BTime: true,
+	}
+}
+
+// newDentry creates a new dentry representing the given file. The dentry
+// initially has no references, but is not cached; it is the caller's
+// responsibility to set the dentry's reference count and/or call
+// dentry.checkCachingLocked() as appropriate.
+//
+// Preconditions: !file.isNil().
+func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, mask p9.AttrMask, attr *p9.Attr) (*dentry, error) {
+	if !mask.Mode {
+		ctx.Warningf("can't create gofer.dentry without file type")
+		return nil, syserror.EIO
+	}
+	if attr.Mode.FileType() == p9.ModeRegular && !mask.Size {
+		ctx.Warningf("can't create regular file gofer.dentry without file size")
+		return nil, syserror.EIO
+	}
+
+	d := &dentry{
+		fs:        fs,
+		file:      file,
+		ino:       inoFromPath(qid.Path),
+		mode:      uint32(attr.Mode),
+		uid:       uint32(fs.opts.dfltuid),
+		gid:       uint32(fs.opts.dfltgid),
+		blockSize: usermem.PageSize,
+		hostFD:    -1,
+	}
+	d.pf.dentry = d
+	if mask.UID {
+		d.uid = dentryUIDFromP9UID(attr.UID)
+	}
+	if mask.GID {
+		d.gid = dentryGIDFromP9GID(attr.GID)
+	}
+	if mask.Size {
+		d.size = attr.Size
+	}
+	if attr.BlockSize != 0 {
+		d.blockSize = uint32(attr.BlockSize)
+	}
+	if mask.ATime {
+		d.atime = dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds)
+	}
+	if mask.MTime {
+		d.mtime = dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds)
+	}
+	if mask.CTime {
+		d.ctime = dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds)
+	}
+	if mask.BTime {
+		d.btime = dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds)
+	}
+	if mask.NLink {
+		d.nlink = uint32(attr.NLink)
+	}
+	d.vfsd.Init(d)
+
+	fs.syncMu.Lock()
+	fs.syncableDentries[d] = struct{}{}
+	fs.syncMu.Unlock()
+	return d, nil
+}
+
+func (d *dentry) isSynthetic() bool {
+	return d.file.isNil()
+}
+
+func (d *dentry) cachedMetadataAuthoritative() bool {
+	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
+}
+
+// updateFromP9Attrs is called to update d's metadata after an update from the
+// remote filesystem.
+// Precondition: d.metadataMu must be locked.
+func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
+	if mask.Mode {
+		if got, want := uint32(attr.Mode.FileType()), d.fileType(); got != want {
+			d.metadataMu.Unlock()
+			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
+		}
+		atomic.StoreUint32(&d.mode, uint32(attr.Mode))
+	}
+	if mask.UID {
+		atomic.StoreUint32(&d.uid, dentryUIDFromP9UID(attr.UID))
+	}
+	if mask.GID {
+		atomic.StoreUint32(&d.gid, dentryGIDFromP9GID(attr.GID))
+	}
+	// There is no P9_GETATTR_* bit for I/O block size.
+	if attr.BlockSize != 0 {
+		atomic.StoreUint32(&d.blockSize, uint32(attr.BlockSize))
+	}
+	// Don't override newer client-defined timestamps with old server-defined
+	// ones.
+	if mask.ATime && atomic.LoadUint32(&d.atimeDirty) == 0 {
+		atomic.StoreInt64(&d.atime, dentryTimestampFromP9(attr.ATimeSeconds, attr.ATimeNanoSeconds))
+	}
+	if mask.MTime && atomic.LoadUint32(&d.mtimeDirty) == 0 {
+		atomic.StoreInt64(&d.mtime, dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds))
+	}
+	if mask.CTime {
+		atomic.StoreInt64(&d.ctime, dentryTimestampFromP9(attr.CTimeSeconds, attr.CTimeNanoSeconds))
+	}
+	if mask.BTime {
+		atomic.StoreInt64(&d.btime, dentryTimestampFromP9(attr.BTimeSeconds, attr.BTimeNanoSeconds))
+	}
+	if mask.NLink {
+		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
+	}
+	if mask.Size {
+		d.updateFileSizeLocked(attr.Size)
+	}
+}
+
+// Preconditions: !d.isSynthetic()
+func (d *dentry) updateFromGetattr(ctx context.Context) error {
+	// Use d.readFile or d.writeFile, which represent 9P fids that have been
+	// opened, in preference to d.file, which represents a 9P fid that has not.
+	// This may be significantly more efficient in some implementations. Prefer
+	// d.writeFile over d.readFile since some filesystem implementations may
+	// update a writable handle's metadata after writes to that handle, without
+	// making metadata updates immediately visible to read-only handles
+	// representing the same file.
+	var (
+		file            p9file
+		handleMuRLocked bool
+	)
+	// d.metadataMu must be locked *before* we getAttr so that we do not end up
+	// updating stale attributes in d.updateFromP9AttrsLocked().
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	d.handleMu.RLock()
+	if !d.writeFile.isNil() {
+		file = d.writeFile
+		handleMuRLocked = true
+	} else if !d.readFile.isNil() {
+		file = d.readFile
+		handleMuRLocked = true
+	} else {
+		file = d.file
+		d.handleMu.RUnlock()
+	}
+	_, attrMask, attr, err := file.getAttr(ctx, dentryAttrMask())
+	if handleMuRLocked {
+		d.handleMu.RUnlock()
+	}
+	if err != nil {
+		return err
+	}
+	d.updateFromP9AttrsLocked(attrMask, &attr)
+	return nil
+}
+
+func (d *dentry) fileType() uint32 {
+	return atomic.LoadUint32(&d.mode) & linux.S_IFMT
+}
+
+func (d *dentry) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
+	stat.Blksize = atomic.LoadUint32(&d.blockSize)
+	stat.Nlink = atomic.LoadUint32(&d.nlink)
+	if stat.Nlink == 0 {
+		// The remote filesystem doesn't support link count; just make
+		// something up. This is consistent with Linux, where
+		// fs/inode.c:inode_init_always() initializes link count to 1, and
+		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
+		// it's not provided by the remote filesystem.
+		stat.Nlink = 1
+	}
+	stat.UID = atomic.LoadUint32(&d.uid)
+	stat.GID = atomic.LoadUint32(&d.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+	stat.Ino = uint64(d.ino)
+	stat.Size = atomic.LoadUint64(&d.size)
+	// This is consistent with regularFileFD.Seek(), which treats regular files
+	// as having no holes.
+	stat.Blocks = (stat.Size + 511) / 512
+	stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
+	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
+	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
+	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+	stat.DevMajor = linux.UNNAMED_MAJOR
+	stat.DevMinor = d.fs.devMinor
+}
+
+func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
+	stat := &opts.Stat
+	if stat.Mask == 0 {
+		return nil
+	}
+	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
+		return syserror.EPERM
+	}
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	if stat.Mask&linux.STATX_SIZE != 0 {
+		// Reject attempts to truncate files other than regular files, since
+		// filesystem implementations may return the wrong errno.
+		switch mode.FileType() {
+		case linux.S_IFREG:
+			// ok
+		case linux.S_IFDIR:
+			return syserror.EISDIR
+		default:
+			return syserror.EINVAL
+		}
+	}
+
+	var now int64
+	if d.cachedMetadataAuthoritative() {
+		// Truncate updates mtime.
+		if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
+			stat.Mask |= linux.STATX_MTIME
+			stat.Mtime = linux.StatxTimestamp{
+				Nsec: linux.UTIME_NOW,
+			}
+		}
+
+		// Use client clocks for timestamps.
+		now = d.fs.clock.Now().Nanoseconds()
+		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
+			stat.Atime = statxTimestampFromDentry(now)
+		}
+		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
+			stat.Mtime = statxTimestampFromDentry(now)
+		}
+	}
+
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if !d.isSynthetic() {
+		if stat.Mask != 0 {
+			if err := d.file.setAttr(ctx, p9.SetAttrMask{
+				Permissions:        stat.Mask&linux.STATX_MODE != 0,
+				UID:                stat.Mask&linux.STATX_UID != 0,
+				GID:                stat.Mask&linux.STATX_GID != 0,
+				Size:               stat.Mask&linux.STATX_SIZE != 0,
+				ATime:              stat.Mask&linux.STATX_ATIME != 0,
+				MTime:              stat.Mask&linux.STATX_MTIME != 0,
+				ATimeNotSystemTime: stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec != linux.UTIME_NOW,
+				MTimeNotSystemTime: stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec != linux.UTIME_NOW,
+			}, p9.SetAttr{
+				Permissions:      p9.FileMode(stat.Mode),
+				UID:              p9.UID(stat.UID),
+				GID:              p9.GID(stat.GID),
+				Size:             stat.Size,
+				ATimeSeconds:     uint64(stat.Atime.Sec),
+				ATimeNanoSeconds: uint64(stat.Atime.Nsec),
+				MTimeSeconds:     uint64(stat.Mtime.Sec),
+				MTimeNanoSeconds: uint64(stat.Mtime.Nsec),
+			}); err != nil {
+				return err
+			}
+			if stat.Mask&linux.STATX_SIZE != 0 {
+				// d.size should be kept up to date, and privatized
+				// copy-on-write mappings of truncated pages need to be
+				// invalidated, even if InteropModeShared is in effect.
+				d.updateFileSizeLocked(stat.Size)
+			}
+		}
+		if d.fs.opts.interop == InteropModeShared {
+			// There's no point to updating d's metadata in this case since
+			// it'll be overwritten by revalidation before the next time it's
+			// used anyway. (InteropModeShared inhibits client caching of
+			// regular file data, so there's no cache to truncate either.)
+			return nil
+		}
+	}
+	if stat.Mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&d.mode, d.fileType()|uint32(stat.Mode))
+	}
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&d.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&d.gid, stat.GID)
+	}
+	// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
+	// if d.cachedMetadataAuthoritative() then we converted stat.Atime and
+	// stat.Mtime to client-local timestamps above, and if
+	// !d.cachedMetadataAuthoritative() then we returned after calling
+	// d.file.setAttr(). For the same reason, now must have been initialized.
+	if stat.Mask&linux.STATX_ATIME != 0 {
+		atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+		atomic.StoreUint32(&d.atimeDirty, 0)
+	}
+	if stat.Mask&linux.STATX_MTIME != 0 {
+		atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+		atomic.StoreUint32(&d.mtimeDirty, 0)
+	}
+	atomic.StoreInt64(&d.ctime, now)
+	return nil
+}
+
+// Preconditions: d.metadataMu must be locked.
+func (d *dentry) updateFileSizeLocked(newSize uint64) {
+	d.dataMu.Lock()
+	oldSize := d.size
+	atomic.StoreUint64(&d.size, newSize)
+	// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
+	// below. This allows concurrent calls to Read/Translate/etc. These
+	// functions synchronize with truncation by refusing to use cache
+	// contents beyond the new d.size. (We are still holding d.metadataMu,
+	// so we can't race with Write or another truncate.)
+	d.dataMu.Unlock()
+	if d.size < oldSize {
+		oldpgend, _ := usermem.PageRoundUp(oldSize)
+		newpgend, _ := usermem.PageRoundUp(d.size)
+		if oldpgend != newpgend {
+			d.mapsMu.Lock()
+			d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+				// Compare Linux's mm/truncate.c:truncate_setsize() =>
+				// truncate_pagecache() =>
+				// mm/memory.c:unmap_mapping_range(evencows=1).
+				InvalidatePrivate: true,
+			})
+			d.mapsMu.Unlock()
+		}
+		// We are now guaranteed that there are no translations of
+		// truncated pages, and can remove them from the cache. Since
+		// truncated pages have been removed from the remote file, they
+		// should be dropped without being written back.
+		d.dataMu.Lock()
+		d.cache.Truncate(d.size, d.fs.mfp.MemoryFile())
+		d.dirty.KeepClean(memmap.MappableRange{d.size, oldpgend})
+		d.dataMu.Unlock()
+	}
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
+	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
+}
+
+func dentryUIDFromP9UID(uid p9.UID) uint32 {
+	if !uid.Ok() {
+		return uint32(auth.OverflowUID)
+	}
+	return uint32(uid)
+}
+
+func dentryGIDFromP9GID(gid p9.GID) uint32 {
+	if !gid.Ok() {
+		return uint32(auth.OverflowGID)
+	}
+	return uint32(gid)
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+	// d.checkCachingLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.renameMu.Lock()
+		d.checkCachingLocked(ctx)
+		d.fs.renameMu.Unlock()
+	} else if refs < 0 {
+		panic("gofer.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// decRefLocked decrements d's reference count without calling
+// d.checkCachingLocked, even if d's reference count reaches 0; callers are
+// responsible for ensuring that d.checkCachingLocked will be called later.
+func (d *dentry) decRefLocked() {
+	if refs := atomic.AddInt64(&d.refs, -1); refs < 0 {
+		panic("gofer.dentry.decRefLocked() called without holding a reference")
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+	if d.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	d.fs.renameMu.RLock()
+	// The ordering below is important, Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
+	}
+	d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
+	d.fs.renameMu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.watches
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// If no watches are left on this dentry and it has no references, cache it.
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+	if atomic.LoadInt64(&d.refs) == 0 {
+		d.fs.renameMu.Lock()
+		d.checkCachingLocked(ctx)
+		d.fs.renameMu.Unlock()
+	}
+}
+
+// checkCachingLocked should be called after d's reference count becomes 0 or it
+// becomes disowned.
+//
+// It may be called on a destroyed dentry. For example,
+// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
+// for the same dentry when the dentry is visited more than once in the same
+// operation. One of the calls may destroy the dentry, so subsequent calls will
+// do nothing.
+//
+// Preconditions: d.fs.renameMu must be locked for writing; it may be
+// temporarily unlocked.
+func (d *dentry) checkCachingLocked(ctx context.Context) {
+	// Dentries with a non-zero reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires renameMu, so if d.refs is zero then it will
+	// remain zero while we hold renameMu for writing.)
+	refs := atomic.LoadInt64(&d.refs)
+	if refs > 0 {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		return
+	}
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		return
+	}
+	// Deleted and invalidated dentries with zero references are no longer
+	// reachable by path resolution and should be dropped immediately.
+	if d.vfsd.IsDead() {
+		if d.isDeleted() {
+			d.watches.HandleDeletion(ctx)
+		}
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		d.destroyLocked(ctx)
+		return
+	}
+	// If d still has inotify watches and it is not deleted or invalidated, we
+	// cannot cache it and allow it to be evicted. Otherwise, we will lose its
+	// watches, even if a new dentry is created for the same file in the future.
+	// Note that the size of d.watches cannot concurrently transition from zero
+	// to non-zero, because adding a watch requires holding a reference on d.
+	if d.watches.Size() > 0 {
+		return
+	}
+	// If d is already cached, just move it to the front of the LRU.
+	if d.cached {
+		d.fs.cachedDentries.Remove(d)
+		d.fs.cachedDentries.PushFront(d)
+		return
+	}
+	// Cache the dentry, then evict the least recently used cached dentry if
+	// the cache becomes over-full.
+	d.fs.cachedDentries.PushFront(d)
+	d.fs.cachedDentriesLen++
+	d.cached = true
+	if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
+		victim := d.fs.cachedDentries.Back()
+		d.fs.cachedDentries.Remove(victim)
+		d.fs.cachedDentriesLen--
+		victim.cached = false
+		// victim.refs may have become non-zero from an earlier path resolution
+		// since it was inserted into fs.cachedDentries.
+		if atomic.LoadInt64(&victim.refs) == 0 {
+			if victim.parent != nil {
+				victim.parent.dirMu.Lock()
+				if !victim.vfsd.IsDead() {
+					// Note that victim can't be a mount point (in any mount
+					// namespace), since VFS holds references on mount points.
+					d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
+					delete(victim.parent.children, victim.name)
+					// We're only deleting the dentry, not the file it
+					// represents, so we don't need to update
+					// victimParent.dirents etc.
+				}
+				victim.parent.dirMu.Unlock()
+			}
+			victim.destroyLocked(ctx)
+		}
+		// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
+		// back down to fs.opts.maxCachedDentries, so we don't loop.
+	}
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions:
+// * d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * d.refs == 0.
+// * d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
+//   from its former parent dentry.
+func (d *dentry) destroyLocked(ctx context.Context) {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("dentry.destroyLocked() called with references on the dentry")
+	}
+
+	// Allow the following to proceed without renameMu locked to improve
+	// scalability.
+	d.fs.renameMu.Unlock()
+
+	mf := d.fs.mfp.MemoryFile()
+	d.handleMu.Lock()
+	d.dataMu.Lock()
+	if h := d.writeHandleLocked(); h.isOpen() {
+		// Write dirty pages back to the remote filesystem.
+		if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+			log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
+		}
+	}
+	// Discard cached data.
+	if !d.cache.IsEmpty() {
+		mf.MarkAllUnevictable(d)
+		d.cache.DropAll(mf)
+		d.dirty.RemoveAll()
+	}
+	d.dataMu.Unlock()
+	// Clunk open fids and close open host FDs.
+	if !d.readFile.isNil() {
+		d.readFile.close(ctx)
+	}
+	if !d.writeFile.isNil() && d.readFile != d.writeFile {
+		d.writeFile.close(ctx)
+	}
+	d.readFile = p9file{}
+	d.writeFile = p9file{}
+	if d.hostFD >= 0 {
+		syscall.Close(int(d.hostFD))
+		d.hostFD = -1
+	}
+	d.handleMu.Unlock()
+
+	if !d.file.isNil() {
+		if !d.isDeleted() {
+			// Write dirty timestamps back to the remote filesystem.
+			atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
+			mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
+			if atimeDirty || mtimeDirty {
+				atime := atomic.LoadInt64(&d.atime)
+				mtime := atomic.LoadInt64(&d.mtime)
+				if err := d.file.setAttr(ctx, p9.SetAttrMask{
+					ATime:              atimeDirty,
+					ATimeNotSystemTime: atimeDirty,
+					MTime:              mtimeDirty,
+					MTimeNotSystemTime: mtimeDirty,
+				}, p9.SetAttr{
+					ATimeSeconds:     uint64(atime / 1e9),
+					ATimeNanoSeconds: uint64(atime % 1e9),
+					MTimeSeconds:     uint64(mtime / 1e9),
+					MTimeNanoSeconds: uint64(mtime % 1e9),
+				}); err != nil {
+					log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
+				}
+			}
+		}
+		d.file.close(ctx)
+		d.file = p9file{}
+		// Remove d from the set of syncable dentries.
+		d.fs.syncMu.Lock()
+		delete(d.fs.syncableDentries, d)
+		d.fs.syncMu.Unlock()
+	}
+
+	d.fs.renameMu.Lock()
+
+	// Drop the reference held by d on its parent without recursively locking
+	// d.fs.renameMu.
+	if d.parent != nil {
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.checkCachingLocked(ctx)
+		} else if refs < 0 {
+			panic("gofer.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+func (d *dentry) isDeleted() bool {
+	return atomic.LoadUint32(&d.deleted) != 0
+}
+
+func (d *dentry) setDeleted() {
+	atomic.StoreUint32(&d.deleted, 1)
+}
+
+// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+// there is no need to expose any other xattrs through a gofer.
+func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+	if d.file.isNil() || !d.userXattrSupported() {
+		return nil, nil
+	}
+	xattrMap, err := d.file.listXattr(ctx, size)
+	if err != nil {
+		return nil, err
+	}
+	xattrs := make([]string, 0, len(xattrMap))
+	for x := range xattrMap {
+		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
+			xattrs = append(xattrs, x)
+		}
+	}
+	return xattrs, nil
+}
+
+func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if d.file.isNil() {
+		return "", syserror.ENODATA
+	}
+	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	if !d.userXattrSupported() {
+		return "", syserror.ENODATA
+	}
+	return d.file.getXattr(ctx, opts.Name, opts.Size)
+}
+
+func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if d.file.isNil() {
+		return syserror.EPERM
+	}
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !d.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
+}
+
+func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+	if d.file.isNil() {
+		return syserror.EPERM
+	}
+	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !d.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return d.file.removeXattr(ctx, name)
+}
+
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (d *dentry) userXattrSupported() bool {
+	filetype := linux.FileMode(atomic.LoadUint32(&d.mode)).FileType()
+	return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
+}
+
+// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
+func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
+	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
+	// O_TRUNC).
+	if !trunc {
+		d.handleMu.RLock()
+		if (!read || !d.readFile.isNil()) && (!write || !d.writeFile.isNil()) {
+			// Current handles are sufficient.
+			d.handleMu.RUnlock()
+			return nil
+		}
+		d.handleMu.RUnlock()
+	}
+
+	fdToClose := int32(-1)
+	invalidateTranslations := false
+	d.handleMu.Lock()
+	if (read && d.readFile.isNil()) || (write && d.writeFile.isNil()) || trunc {
+		// Get a new handle. If this file has been opened for both reading and
+		// writing, try to get a single handle that is usable for both:
+		//
+		// - Writable memory mappings of a host FD require that the host FD is
+		// opened for both reading and writing.
+		//
+		// - NOTE(b/141991141): Some filesystems may not ensure coherence
+		// between multiple handles for the same file.
+		openReadable := !d.readFile.isNil() || read
+		openWritable := !d.writeFile.isNil() || write
+		h, err := openHandle(ctx, d.file, openReadable, openWritable, trunc)
+		if err == syserror.EACCES && (openReadable != read || openWritable != write) {
+			// It may not be possible to use a single handle for both
+			// reading and writing, since permissions on the file may have
+			// changed to e.g. disallow reading after previously being
+			// opened for reading. In this case, we have no choice but to
+			// use separate handles for reading and writing.
+			ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
+			openReadable = read
+			openWritable = write
+			h, err = openHandle(ctx, d.file, openReadable, openWritable, trunc)
+		}
+		if err != nil {
+			d.handleMu.Unlock()
+			return err
+		}
+
+		if d.hostFD < 0 && openReadable && h.fd >= 0 {
+			// We have no existing FD; use the new FD for at least reading.
+			d.hostFD = h.fd
+		} else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
+			// We have an existing read-only FD, but the file has just been
+			// opened for writing, so we need to start supporting writable memory
+			// mappings. This may race with callers of d.pf.FD() using the existing
+			// FD, so in most cases we need to delay closing the old FD until after
+			// invalidating memmap.Translations that might have observed it.
+			if !openReadable || h.fd < 0 {
+				// We don't have a read/write FD, so we have no FD that can be
+				// used to create writable memory mappings. Switch to using the
+				// internal page cache.
+				invalidateTranslations = true
+				fdToClose = d.hostFD
+				d.hostFD = -1
+			} else if d.fs.opts.overlayfsStaleRead {
+				// We do have a read/write FD, but it may not be coherent with
+				// the existing read-only FD, so we must switch to mappings of
+				// the new FD in both the application and sentry.
+				if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
+					d.handleMu.Unlock()
+					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
+					h.close(ctx)
+					return err
+				}
+				invalidateTranslations = true
+				fdToClose = d.hostFD
+				d.hostFD = h.fd
+			} else {
+				// We do have a read/write FD. To avoid invalidating existing
+				// memmap.Translations (which is expensive), use dup3 to make
+				// the old file descriptor refer to the new file description,
+				// then close the new file descriptor (which is no longer
+				// needed). Racing callers of d.pf.FD() may use the old or new
+				// file description, but this doesn't matter since they refer
+				// to the same file, and any racing mappings must be read-only.
+				if err := syscall.Dup3(int(h.fd), int(d.hostFD), syscall.O_CLOEXEC); err != nil {
+					oldHostFD := d.hostFD
+					d.handleMu.Unlock()
+					ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldHostFD, err)
+					h.close(ctx)
+					return err
+				}
+				fdToClose = h.fd
+			}
+		} else {
+			// h.fd is not useful.
+			fdToClose = h.fd
+		}
+
+		// Switch to new fids.
+		var oldReadFile p9file
+		if openReadable {
+			oldReadFile = d.readFile
+			d.readFile = h.file
+		}
+		var oldWriteFile p9file
+		if openWritable {
+			oldWriteFile = d.writeFile
+			d.writeFile = h.file
+		}
+		// NOTE(b/141991141): Clunk old fids before making new fids visible (by
+		// unlocking d.handleMu).
+		if !oldReadFile.isNil() {
+			oldReadFile.close(ctx)
+		}
+		if !oldWriteFile.isNil() && oldReadFile != oldWriteFile {
+			oldWriteFile.close(ctx)
+		}
+	}
+	d.handleMu.Unlock()
+
+	if invalidateTranslations {
+		// Invalidate application mappings that may be using an old FD; they
+		// will be replaced with mappings using the new FD after future calls
+		// to d.Translate(). This requires holding d.mapsMu, which precedes
+		// d.handleMu in the lock order.
+		d.mapsMu.Lock()
+		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+		d.mapsMu.Unlock()
+	}
+	if fdToClose >= 0 {
+		syscall.Close(int(fdToClose))
+	}
+
+	return nil
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) readHandleLocked() handle {
+	return handle{
+		file: d.readFile,
+		fd:   d.hostFD,
+	}
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) writeHandleLocked() handle {
+	return handle{
+		file: d.writeFile,
+		fd:   d.hostFD,
+	}
+}
+
+func (d *dentry) syncRemoteFile(ctx context.Context) error {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return d.syncRemoteFileLocked(ctx)
+}
+
+// Preconditions: d.handleMu must be locked.
+func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
+	// If we have a host FD, fsyncing it is likely to be faster than an fsync
+	// RPC.
+	if d.hostFD >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		err := syscall.Fsync(int(d.hostFD))
+		ctx.UninterruptibleSleepFinish(false)
+		return err
+	}
+	if !d.writeFile.isNil() {
+		return d.writeFile.fsync(ctx)
+	}
+	if !d.readFile.isNil() {
+		return d.readFile.fsync(ctx)
+	}
+	return nil
+}
+
+// incLinks increments link count.
+func (d *dentry) incLinks() {
+	if atomic.LoadUint32(&d.nlink) == 0 {
+		// The remote filesystem doesn't support link count.
+		return
+	}
+	atomic.AddUint32(&d.nlink, 1)
+}
+
+// decLinks decrements link count.
+func (d *dentry) decLinks() {
+	if atomic.LoadUint32(&d.nlink) == 0 {
+		// The remote filesystem doesn't support link count.
+		return
+	}
+	atomic.AddUint32(&d.nlink, ^uint32(0))
+}
+
+// fileDescription is embedded by gofer implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	lockLogging sync.Once
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	d := fd.dentry()
+	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
+	if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
+		// TODO(jamieliu): Use specialFileFD.handle.file for the getattr if
+		// available?
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	var stat linux.Statx
+	d.statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if err := fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount()); err != nil {
+		return err
+	}
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		fd.dentry().InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+	}
+	return nil
+}
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	d := fd.dentry()
+	if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	d := fd.dentry()
+	if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
+func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("File lock using gofer file handled internally.")
+	})
+	return fd.LockFD.LockBSD(ctx, uid, t, block)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	fd.lockLogging.Do(func() {
+		log.Infof("Range lock using gofer file handled internally.")
+	})
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
new file mode 100644
index 000000000..bfe75dfe4
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -0,0 +1,67 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+)
+
+func TestDestroyIdempotent(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fs := filesystem{
+		mfp:              pgalloc.MemoryFileProviderFromContext(ctx),
+		syncableDentries: make(map[*dentry]struct{}),
+		opts: filesystemOptions{
+			// Test relies on no dentry being held in the cache.
+			maxCachedDentries: 0,
+		},
+	}
+
+	attr := &p9.Attr{
+		Mode: p9.ModeRegular,
+	}
+	mask := p9.AttrMask{
+		Mode: true,
+		Size: true,
+	}
+	parent, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+
+	child, err := fs.newDentry(ctx, p9file{}, p9.QID{}, mask, attr)
+	if err != nil {
+		t.Fatalf("fs.newDentry(): %v", err)
+	}
+	parent.cacheNewChildLocked(child, "child")
+
+	fs.renameMu.Lock()
+	defer fs.renameMu.Unlock()
+	child.checkCachingLocked(ctx)
+	if got := atomic.LoadInt64(&child.refs); got != -1 {
+		t.Fatalf("child.refs=%d, want: -1", got)
+	}
+	// Parent will also be destroyed when child reference is removed.
+	if got := atomic.LoadInt64(&parent.refs); got != -1 {
+		t.Fatalf("parent.refs=%d, want: -1", got)
+	}
+	child.checkCachingLocked(ctx)
+	child.checkCachingLocked(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
new file mode 100644
index 000000000..104157512
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -0,0 +1,130 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
+)
+
+// handle represents a remote "open file descriptor", consisting of an opened
+// fid (p9.File) and optionally a host file descriptor.
+type handle struct {
+	file p9file
+	fd   int32 // -1 if unavailable
+}
+
+// Preconditions: read || write.
+func openHandle(ctx context.Context, file p9file, read, write, trunc bool) (handle, error) {
+	_, newfile, err := file.walk(ctx, nil)
+	if err != nil {
+		return handle{fd: -1}, err
+	}
+	var flags p9.OpenFlags
+	switch {
+	case read && !write:
+		flags = p9.ReadOnly
+	case !read && write:
+		flags = p9.WriteOnly
+	case read && write:
+		flags = p9.ReadWrite
+	}
+	if trunc {
+		flags |= p9.OpenTruncate
+	}
+	fdobj, _, _, err := newfile.open(ctx, flags)
+	if err != nil {
+		newfile.close(ctx)
+		return handle{fd: -1}, err
+	}
+	fd := int32(-1)
+	if fdobj != nil {
+		fd = int32(fdobj.Release())
+	}
+	return handle{
+		file: newfile,
+		fd:   fd,
+	}, nil
+}
+
+func (h *handle) isOpen() bool {
+	return !h.file.isNil()
+}
+
+func (h *handle) close(ctx context.Context) {
+	h.file.close(ctx)
+	h.file = p9file{}
+	if h.fd >= 0 {
+		syscall.Close(int(h.fd))
+		h.fd = -1
+	}
+}
+
+func (h *handle) readToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		n, err := hostfd.Preadv2(h.fd, dsts, int64(offset), 0 /* flags */)
+		ctx.UninterruptibleSleepFinish(false)
+		return n, err
+	}
+	if dsts.NumBlocks() == 1 && !dsts.Head().NeedSafecopy() {
+		n, err := h.file.readAt(ctx, dsts.Head().ToSlice(), offset)
+		return uint64(n), err
+	}
+	// Buffer the read since p9.File.ReadAt() takes []byte.
+	buf := make([]byte, dsts.NumBytes())
+	n, err := h.file.readAt(ctx, buf, offset)
+	if n == 0 {
+		return 0, err
+	}
+	if cp, cperr := safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:n]))); cperr != nil {
+		return cp, cperr
+	}
+	return uint64(n), err
+}
+
+func (h *handle) writeFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+	if h.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		n, err := hostfd.Pwritev2(h.fd, srcs, int64(offset), 0 /* flags */)
+		ctx.UninterruptibleSleepFinish(false)
+		return n, err
+	}
+	if srcs.NumBlocks() == 1 && !srcs.Head().NeedSafecopy() {
+		n, err := h.file.writeAt(ctx, srcs.Head().ToSlice(), offset)
+		return uint64(n), err
+	}
+	// Buffer the write since p9.File.WriteAt() takes []byte.
+	buf := make([]byte, srcs.NumBytes())
+	cp, cperr := safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), srcs)
+	if cp == 0 {
+		return 0, cperr
+	}
+	n, err := h.file.writeAt(ctx, buf[:cp], offset)
+	if err != nil {
+		return uint64(n), err
+	}
+	return cp, cperr
+}
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
new file mode 100644
index 000000000..7294de7d6
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -0,0 +1,97 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"sync"
+	"time"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Global pipe used by blockUntilNonblockingPipeHasWriter since we can't create
+// pipes after sentry initialization due to syscall filters.
+var (
+	tempPipeMu      sync.Mutex
+	tempPipeReadFD  int
+	tempPipeWriteFD int
+	tempPipeBuf     [1]byte
+)
+
+func init() {
+	var pipeFDs [2]int
+	if err := unix.Pipe(pipeFDs[:]); err != nil {
+		panic(fmt.Sprintf("failed to create pipe for gofer.blockUntilNonblockingPipeHasWriter: %v", err))
+	}
+	tempPipeReadFD = pipeFDs[0]
+	tempPipeWriteFD = pipeFDs[1]
+}
+
+func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
+	for {
+		ok, err := nonblockingPipeHasWriter(fd)
+		if err != nil {
+			return err
+		}
+		if ok {
+			return nil
+		}
+		if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
+			return err
+		}
+	}
+}
+
+func nonblockingPipeHasWriter(fd int32) (bool, error) {
+	tempPipeMu.Lock()
+	defer tempPipeMu.Unlock()
+	// Copy 1 byte from fd into the temporary pipe.
+	n, err := unix.Tee(int(fd), tempPipeWriteFD, 1, unix.SPLICE_F_NONBLOCK)
+	if err == syserror.EAGAIN {
+		// The pipe represented by fd is empty, but has a writer.
+		return true, nil
+	}
+	if err != nil {
+		return false, err
+	}
+	if n == 0 {
+		// The pipe represented by fd is empty and has no writer.
+		return false, nil
+	}
+	// The pipe represented by fd is non-empty, so it either has, or has
+	// previously had, a writer. Remove the byte copied to the temporary pipe
+	// before returning.
+	if n, err := unix.Read(tempPipeReadFD, tempPipeBuf[:]); err != nil || n != 1 {
+		panic(fmt.Sprintf("failed to drain pipe for gofer.blockUntilNonblockingPipeHasWriter: got (%d, %v), wanted (1, nil)", n, err))
+	}
+	return true, nil
+}
+
+func sleepBetweenNamedPipeOpenChecks(ctx context.Context) error {
+	t := time.NewTimer(100 * time.Millisecond)
+	defer t.Stop()
+	cancel := ctx.SleepStart()
+	select {
+	case <-t.C:
+		ctx.SleepFinish(true)
+		return nil
+	case <-cancel:
+		ctx.SleepFinish(false)
+		return syserror.ErrInterrupted
+	}
+}
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
new file mode 100644
index 000000000..87f0b877f
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -0,0 +1,233 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fd"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// p9file is a wrapper around p9.File that provides methods that are
+// Context-aware.
+type p9file struct {
+	file p9.File
+}
+
+func (f p9file) isNil() bool {
+	return f.file == nil
+}
+
+func (f p9file) walk(ctx context.Context, names []string) ([]p9.QID, p9file, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, err := f.file.Walk(names)
+	ctx.UninterruptibleSleepFinish(false)
+	return qids, p9file{newfile}, err
+}
+
+func (f p9file) walkGetAttr(ctx context.Context, names []string) ([]p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, attrMask, attr, err := f.file.WalkGetAttr(names)
+	ctx.UninterruptibleSleepFinish(false)
+	return qids, p9file{newfile}, attrMask, attr, err
+}
+
+// walkGetAttrOne is a wrapper around p9.File.WalkGetAttr that takes a single
+// path component and returns a single qid.
+func (f p9file) walkGetAttrOne(ctx context.Context, name string) (p9.QID, p9file, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qids, newfile, attrMask, attr, err := f.file.WalkGetAttr([]string{name})
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, err
+	}
+	if len(qids) != 1 {
+		ctx.Warningf("p9.File.WalkGetAttr returned %d qids (%v), wanted 1", len(qids), qids)
+		if newfile != nil {
+			p9file{newfile}.close(ctx)
+		}
+		return p9.QID{}, p9file{}, p9.AttrMask{}, p9.Attr{}, syserror.EIO
+	}
+	return qids[0], p9file{newfile}, attrMask, attr, nil
+}
+
+func (f p9file) statFS(ctx context.Context) (p9.FSStat, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fsstat, err := f.file.StatFS()
+	ctx.UninterruptibleSleepFinish(false)
+	return fsstat, err
+}
+
+func (f p9file) getAttr(ctx context.Context, req p9.AttrMask) (p9.QID, p9.AttrMask, p9.Attr, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, attrMask, attr, err := f.file.GetAttr(req)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, attrMask, attr, err
+}
+
+func (f p9file) setAttr(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetAttr(valid, attr)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) listXattr(ctx context.Context, size uint64) (map[string]struct{}, error) {
+	ctx.UninterruptibleSleepStart(false)
+	xattrs, err := f.file.ListXattr(size)
+	ctx.UninterruptibleSleepFinish(false)
+	return xattrs, err
+}
+
+func (f p9file) getXattr(ctx context.Context, name string, size uint64) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	val, err := f.file.GetXattr(name, size)
+	ctx.UninterruptibleSleepFinish(false)
+	return val, err
+}
+
+func (f p9file) setXattr(ctx context.Context, name, value string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetXattr(name, value, flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) removeXattr(ctx context.Context, name string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.RemoveXattr(name)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) allocate(ctx context.Context, mode p9.AllocateMode, offset, length uint64) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Allocate(mode, offset, length)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) close(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Close()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, qid, iounit, err := f.file.Open(flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, qid, iounit, err
+}
+
+func (f p9file) readAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	n, err := f.file.ReadAt(p, offset)
+	ctx.UninterruptibleSleepFinish(false)
+	return n, err
+}
+
+func (f p9file) writeAt(ctx context.Context, p []byte, offset uint64) (int, error) {
+	ctx.UninterruptibleSleepStart(false)
+	n, err := f.file.WriteAt(p, offset)
+	ctx.UninterruptibleSleepFinish(false)
+	return n, err
+}
+
+func (f p9file) fsync(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.FSync()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) create(ctx context.Context, name string, flags p9.OpenFlags, permissions p9.FileMode, uid p9.UID, gid p9.GID) (*fd.FD, p9file, p9.QID, uint32, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, newfile, qid, iounit, err := f.file.Create(name, flags, permissions, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, p9file{newfile}, qid, iounit, err
+}
+
+func (f p9file) mkdir(ctx context.Context, name string, permissions p9.FileMode, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Mkdir(name, permissions, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) symlink(ctx context.Context, oldName string, newName string, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Symlink(oldName, newName, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) link(ctx context.Context, target p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Link(target.file, newName)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) mknod(ctx context.Context, name string, mode p9.FileMode, major uint32, minor uint32, uid p9.UID, gid p9.GID) (p9.QID, error) {
+	ctx.UninterruptibleSleepStart(false)
+	qid, err := f.file.Mknod(name, mode, major, minor, uid, gid)
+	ctx.UninterruptibleSleepFinish(false)
+	return qid, err
+}
+
+func (f p9file) rename(ctx context.Context, newDir p9file, newName string) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Rename(newDir.file, newName)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) unlinkAt(ctx context.Context, name string, flags uint32) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.UnlinkAt(name, flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) readdir(ctx context.Context, offset uint64, count uint32) ([]p9.Dirent, error) {
+	ctx.UninterruptibleSleepStart(false)
+	dirents, err := f.file.Readdir(offset, count)
+	ctx.UninterruptibleSleepFinish(false)
+	return dirents, err
+}
+
+func (f p9file) readlink(ctx context.Context) (string, error) {
+	ctx.UninterruptibleSleepStart(false)
+	target, err := f.file.Readlink()
+	ctx.UninterruptibleSleepFinish(false)
+	return target, err
+}
+
+func (f p9file) flush(ctx context.Context) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.Flush()
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
+func (f p9file) connect(ctx context.Context, flags p9.ConnectFlags) (*fd.FD, error) {
+	ctx.UninterruptibleSleepStart(false)
+	fdobj, err := f.file.Connect(flags)
+	ctx.UninterruptibleSleepFinish(false)
+	return fdobj, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
new file mode 100644
index 000000000..7e1cbf065
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -0,0 +1,944 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isRegularFile() bool {
+	return d.fileType() == linux.S_IFREG
+}
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset. off is protected by mu.
+	mu  sync.Mutex
+	off int64
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release(context.Context) {
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	// Skip flushing if writes may be buffered by the client, since (as with
+	// the VFS1 client) we don't flush buffered writes on close anyway.
+	d := fd.dentry()
+	if d.fs.opts.interop == InteropModeExclusive {
+		return nil
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	if d.writeFile.isNil() {
+		return nil
+	}
+	return d.writeFile.flush(ctx)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	d := fd.dentry()
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+
+	// Allocating a smaller size is a noop.
+	size := offset + length
+	if d.cachedMetadataAuthoritative() && size <= d.size {
+		return nil
+	}
+
+	d.handleMu.RLock()
+	err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+	d.handleMu.RUnlock()
+	if err != nil {
+		return err
+	}
+	d.dataMu.Lock()
+	atomic.StoreUint64(&d.size, size)
+	d.dataMu.Unlock()
+	if d.cachedMetadataAuthoritative() {
+		d.touchCMtimeLocked()
+	}
+	return nil
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Check for reading at EOF before calling into MM (but not under
+	// InteropModeShared, which makes d.size unreliable).
+	d := fd.dentry()
+	if d.cachedMetadataAuthoritative() && uint64(offset) >= atomic.LoadUint64(&d.size) {
+		return 0, io.EOF
+	}
+
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Lock d.metadataMu for the rest of the read to prevent d.size from
+		// changing.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+		// Write dirty cached pages that will be touched by the read back to
+		// the remote file.
+		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
+			return 0, err
+		}
+	}
+
+	rw := getDentryReadWriter(ctx, d, offset)
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		// Require the read to go to the remote file.
+		rw.direct = true
+	}
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putDentryReadWriter(rw)
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+		d.touchAtime(fd.vfsfd.Mount())
+	}
+	return n, err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset, error. The final
+// offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+	if offset < 0 {
+		return 0, offset, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	// If the fd was opened with O_APPEND, make sure the file size is updated.
+	// There is a possible race here if size is modified externally after
+	// metadata cache is updated.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return 0, offset, err
+		}
+	}
+
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+
+	// Set offset to file size if the fd was opened with O_APPEND.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Holding d.metadataMu is sufficient for reading d.size.
+		offset = int64(d.size)
+	}
+	limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+	if err != nil {
+		return 0, offset, err
+	}
+	src = src.TakeFirst64(limit)
+
+	if d.fs.opts.interop != InteropModeShared {
+		// Compare Linux's mm/filemap.c:__generic_file_write_iter() =>
+		// file_update_time(). This is d.touchCMtime(), but without locking
+		// d.metadataMu (recursively).
+		d.touchCMtimeLocked()
+	}
+
+	rw := getDentryReadWriter(ctx, d, offset)
+	defer putDentryReadWriter(rw)
+
+	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		if err := fd.writeCache(ctx, d, offset, src); err != nil {
+			return 0, offset, err
+		}
+
+		// Require the write to go to the remote file.
+		rw.direct = true
+	}
+
+	n, err := src.CopyInTo(ctx, rw)
+	if err != nil {
+		return n, offset + n, err
+	}
+	if n > 0 && fd.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+		// Note that if any of the following fail, then we can't guarantee that
+		// any data was actually written with the semantics of O_DSYNC or
+		// O_SYNC, so we return zero bytes written. Compare Linux's
+		// mm/filemap.c:generic_file_write_iter() =>
+		// include/linux/fs.h:generic_write_sync().
+		//
+		// Write dirty cached pages touched by the write back to the remote
+		// file.
+		if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+			return 0, offset, err
+		}
+		// Request the remote filesystem to sync the remote file.
+		if err := d.syncRemoteFile(ctx); err != nil {
+			return 0, offset, err
+		}
+	}
+	return n, offset + n, nil
+}
+
+func (fd *regularFileFD) writeCache(ctx context.Context, d *dentry, offset int64, src usermem.IOSequence) error {
+	// Write dirty cached pages that will be touched by the write back to
+	// the remote file.
+	if err := d.writeback(ctx, offset, src.NumBytes()); err != nil {
+		return err
+	}
+
+	// Remove touched pages from the cache.
+	pgstart := usermem.PageRoundDown(uint64(offset))
+	pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
+	if !ok {
+		return syserror.EINVAL
+	}
+	mr := memmap.MappableRange{pgstart, pgend}
+	var freed []memmap.FileRange
+
+	d.dataMu.Lock()
+	cseg := d.cache.LowerBoundSegment(mr.Start)
+	for cseg.Ok() && cseg.Start() < mr.End {
+		cseg = d.cache.Isolate(cseg, mr)
+		freed = append(freed, memmap.FileRange{cseg.Value(), cseg.Value() + cseg.Range().Length()})
+		cseg = d.cache.Remove(cseg).NextSegment()
+	}
+	d.dataMu.Unlock()
+
+	// Invalidate mappings of removed pages.
+	d.mapsMu.Lock()
+	d.mappings.Invalidate(mr, memmap.InvalidateOpts{})
+	d.mapsMu.Unlock()
+
+	// Finally free pages removed from the cache.
+	mf := d.fs.mfp.MemoryFile()
+	for _, freedFR := range freed {
+		mf.DecRef(freedFR)
+	}
+	return nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.mu.Lock()
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
+	fd.mu.Unlock()
+	return n, err
+}
+
+type dentryReadWriter struct {
+	ctx    context.Context
+	d      *dentry
+	off    uint64
+	direct bool
+}
+
+var dentryReadWriterPool = sync.Pool{
+	New: func() interface{} {
+		return &dentryReadWriter{}
+	},
+}
+
+func getDentryReadWriter(ctx context.Context, d *dentry, offset int64) *dentryReadWriter {
+	rw := dentryReadWriterPool.Get().(*dentryReadWriter)
+	rw.ctx = ctx
+	rw.d = d
+	rw.off = uint64(offset)
+	rw.direct = false
+	return rw
+}
+
+func putDentryReadWriter(rw *dentryReadWriter) {
+	rw.ctx = nil
+	rw.d = nil
+	dentryReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	if dsts.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents and makes dentry.size
+	// unreliable), or if the file was opened O_DIRECT, read directly from
+	// dentry.readHandleLocked() without locking dentry.dataMu.
+	rw.d.handleMu.RLock()
+	h := rw.d.readHandleLocked()
+	if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := h.readToBlocksAt(rw.ctx, dsts, rw.off)
+		rw.d.handleMu.RUnlock()
+		rw.off += n
+		return n, err
+	}
+
+	// Otherwise read from/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	fillCache := mf.ShouldCacheEvictable()
+	var dataMuUnlock func()
+	if fillCache {
+		rw.d.dataMu.Lock()
+		dataMuUnlock = rw.d.dataMu.Unlock
+	} else {
+		rw.d.dataMu.RLock()
+		dataMuUnlock = rw.d.dataMu.RUnlock
+	}
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= rw.d.size {
+		dataMuUnlock()
+		rw.d.handleMu.RUnlock()
+		return 0, io.EOF
+	}
+	end := rw.d.size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += n
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				dataMuUnlock()
+				rw.d.handleMu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			gapMR := gap.Range().Intersect(mr)
+			if fillCache {
+				// Read into the cache, then re-enter the loop to read from the
+				// cache.
+				gapEnd, _ := usermem.PageRoundUp(gapMR.End)
+				reqMR := memmap.MappableRange{
+					Start: usermem.PageRoundDown(gapMR.Start),
+					End:   gapEnd,
+				}
+				optMR := gap.Range()
+				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt)
+				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
+				seg, gap = rw.d.cache.Find(rw.off)
+				if !seg.Ok() {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+				// err might have occurred in part of gap.Range() outside
+				// gapMR. Forget about it for now; if the error matters and
+				// persists, we'll run into it again in a later iteration of
+				// this loop.
+			} else {
+				// Read directly from the file.
+				gapDsts := dsts.TakeFirst64(gapMR.Length())
+				n, err := h.readToBlocksAt(rw.ctx, gapDsts, gapMR.Start)
+				done += n
+				rw.off += n
+				dsts = dsts.DropFirst64(n)
+				// Partial reads are fine. But we must stop reading.
+				if n != gapDsts.NumBytes() || err != nil {
+					dataMuUnlock()
+					rw.d.handleMu.RUnlock()
+					return done, err
+				}
+
+				// Continue.
+				seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+			}
+		}
+	}
+	dataMuUnlock()
+	rw.d.handleMu.RUnlock()
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: rw.d.metadataMu must be locked.
+func (rw *dentryReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	if srcs.IsEmpty() {
+		return 0, nil
+	}
+
+	// If we have a mmappable host FD (which must be used here to ensure
+	// coherence with memory-mapped I/O), or if InteropModeShared is in effect
+	// (which prevents us from caching file contents), or if the file was
+	// opened with O_DIRECT, write directly to dentry.writeHandleLocked()
+	// without locking dentry.dataMu.
+	rw.d.handleMu.RLock()
+	h := rw.d.writeHandleLocked()
+	if (rw.d.hostFD >= 0 && !rw.d.fs.opts.forcePageCache) || rw.d.fs.opts.interop == InteropModeShared || rw.direct {
+		n, err := h.writeFromBlocksAt(rw.ctx, srcs, rw.off)
+		rw.off += n
+		rw.d.dataMu.Lock()
+		if rw.off > rw.d.size {
+			atomic.StoreUint64(&rw.d.size, rw.off)
+			// The remote file's size will implicitly be extended to the correct
+			// value when we write back to it.
+		}
+		rw.d.dataMu.Unlock()
+		rw.d.handleMu.RUnlock()
+		return n, err
+	}
+
+	// Otherwise write to/through the cache.
+	mf := rw.d.fs.mfp.MemoryFile()
+	rw.d.dataMu.Lock()
+
+	// Compute the range to write (overflow-checked).
+	start := rw.off
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.d.cache.Find(rw.off)
+	for rw.off < end {
+		mr := memmap.MappableRange{rw.off, end}
+		switch {
+		case seg.Ok():
+			// Get internal mappings from the cache.
+			segMR := seg.Range().Intersect(mr)
+			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			rw.d.dirty.MarkDirty(segMR)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Write directly to the file. At present, we never fill the cache
+			// when writing, since doing so can convert small writes into
+			// inefficient read-modify-write cycles, and we have no mechanism
+			// for detecting or avoiding this.
+			gapMR := gap.Range().Intersect(mr)
+			gapSrcs := srcs.TakeFirst64(gapMR.Length())
+			n, err := h.writeFromBlocksAt(rw.ctx, gapSrcs, gapMR.Start)
+			done += n
+			rw.off += n
+			srcs = srcs.DropFirst64(n)
+			// Partial writes are fine. But we must stop writing.
+			if n != gapSrcs.NumBytes() || err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop:
+	if rw.off > rw.d.size {
+		atomic.StoreUint64(&rw.d.size, rw.off)
+		// The remote file's size will implicitly be extended to the correct
+		// value when we write back to it.
+	}
+	// If InteropModeWritethrough is in effect, flush written data back to the
+	// remote filesystem.
+	if rw.d.fs.opts.interop == InteropModeWritethrough && done != 0 {
+		if err := fsutil.SyncDirty(rw.ctx, memmap.MappableRange{
+			Start: start,
+			End:   rw.off,
+		}, &rw.d.cache, &rw.d.dirty, rw.d.size, mf, h.writeFromBlocksAt); err != nil {
+			// We have no idea how many bytes were actually flushed.
+			rw.off = start
+			done = 0
+			retErr = err
+		}
+	}
+	rw.d.dataMu.Unlock()
+	rw.d.handleMu.RUnlock()
+	return done, retErr
+}
+
+func (d *dentry) writeback(ctx context.Context, offset, size int64) error {
+	if size == 0 {
+		return nil
+	}
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	h := d.writeHandleLocked()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	// Compute the range of valid bytes (overflow-checked).
+	if uint64(offset) >= d.size {
+		return nil
+	}
+	end := int64(d.size)
+	if rend := offset + size; rend > offset && rend < end {
+		end = rend
+	}
+	return fsutil.SyncDirty(ctx, memmap.MappableRange{
+		Start: uint64(offset),
+		End:   uint64(end),
+	}, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+	if err != nil {
+		return 0, err
+	}
+	fd.off = newOffset
+	return newOffset, nil
+}
+
+// Calculate the new offset for a seek operation on a regular file.
+func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int64, whence int32) (int64, error) {
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as specified.
+	case linux.SEEK_CUR:
+		offset += fdOffset
+	case linux.SEEK_END, linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Ensure file size is up to date.
+		if !d.cachedMetadataAuthoritative() {
+			if err := d.updateFromGetattr(ctx); err != nil {
+				return 0, err
+			}
+		}
+		size := int64(atomic.LoadUint64(&d.size))
+		// For SEEK_DATA and SEEK_HOLE, treat the file as a single contiguous
+		// block of data.
+		switch whence {
+		case linux.SEEK_END:
+			offset += size
+		case linux.SEEK_DATA:
+			if offset > size {
+				return 0, syserror.ENXIO
+			}
+			// Use offset as specified.
+		case linux.SEEK_HOLE:
+			if offset > size {
+				return 0, syserror.ENXIO
+			}
+			offset = size
+		}
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	return fd.dentry().syncCachedFile(ctx)
+}
+
+func (d *dentry) syncCachedFile(ctx context.Context) error {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+
+	if h := d.writeHandleLocked(); h.isOpen() {
+		d.dataMu.Lock()
+		// Write dirty cached data to the remote file.
+		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+		d.dataMu.Unlock()
+		if err != nil {
+			return err
+		}
+	}
+	return d.syncRemoteFileLocked(ctx)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	d := fd.dentry()
+	switch d.fs.opts.interop {
+	case InteropModeExclusive:
+		// Any mapping is fine.
+	case InteropModeWritethrough:
+		// Shared writable mappings require a host FD, since otherwise we can't
+		// synchronously flush memory-mapped writes to the remote file.
+		if opts.Private || !opts.MaxPerms.Write {
+			break
+		}
+		fallthrough
+	case InteropModeShared:
+		// All mappings require a host FD to be coherent with other filesystem
+		// users.
+		if d.fs.opts.forcePageCache {
+			// Whether or not we have a host FD, we're not allowed to use it.
+			return syserror.ENODEV
+		}
+		d.handleMu.RLock()
+		haveFD := d.hostFD >= 0
+		d.handleMu.RUnlock()
+		if !haveFD {
+			return syserror.ENODEV
+		}
+	default:
+		panic(fmt.Sprintf("unknown InteropMode %v", d.fs.opts.interop))
+	}
+	// After this point, d may be used as a memmap.Mappable.
+	d.pf.hostFileMapperInitOnce.Do(d.pf.hostFileMapper.Init)
+	return vfs.GenericConfigureMMap(&fd.vfsfd, d, opts)
+}
+
+func (d *dentry) mayCachePages() bool {
+	if d.fs.opts.interop == InteropModeShared {
+		return false
+	}
+	if d.fs.opts.forcePageCache {
+		return true
+	}
+	d.handleMu.RLock()
+	haveFD := d.hostFD >= 0
+	d.handleMu.RUnlock()
+	return haveFD
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	mapped := d.mappings.AddMapping(ms, ar, offset, writable)
+	// Do this unconditionally since whether we have a host FD can change
+	// across save/restore.
+	for _, r := range mapped {
+		d.pf.hostFileMapper.IncRefOn(r)
+	}
+	if d.mayCachePages() {
+		// d.Evict() will refuse to evict memory-mapped pages, so tell the
+		// MemoryFile to not bother trying.
+		mf := d.fs.mfp.MemoryFile()
+		for _, r := range mapped {
+			mf.MarkUnevictable(d, pgalloc.EvictableRange{r.Start, r.End})
+		}
+	}
+	d.mapsMu.Unlock()
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	d.mapsMu.Lock()
+	unmapped := d.mappings.RemoveMapping(ms, ar, offset, writable)
+	for _, r := range unmapped {
+		d.pf.hostFileMapper.DecRefOn(r)
+	}
+	if d.mayCachePages() {
+		// Pages that are no longer referenced by any application memory
+		// mappings are now considered unused; allow MemoryFile to evict them
+		// when necessary.
+		mf := d.fs.mfp.MemoryFile()
+		d.dataMu.Lock()
+		for _, r := range unmapped {
+			// Since these pages are no longer mapped, they are no longer
+			// concurrently dirtyable by a writable memory mapping.
+			d.dirty.AllowClean(r)
+			mf.MarkEvictable(d, pgalloc.EvictableRange{r.Start, r.End})
+		}
+		d.dataMu.Unlock()
+	}
+	d.mapsMu.Unlock()
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return d.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	d.handleMu.RLock()
+	if d.hostFD >= 0 && !d.fs.opts.forcePageCache {
+		d.handleMu.RUnlock()
+		mr := optional
+		if d.fs.opts.limitHostFDTranslation {
+			mr = maxFillRange(required, optional)
+		}
+		return []memmap.Translation{
+			{
+				Source: mr,
+				File:   &d.pf,
+				Offset: mr.Start,
+				Perms:  usermem.AnyAccess,
+			},
+		}, nil
+	}
+
+	d.dataMu.Lock()
+
+	// Constrain translations to d.size (rounded up) to prevent translation to
+	// pages that may be concurrently truncated.
+	pgend, _ := usermem.PageRoundUp(d.size)
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			d.dataMu.Unlock()
+			d.handleMu.RUnlock()
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	mf := d.fs.mfp.MemoryFile()
+	h := d.readHandleLocked()
+	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt)
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := d.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		// TODO(jamieliu): Make Translations writable even if writability is
+		// not required if already kept-dirty by another writable translation.
+		perms := usermem.AccessType{
+			Read:    true,
+			Execute: true,
+		}
+		if at.Write {
+			// From this point forward, this memory can be dirtied through the
+			// mapping at any time.
+			d.dirty.KeepDirty(segMR)
+			perms.Write = true
+		}
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   mf,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  perms,
+		})
+		translatedEnd = segMR.End
+	}
+
+	d.dataMu.Unlock()
+	d.handleMu.RUnlock()
+
+	// Don't return the error returned by c.cache.Fill if it occurred outside
+	// of required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
+	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
+	if required.Length() >= maxReadahead {
+		return required
+	}
+	if optional.Length() <= maxReadahead {
+		return optional
+	}
+	optional.Start = required.Start
+	if optional.Length() <= maxReadahead {
+		return optional
+	}
+	optional.End = optional.Start + maxReadahead
+	return optional
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+	// Whether we have a host fd (and consequently what memmap.File is
+	// mapped) can change across save/restore, so invalidate all translations
+	// unconditionally.
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.mappings.InvalidateAll(memmap.InvalidateOpts{})
+
+	// Write the cache's contents back to the remote file so that if we have a
+	// host fd after restore, the remote file's contents are coherent.
+	mf := d.fs.mfp.MemoryFile()
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	h := d.writeHandleLocked()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+		return err
+	}
+
+	// Discard the cache so that it's not stored in saved state. This is safe
+	// because per InvalidateUnsavable invariants, no new translations can have
+	// been returned after we invalidated all existing translations above.
+	d.cache.DropAll(mf)
+	d.dirty.RemoveAll()
+
+	return nil
+}
+
+// Evict implements pgalloc.EvictableMemoryUser.Evict.
+func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
+	mr := memmap.MappableRange{er.Start, er.End}
+	mf := d.fs.mfp.MemoryFile()
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	h := d.writeHandleLocked()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+
+	// Only allow pages that are no longer memory-mapped to be evicted.
+	for mgap := d.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
+		mgapMR := mgap.Range().Intersect(mr)
+		if mgapMR.Length() == 0 {
+			continue
+		}
+		if err := fsutil.SyncDirty(ctx, mgapMR, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
+			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
+		}
+		d.cache.Drop(mgapMR, mf)
+		d.dirty.KeepClean(mgapMR)
+	}
+}
+
+// dentryPlatformFile implements memmap.File. It exists solely because dentry
+// cannot implement both vfs.DentryImpl.IncRef and memmap.File.IncRef.
+//
+// dentryPlatformFile is only used when a host FD representing the remote file
+// is available (i.e. dentry.hostFD >= 0), and that FD is used for application
+// memory mappings (i.e. !filesystem.opts.forcePageCache).
+type dentryPlatformFile struct {
+	*dentry
+
+	// fdRefs counts references on memmap.File offsets. fdRefs is protected
+	// by dentry.dataMu.
+	fdRefs fsutil.FrameRefSet
+
+	// If this dentry represents a regular file, and dentry.hostFD >= 0,
+	// hostFileMapper caches mappings of dentry.hostFD.
+	hostFileMapper fsutil.HostFileMapper
+
+	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
+	hostFileMapperInitOnce sync.Once
+}
+
+// IncRef implements memmap.File.IncRef.
+func (d *dentryPlatformFile) IncRef(fr memmap.FileRange) {
+	d.dataMu.Lock()
+	d.fdRefs.IncRefAndAccount(fr)
+	d.dataMu.Unlock()
+}
+
+// DecRef implements memmap.File.DecRef.
+func (d *dentryPlatformFile) DecRef(fr memmap.FileRange) {
+	d.dataMu.Lock()
+	d.fdRefs.DecRefAndAccount(fr)
+	d.dataMu.Unlock()
+}
+
+// MapInternal implements memmap.File.MapInternal.
+func (d *dentryPlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return d.hostFileMapper.MapInternal(fr, int(d.hostFD), at.Write)
+}
+
+// FD implements memmap.File.FD.
+func (d *dentryPlatformFile) FD() int {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	return int(d.hostFD)
+}
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
new file mode 100644
index 000000000..85d2bee72
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -0,0 +1,146 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func (d *dentry) isSocket() bool {
+	return d.fileType() == linux.S_IFSOCK
+}
+
+// endpoint is a Gofer-backed transport.BoundEndpoint.
+//
+// An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
+// is called and either BoundEndpoint.BidirectionalConnect or
+// BoundEndpoint.UnidirectionalConnect is called.
+type endpoint struct {
+	// dentry is the filesystem dentry which produced this endpoint.
+	dentry *dentry
+
+	// file is the p9 file that contains a single unopened fid.
+	file p9.File
+
+	// path is the sentry path where this endpoint is bound.
+	path string
+}
+
+func sockTypeToP9(t linux.SockType) (p9.ConnectFlags, bool) {
+	switch t {
+	case linux.SOCK_STREAM:
+		return p9.StreamSocket, true
+	case linux.SOCK_SEQPACKET:
+		return p9.SeqpacketSocket, true
+	case linux.SOCK_DGRAM:
+		return p9.DgramSocket, true
+	}
+	return 0, false
+}
+
+// BidirectionalConnect implements ConnectableEndpoint.BidirectionalConnect.
+func (e *endpoint) BidirectionalConnect(ctx context.Context, ce transport.ConnectingEndpoint, returnConnect func(transport.Receiver, transport.ConnectedEndpoint)) *syserr.Error {
+	cf, ok := sockTypeToP9(ce.Type())
+	if !ok {
+		return syserr.ErrConnectionRefused
+	}
+
+	// No lock ordering required as only the ConnectingEndpoint has a mutex.
+	ce.Lock()
+
+	// Check connecting state.
+	if ce.Connected() {
+		ce.Unlock()
+		return syserr.ErrAlreadyConnected
+	}
+	if ce.Listening() {
+		ce.Unlock()
+		return syserr.ErrInvalidEndpointState
+	}
+
+	c, err := e.newConnectedEndpoint(ctx, cf, ce.WaiterQueue())
+	if err != nil {
+		ce.Unlock()
+		return err
+	}
+
+	returnConnect(c, c)
+	ce.Unlock()
+	if err := c.Init(); err != nil {
+		return syserr.FromError(err)
+	}
+
+	return nil
+}
+
+// UnidirectionalConnect implements
+// transport.BoundEndpoint.UnidirectionalConnect.
+func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.ConnectedEndpoint, *syserr.Error) {
+	c, err := e.newConnectedEndpoint(ctx, p9.DgramSocket, &waiter.Queue{})
+	if err != nil {
+		return nil, err
+	}
+
+	if err := c.Init(); err != nil {
+		return nil, syserr.FromError(err)
+	}
+
+	// We don't need the receiver.
+	c.CloseRecv()
+	c.Release(ctx)
+
+	return c, nil
+}
+
+func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
+	hostFile, err := e.file.Connect(flags)
+	if err != nil {
+		return nil, syserr.ErrConnectionRefused
+	}
+	// Dup the fd so that the new endpoint can manage its lifetime.
+	hostFD, err := syscall.Dup(hostFile.FD())
+	if err != nil {
+		log.Warningf("Could not dup host socket fd %d: %v", hostFile.FD(), err)
+		return nil, syserr.FromError(err)
+	}
+	// After duplicating, we no longer need hostFile.
+	hostFile.Close()
+
+	c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path)
+	if serr != nil {
+		log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr)
+		return nil, serr
+	}
+	return c, nil
+}
+
+// Release implements transport.BoundEndpoint.Release.
+func (e *endpoint) Release(ctx context.Context) {
+	e.dentry.DecRef(ctx)
+}
+
+// Passcred implements transport.BoundEndpoint.Passcred.
+func (e *endpoint) Passcred() bool {
+	return false
+}
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
new file mode 100644
index 000000000..a6368fdd0
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -0,0 +1,292 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// specialFileFD implements vfs.FileDescriptionImpl for pipes, sockets, device
+// special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
+// in effect) regular files. specialFileFD differs from regularFileFD by using
+// per-FD handles instead of shared per-dentry handles, and never buffering I/O.
+type specialFileFD struct {
+	fileDescription
+
+	// handle is used for file I/O. handle is immutable.
+	handle handle
+
+	// seekable is true if this file description represents a file for which
+	// file offset is significant, i.e. a regular file. seekable is immutable.
+	seekable bool
+
+	// haveQueue is true if this file description represents a file for which
+	// queue may send I/O readiness events. haveQueue is immutable.
+	haveQueue bool
+	queue     waiter.Queue
+
+	// If seekable is true, off is the file offset. off is protected by mu.
+	mu  sync.Mutex
+	off int64
+}
+
+func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
+	ftype := d.fileType()
+	seekable := ftype == linux.S_IFREG
+	haveQueue := (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && h.fd >= 0
+	fd := &specialFileFD{
+		handle:    h,
+		seekable:  seekable,
+		haveQueue: haveQueue,
+	}
+	fd.LockFD.Init(locks)
+	if haveQueue {
+		if err := fdnotifier.AddFD(h.fd, &fd.queue); err != nil {
+			return nil, err
+		}
+	}
+	if err := fd.vfsfd.Init(fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+		DenyPRead:  !seekable,
+		DenyPWrite: !seekable,
+	}); err != nil {
+		if haveQueue {
+			fdnotifier.RemoveFD(h.fd)
+		}
+		return nil, err
+	}
+	return fd, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *specialFileFD) Release(ctx context.Context) {
+	if fd.haveQueue {
+		fdnotifier.RemoveFD(fd.handle.fd)
+	}
+	fd.handle.close(ctx)
+	fs := fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+	fs.syncMu.Lock()
+	delete(fs.specialFileFDs, fd)
+	fs.syncMu.Unlock()
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *specialFileFD) OnClose(ctx context.Context) error {
+	if !fd.vfsfd.IsWritable() {
+		return nil
+	}
+	return fd.handle.file.flush(ctx)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *specialFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	if fd.haveQueue {
+		return fdnotifier.NonBlockingPoll(fd.handle.fd, mask)
+	}
+	return fd.fileDescription.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *specialFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	if fd.haveQueue {
+		fd.queue.EventRegister(e, mask)
+		fdnotifier.UpdateFD(fd.handle.fd)
+		return
+	}
+	fd.fileDescription.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
+	if fd.haveQueue {
+		fd.queue.EventUnregister(e)
+		fdnotifier.UpdateFD(fd.handle.fd)
+		return
+	}
+	fd.fileDescription.EventUnregister(e)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if fd.seekable && offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	// Going through dst.CopyOutFrom() holds MM locks around file operations of
+	// unknown duration. For regularFileFD, doing so is necessary to support
+	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
+	// hold here since specialFileFD doesn't client-cache data. Just buffer the
+	// read instead.
+	if d := fd.dentry(); d.cachedMetadataAuthoritative() {
+		d.touchAtime(fd.vfsfd.Mount())
+	}
+	buf := make([]byte, dst.NumBytes())
+	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
+	if n == 0 {
+		return 0, err
+	}
+	if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
+		return int64(cp), cperr
+	}
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *specialFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	if !fd.seekable {
+		return fd.PRead(ctx, dst, -1, opts)
+	}
+
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *specialFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset, error. The final
+// offset should be ignored by PWrite.
+func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+	if fd.seekable && offset < 0 {
+		return 0, offset, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	d := fd.dentry()
+	// If the regular file fd was opened with O_APPEND, make sure the file size
+	// is updated. There is a possible race here if size is modified externally
+	// after metadata cache is updated.
+	if fd.seekable && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return 0, offset, err
+		}
+	}
+
+	if fd.seekable {
+		// We need to hold the metadataMu *while* writing to a regular file.
+		d.metadataMu.Lock()
+		defer d.metadataMu.Unlock()
+
+		// Set offset to file size if the regular file was opened with O_APPEND.
+		if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+			// Holding d.metadataMu is sufficient for reading d.size.
+			offset = int64(d.size)
+		}
+		limit, err := vfs.CheckLimit(ctx, offset, src.NumBytes())
+		if err != nil {
+			return 0, offset, err
+		}
+		src = src.TakeFirst64(limit)
+	}
+
+	// Do a buffered write. See rationale in PRead.
+	if d.cachedMetadataAuthoritative() {
+		d.touchCMtime()
+	}
+	buf := make([]byte, src.NumBytes())
+	// Don't do partial writes if we get a partial read from src.
+	if _, err := src.CopyIn(ctx, buf); err != nil {
+		return 0, offset, err
+	}
+	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	if err == syserror.EAGAIN {
+		err = syserror.ErrWouldBlock
+	}
+	finalOff = offset
+	// Update file size for regular files.
+	if fd.seekable {
+		finalOff += int64(n)
+		// d.metadataMu is already locked at this point.
+		if uint64(finalOff) > d.size {
+			d.dataMu.Lock()
+			defer d.dataMu.Unlock()
+			atomic.StoreUint64(&d.size, uint64(finalOff))
+		}
+	}
+	return int64(n), finalOff, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *specialFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	if !fd.seekable {
+		return fd.PWrite(ctx, src, -1, opts)
+	}
+
+	fd.mu.Lock()
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
+	fd.mu.Unlock()
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if !fd.seekable {
+		return 0, syserror.ESPIPE
+	}
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	newOffset, err := regularFileSeekLocked(ctx, fd.dentry(), fd.off, offset, whence)
+	if err != nil {
+		return 0, err
+	}
+	fd.off = newOffset
+	return newOffset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *specialFileFD) Sync(ctx context.Context) error {
+	// If we have a host FD, fsyncing it is likely to be faster than an fsync
+	// RPC.
+	if fd.handle.fd >= 0 {
+		ctx.UninterruptibleSleepStart(false)
+		err := syscall.Fsync(int(fd.handle.fd))
+		ctx.UninterruptibleSleepFinish(false)
+		return err
+	}
+	return fd.handle.file.fsync(ctx)
+}
diff --git a/pkg/sentry/fsimpl/gofer/symlink.go b/pkg/sentry/fsimpl/gofer/symlink.go
new file mode 100644
index 000000000..2ec819f86
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/symlink.go
@@ -0,0 +1,47 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func (d *dentry) isSymlink() bool {
+	return d.fileType() == linux.S_IFLNK
+}
+
+// Precondition: d.isSymlink().
+func (d *dentry) readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+	if d.fs.opts.interop != InteropModeShared {
+		d.touchAtime(mnt)
+		d.dataMu.Lock()
+		if d.haveTarget {
+			target := d.target
+			d.dataMu.Unlock()
+			return target, nil
+		}
+	}
+	target, err := d.file.readlink(ctx)
+	if d.fs.opts.interop != InteropModeShared {
+		if err == nil {
+			d.haveTarget = true
+			d.target = target
+		}
+		d.dataMu.Unlock()
+	}
+	return target, err
+}
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
new file mode 100644
index 000000000..2cb8191b9
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -0,0 +1,82 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func dentryTimestampFromP9(s, ns uint64) int64 {
+	return int64(s*1e9 + ns)
+}
+
+func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
+	return ts.Sec*1e9 + int64(ts.Nsec)
+}
+
+func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
+	return linux.StatxTimestamp{
+		Sec:  ns / 1e9,
+		Nsec: uint32(ns % 1e9),
+	}
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true.
+func (d *dentry) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now := d.fs.clock.Now().Nanoseconds()
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.atime, now)
+	atomic.StoreUint32(&d.atimeDirty, 1)
+	d.metadataMu.Unlock()
+	mnt.EndWrite()
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCtime() {
+	now := d.fs.clock.Now().Nanoseconds()
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.ctime, now)
+	d.metadataMu.Unlock()
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// successfully called vfs.Mount.CheckBeginWrite().
+func (d *dentry) touchCMtime() {
+	now := d.fs.clock.Now().Nanoseconds()
+	d.metadataMu.Lock()
+	atomic.StoreInt64(&d.mtime, now)
+	atomic.StoreInt64(&d.ctime, now)
+	atomic.StoreUint32(&d.mtimeDirty, 1)
+	d.metadataMu.Unlock()
+}
+
+// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
+// locked d.metadataMu.
+func (d *dentry) touchCMtimeLocked() {
+	now := d.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&d.mtime, now)
+	atomic.StoreInt64(&d.ctime, now)
+	atomic.StoreUint32(&d.mtimeDirty, 1)
+}
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
new file mode 100644
index 000000000..bd701bbc7
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -0,0 +1,52 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "host",
+    srcs = [
+        "control.go",
+        "host.go",
+        "ioctl_unsafe.go",
+        "mmap.go",
+        "socket.go",
+        "socket_iovec.go",
+        "socket_unsafe.go",
+        "tty.go",
+        "util.go",
+        "util_unsafe.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fdnotifier",
+        "//pkg/fspath",
+        "//pkg/iovec",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/hostfd",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/control",
+        "//pkg/sentry/socket/unix",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/unimpl",
+        "//pkg/sentry/uniqueid",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserr",
+        "//pkg/syserror",
+        "//pkg/tcpip",
+        "//pkg/unet",
+        "//pkg/usermem",
+        "//pkg/waiter",
+        "@org_golang_x_sys//unix:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go
new file mode 100644
index 000000000..0135e4428
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/control.go
@@ -0,0 +1,96 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type scmRights struct {
+	fds []int
+}
+
+func newSCMRights(fds []int) control.SCMRightsVFS2 {
+	return &scmRights{fds}
+}
+
+// Files implements control.SCMRights.Files.
+func (c *scmRights) Files(ctx context.Context, max int) (control.RightsFilesVFS2, bool) {
+	n := max
+	var trunc bool
+	if l := len(c.fds); n > l {
+		n = l
+	} else if n < l {
+		trunc = true
+	}
+
+	rf := control.RightsFilesVFS2(fdsToFiles(ctx, c.fds[:n]))
+
+	// Only consume converted FDs (fdsToFiles may convert fewer than n FDs).
+	c.fds = c.fds[len(rf):]
+	return rf, trunc
+}
+
+// Clone implements transport.RightsControlMessage.Clone.
+func (c *scmRights) Clone() transport.RightsControlMessage {
+	// Host rights never need to be cloned.
+	return nil
+}
+
+// Release implements transport.RightsControlMessage.Release.
+func (c *scmRights) Release(ctx context.Context) {
+	for _, fd := range c.fds {
+		syscall.Close(fd)
+	}
+	c.fds = nil
+}
+
+// If an error is encountered, only files created before the error will be
+// returned. This is what Linux does.
+func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription {
+	files := make([]*vfs.FileDescription, 0, len(fds))
+	for _, fd := range fds {
+		// Get flags. We do it here because they may be modified
+		// by subsequent functions.
+		fileFlags, _, errno := syscall.Syscall(syscall.SYS_FCNTL, uintptr(fd), syscall.F_GETFL, 0)
+		if errno != 0 {
+			ctx.Warningf("Error retrieving host FD flags: %v", error(errno))
+			break
+		}
+
+		// Create the file backed by hostFD.
+		file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */)
+		if err != nil {
+			ctx.Warningf("Error creating file from host FD: %v", err)
+			break
+		}
+
+		if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil {
+			ctx.Warningf("Error setting flags on host FD file: %v", err)
+			break
+		}
+
+		files = append(files, file)
+	}
+	return files
+}
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
new file mode 100644
index 000000000..bd6caba06
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -0,0 +1,769 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package host provides a filesystem implementation for host files imported as
+// file descriptors.
+package host
+
+import (
+	"fmt"
+	"math"
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	unixsocket "gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// NewFDOptions contains options to NewFD.
+type NewFDOptions struct {
+	// If IsTTY is true, the file descriptor is a TTY.
+	IsTTY bool
+
+	// If HaveFlags is true, use Flags for the new file description. Otherwise,
+	// the new file description will inherit flags from hostFD.
+	HaveFlags bool
+	Flags     uint32
+}
+
+// NewFD returns a vfs.FileDescription representing the given host file
+// descriptor. mnt must be Kernel.HostMount().
+func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions) (*vfs.FileDescription, error) {
+	fs, ok := mnt.Filesystem().Impl().(*filesystem)
+	if !ok {
+		return nil, fmt.Errorf("can't import host FDs into filesystems of type %T", mnt.Filesystem().Impl())
+	}
+
+	// Retrieve metadata.
+	var s unix.Stat_t
+	if err := unix.Fstat(hostFD, &s); err != nil {
+		return nil, err
+	}
+
+	flags := opts.Flags
+	if !opts.HaveFlags {
+		// Get flags for the imported FD.
+		flagsInt, err := unix.FcntlInt(uintptr(hostFD), syscall.F_GETFL, 0)
+		if err != nil {
+			return nil, err
+		}
+		flags = uint32(flagsInt)
+	}
+
+	fileMode := linux.FileMode(s.Mode)
+	fileType := fileMode.FileType()
+
+	// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
+	// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
+	// devices.
+	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+	seekable := err != syserror.ESPIPE
+
+	i := &inode{
+		hostFD:     hostFD,
+		ino:        fs.NextIno(),
+		isTTY:      opts.IsTTY,
+		wouldBlock: wouldBlock(uint32(fileType)),
+		seekable:   seekable,
+		// NOTE(b/38213152): Technically, some obscure char devices can be memory
+		// mapped, but we only allow regular files.
+		canMap: fileType == linux.S_IFREG,
+	}
+	i.pf.inode = i
+
+	// Non-seekable files can't be memory mapped, assert this.
+	if !i.seekable && i.canMap {
+		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
+	}
+
+	// If the hostFD would block, we must set it to non-blocking and handle
+	// blocking behavior in the sentry.
+	if i.wouldBlock {
+		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+			return nil, err
+		}
+		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+			return nil, err
+		}
+	}
+
+	d := &kernfs.Dentry{}
+	d.Init(i)
+
+	// i.open will take a reference on d.
+	defer d.DecRef(ctx)
+
+	// For simplicity, fileDescription.offset is set to 0. Technically, we
+	// should only set to 0 on files that are not seekable (sockets, pipes,
+	// etc.), and use the offset from the host fd otherwise when importing.
+	return i.open(ctx, d.VFSDentry(), mnt, flags)
+}
+
+// ImportFD sets up and returns a vfs.FileDescription from a donated fd.
+func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
+	return NewFD(ctx, mnt, hostFD, &NewFDOptions{
+		IsTTY: isTTY,
+	})
+}
+
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("host.filesystemType.GetFilesystem should never be called")
+}
+
+// Name implements FilesystemType.Name.
+func (filesystemType) Name() string {
+	return "none"
+}
+
+// NewFilesystem sets up and returns a new hostfs filesystem.
+//
+// Note that there should only ever be one instance of host.filesystem,
+// a global mount for host fds.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, err
+	}
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
+	return fs.VFSFilesystem(), nil
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release(ctx)
+}
+
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	d := vd.Dentry().Impl().(*kernfs.Dentry)
+	inode := d.Inode().(*inode)
+	b.PrependComponent(fmt.Sprintf("host:[%d]", inode.ino))
+	return vfs.PrependPathSyntheticError{}
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
+
+	// When the reference count reaches zero, the host fd is closed.
+	refs.AtomicRefCount
+
+	// hostFD contains the host fd that this file was originally created from,
+	// which must be available at time of restore.
+	//
+	// This field is initialized at creation time and is immutable.
+	hostFD int
+
+	// ino is an inode number unique within this filesystem.
+	//
+	// This field is initialized at creation time and is immutable.
+	ino uint64
+
+	// isTTY is true if this file represents a TTY.
+	//
+	// This field is initialized at creation time and is immutable.
+	isTTY bool
+
+	// seekable is false if the host fd points to a file representing a stream,
+	// e.g. a socket or a pipe. Such files are not seekable and can return
+	// EWOULDBLOCK for I/O operations.
+	//
+	// This field is initialized at creation time and is immutable.
+	seekable bool
+
+	// wouldBlock is true if the host FD would return EWOULDBLOCK for
+	// operations that would block.
+	//
+	// This field is initialized at creation time and is immutable.
+	wouldBlock bool
+
+	// Event queue for blocking operations.
+	queue waiter.Queue
+
+	// canMap specifies whether we allow the file to be memory mapped.
+	//
+	// This field is initialized at creation time and is immutable.
+	canMap bool
+
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex
+
+	// If canMap is true, mappings tracks mappings of hostFD into
+	// memmap.MappingSpaces.
+	mappings memmap.MappingSet
+
+	// pf implements platform.File for mappings of hostFD.
+	pf inodePlatformFile
+}
+
+// CheckPermissions implements kernfs.Inode.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
+		return err
+	}
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
+}
+
+// Mode implements kernfs.Inode.
+func (i *inode) Mode() linux.FileMode {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
+		// Retrieving the mode from the host fd using fstat(2) should not fail.
+		// If the syscall does not succeed, something is fundamentally wrong.
+		panic(fmt.Sprintf("failed to retrieve mode from host fd %d: %v", i.hostFD, err))
+	}
+	return linux.FileMode(s.Mode)
+}
+
+// Stat implements kernfs.Inode.
+func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	if opts.Mask&linux.STATX__RESERVED != 0 {
+		return linux.Statx{}, syserror.EINVAL
+	}
+	if opts.Sync&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
+		return linux.Statx{}, syserror.EINVAL
+	}
+
+	fs := vfsfs.Impl().(*filesystem)
+
+	// Limit our host call only to known flags.
+	mask := opts.Mask & linux.STATX_ALL
+	var s unix.Statx_t
+	err := unix.Statx(i.hostFD, "", int(unix.AT_EMPTY_PATH|opts.Sync), int(mask), &s)
+	if err == syserror.ENOSYS {
+		// Fallback to fstat(2), if statx(2) is not supported on the host.
+		//
+		// TODO(b/151263641): Remove fallback.
+		return i.fstat(fs)
+	}
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	// Unconditionally fill blksize, attributes, and device numbers, as
+	// indicated by /include/uapi/linux/stat.h. Inode number is always
+	// available, since we use our own rather than the host's.
+	ls := linux.Statx{
+		Mask:           linux.STATX_INO,
+		Blksize:        s.Blksize,
+		Attributes:     s.Attributes,
+		Ino:            i.ino,
+		AttributesMask: s.Attributes_mask,
+		DevMajor:       linux.UNNAMED_MAJOR,
+		DevMinor:       fs.devMinor,
+	}
+
+	// Copy other fields that were returned by the host. RdevMajor/RdevMinor
+	// are never copied (and therefore left as zero), so as not to expose host
+	// device numbers.
+	ls.Mask |= s.Mask & linux.STATX_ALL
+	if s.Mask&linux.STATX_TYPE != 0 {
+		ls.Mode |= s.Mode & linux.S_IFMT
+	}
+	if s.Mask&linux.STATX_MODE != 0 {
+		ls.Mode |= s.Mode &^ linux.S_IFMT
+	}
+	if s.Mask&linux.STATX_NLINK != 0 {
+		ls.Nlink = s.Nlink
+	}
+	if s.Mask&linux.STATX_UID != 0 {
+		ls.UID = s.Uid
+	}
+	if s.Mask&linux.STATX_GID != 0 {
+		ls.GID = s.Gid
+	}
+	if s.Mask&linux.STATX_ATIME != 0 {
+		ls.Atime = unixToLinuxStatxTimestamp(s.Atime)
+	}
+	if s.Mask&linux.STATX_BTIME != 0 {
+		ls.Btime = unixToLinuxStatxTimestamp(s.Btime)
+	}
+	if s.Mask&linux.STATX_CTIME != 0 {
+		ls.Ctime = unixToLinuxStatxTimestamp(s.Ctime)
+	}
+	if s.Mask&linux.STATX_MTIME != 0 {
+		ls.Mtime = unixToLinuxStatxTimestamp(s.Mtime)
+	}
+	if s.Mask&linux.STATX_SIZE != 0 {
+		ls.Size = s.Size
+	}
+	if s.Mask&linux.STATX_BLOCKS != 0 {
+		ls.Blocks = s.Blocks
+	}
+
+	return ls, nil
+}
+
+// fstat is a best-effort fallback for inode.Stat() if the host does not
+// support statx(2).
+//
+// We ignore the mask and sync flags in opts and simply supply
+// STATX_BASIC_STATS, as fstat(2) itself does not allow the specification
+// of a mask or sync flags. fstat(2) does not provide any metadata
+// equivalent to Statx.Attributes, Statx.AttributesMask, or Statx.Btime, so
+// those fields remain empty.
+func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
+	var s unix.Stat_t
+	if err := unix.Fstat(i.hostFD, &s); err != nil {
+		return linux.Statx{}, err
+	}
+
+	// As with inode.Stat(), we always use internal device and inode numbers,
+	// and never expose the host's represented device numbers.
+	return linux.Statx{
+		Mask:     linux.STATX_BASIC_STATS,
+		Blksize:  uint32(s.Blksize),
+		Nlink:    uint32(s.Nlink),
+		UID:      s.Uid,
+		GID:      s.Gid,
+		Mode:     uint16(s.Mode),
+		Ino:      i.ino,
+		Size:     uint64(s.Size),
+		Blocks:   uint64(s.Blocks),
+		Atime:    timespecToStatxTimestamp(s.Atim),
+		Ctime:    timespecToStatxTimestamp(s.Ctim),
+		Mtime:    timespecToStatxTimestamp(s.Mtim),
+		DevMajor: linux.UNNAMED_MAJOR,
+		DevMinor: fs.devMinor,
+	}, nil
+}
+
+// SetStat implements kernfs.Inode.
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	s := &opts.Stat
+
+	m := s.Mask
+	if m == 0 {
+		return nil
+	}
+	if m&^(linux.STATX_MODE|linux.STATX_SIZE|linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
+		return syserror.EPERM
+	}
+	var hostStat syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &hostStat); err != nil {
+		return err
+	}
+	if err := vfs.CheckSetStat(ctx, creds, &opts, linux.FileMode(hostStat.Mode), auth.KUID(hostStat.Uid), auth.KGID(hostStat.Gid)); err != nil {
+		return err
+	}
+
+	if m&linux.STATX_MODE != 0 {
+		if err := syscall.Fchmod(i.hostFD, uint32(s.Mode)); err != nil {
+			return err
+		}
+	}
+	if m&linux.STATX_SIZE != 0 {
+		if hostStat.Mode&linux.S_IFMT != linux.S_IFREG {
+			return syserror.EINVAL
+		}
+		if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
+			return err
+		}
+		oldSize := uint64(hostStat.Size)
+		if s.Size < oldSize {
+			oldpgend, _ := usermem.PageRoundUp(oldSize)
+			newpgend, _ := usermem.PageRoundUp(s.Size)
+			if oldpgend != newpgend {
+				i.mapsMu.Lock()
+				i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+					// Compare Linux's mm/truncate.c:truncate_setsize() =>
+					// truncate_pagecache() =>
+					// mm/memory.c:unmap_mapping_range(evencows=1).
+					InvalidatePrivate: true,
+				})
+				i.mapsMu.Unlock()
+			}
+		}
+	}
+	if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
+		ts := [2]syscall.Timespec{
+			toTimespec(s.Atime, m&linux.STATX_ATIME == 0),
+			toTimespec(s.Mtime, m&linux.STATX_MTIME == 0),
+		}
+		if err := setTimestamps(i.hostFD, &ts); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// DecRef implements kernfs.Inode.
+func (i *inode) DecRef(ctx context.Context) {
+	i.AtomicRefCount.DecRefWithDestructor(ctx, i.Destroy)
+}
+
+// Destroy implements kernfs.Inode.
+func (i *inode) Destroy(context.Context) {
+	if i.wouldBlock {
+		fdnotifier.RemoveFD(int32(i.hostFD))
+	}
+	if err := unix.Close(i.hostFD); err != nil {
+		log.Warningf("failed to close host fd %d: %v", i.hostFD, err)
+	}
+}
+
+// Open implements kernfs.Inode.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
+	if i.Mode().FileType() == linux.S_IFSOCK {
+		return nil, syserror.ENXIO
+	}
+	return i.open(ctx, vfsd, rp.Mount(), opts.Flags)
+}
+
+func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
+	var s syscall.Stat_t
+	if err := syscall.Fstat(i.hostFD, &s); err != nil {
+		return nil, err
+	}
+	fileType := s.Mode & linux.FileTypeMask
+
+	// Constrain flags to a subset we can handle.
+	//
+	// TODO(gvisor.dev/issue/2601): Support O_NONBLOCK by adding RWF_NOWAIT to pread/pwrite calls.
+	flags &= syscall.O_ACCMODE | syscall.O_NONBLOCK | syscall.O_DSYNC | syscall.O_SYNC | syscall.O_APPEND
+
+	switch fileType {
+	case syscall.S_IFSOCK:
+		if i.isTTY {
+			log.Warningf("cannot use host socket fd %d as TTY", i.hostFD)
+			return nil, syserror.ENOTTY
+		}
+
+		ep, err := newEndpoint(ctx, i.hostFD, &i.queue)
+		if err != nil {
+			return nil, err
+		}
+		// Currently, we only allow Unix sockets to be imported.
+		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
+
+	case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
+		if i.isTTY {
+			fd := &TTYFileDescription{
+				fileDescription: fileDescription{inode: i},
+				termios:         linux.DefaultSlaveTermios,
+			}
+			fd.LockFD.Init(&i.locks)
+			vfsfd := &fd.vfsfd
+			if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+				return nil, err
+			}
+			return vfsfd, nil
+		}
+
+		fd := &fileDescription{inode: i}
+		fd.LockFD.Init(&i.locks)
+		vfsfd := &fd.vfsfd
+		if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+			return nil, err
+		}
+		return vfsfd, nil
+
+	default:
+		log.Warningf("cannot import host fd %d with file type %o", i.hostFD, fileType)
+		return nil, syserror.EPERM
+	}
+}
+
+// fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	// inode is vfsfd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode), but
+	// cached to reduce indirections and casting. fileDescription does not hold
+	// a reference on the inode through the inode field (since one is already
+	// held via the Dentry).
+	//
+	// inode is immutable after fileDescription creation.
+	inode *inode
+
+	// offsetMu protects offset.
+	offsetMu sync.Mutex
+
+	// offset specifies the current file offset. It is only meaningful when
+	// inode.seekable is true.
+	offset int64
+}
+
+// SetStat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Release(context.Context) {
+	// noop
+}
+
+// Allocate implements vfs.FileDescriptionImpl.
+func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	if !f.inode.seekable {
+		return syserror.ESPIPE
+	}
+
+	// TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds.
+	return syserror.EOPNOTSUPP
+}
+
+// PRead implements FileDescriptionImpl.
+func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		return 0, syserror.ESPIPE
+	}
+
+	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
+}
+
+// Read implements FileDescriptionImpl.
+func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
+		if isBlockError(err) {
+			// If we got any data at all, return it as a "completed" partial read
+			// rather than retrying until complete.
+			if n != 0 {
+				err = nil
+			} else {
+				err = syserror.ErrWouldBlock
+			}
+		}
+		return n, err
+	}
+
+	f.offsetMu.Lock()
+	n, err := readFromHostFD(ctx, i.hostFD, dst, f.offset, opts.Flags)
+	f.offset += n
+	f.offsetMu.Unlock()
+	return n, err
+}
+
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
+	n, err := dst.CopyOutFrom(ctx, reader)
+	hostfd.PutReadWriterAt(reader)
+	return int64(n), err
+}
+
+// PWrite implements FileDescriptionImpl.
+func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !f.inode.seekable {
+		return 0, syserror.ESPIPE
+	}
+
+	return f.writeToHostFD(ctx, src, offset, opts.Flags)
+}
+
+// Write implements FileDescriptionImpl.
+func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		n, err := f.writeToHostFD(ctx, src, -1, opts.Flags)
+		if isBlockError(err) {
+			err = syserror.ErrWouldBlock
+		}
+		return n, err
+	}
+
+	f.offsetMu.Lock()
+	// NOTE(gvisor.dev/issue/2983): O_APPEND may cause memory corruption if
+	// another process modifies the host file between retrieving the file size
+	// and writing to the host fd. This is an unavoidable race condition because
+	// we cannot enforce synchronization on the host.
+	if f.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		var s syscall.Stat_t
+		if err := syscall.Fstat(i.hostFD, &s); err != nil {
+			f.offsetMu.Unlock()
+			return 0, err
+		}
+		f.offset = s.Size
+	}
+	n, err := f.writeToHostFD(ctx, src, f.offset, opts.Flags)
+	f.offset += n
+	f.offsetMu.Unlock()
+	return n, err
+}
+
+func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSequence, offset int64, flags uint32) (int64, error) {
+	hostFD := f.inode.hostFD
+	// TODO(gvisor.dev/issue/2601): Support select pwritev2 flags.
+	if flags != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+	writer := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
+	n, err := src.CopyInTo(ctx, writer)
+	hostfd.PutReadWriterAt(writer)
+	// NOTE(gvisor.dev/issue/2979): We always sync everything, even for O_DSYNC.
+	if n > 0 && f.vfsfd.StatusFlags()&(linux.O_DSYNC|linux.O_SYNC) != 0 {
+		if syncErr := unix.Fsync(hostFD); syncErr != nil {
+			return int64(n), syncErr
+		}
+	}
+	return int64(n), err
+}
+
+// Seek implements FileDescriptionImpl.
+//
+// Note that we do not support seeking on directories, since we do not even
+// allow directory fds to be imported at all.
+func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (int64, error) {
+	i := f.inode
+	if !i.seekable {
+		return 0, syserror.ESPIPE
+	}
+
+	f.offsetMu.Lock()
+	defer f.offsetMu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = offset
+
+	case linux.SEEK_CUR:
+		// Check for overflow. Note that underflow cannot occur, since f.offset >= 0.
+		if offset > math.MaxInt64-f.offset {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if f.offset+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset += offset
+
+	case linux.SEEK_END:
+		var s syscall.Stat_t
+		if err := syscall.Fstat(i.hostFD, &s); err != nil {
+			return f.offset, err
+		}
+		size := s.Size
+
+		// Check for overflow. Note that underflow cannot occur, since size >= 0.
+		if offset > math.MaxInt64-size {
+			return f.offset, syserror.EOVERFLOW
+		}
+		if size+offset < 0 {
+			return f.offset, syserror.EINVAL
+		}
+		f.offset = size + offset
+
+	case linux.SEEK_DATA, linux.SEEK_HOLE:
+		// Modifying the offset in the host file table should not matter, since
+		// this is the only place where we use it.
+		//
+		// For reading and writing, we always rely on our internal offset.
+		n, err := unix.Seek(i.hostFD, offset, int(whence))
+		if err != nil {
+			return f.offset, err
+		}
+		f.offset = n
+
+	default:
+		// Invalid whence.
+		return f.offset, syserror.EINVAL
+	}
+
+	return f.offset, nil
+}
+
+// Sync implements FileDescriptionImpl.
+func (f *fileDescription) Sync(context.Context) error {
+	// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
+	return unix.Fsync(f.inode.hostFD)
+}
+
+// ConfigureMMap implements FileDescriptionImpl.
+func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
+	if !f.inode.canMap {
+		return syserror.ENODEV
+	}
+	i := f.inode
+	i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init)
+	return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	f.inode.queue.EventRegister(e, mask)
+	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (f *fileDescription) EventUnregister(e *waiter.Entry) {
+	f.inode.queue.EventUnregister(e)
+	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+}
+
+// Readiness uses the poll() syscall to check the status of the underlying FD.
+func (f *fileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return fdnotifier.NonBlockingPoll(int32(f.inode.hostFD), mask)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (f *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return f.Locks().LockPOSIX(ctx, &f.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (f *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return f.Locks().UnlockPOSIX(ctx, &f.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/ioctl_unsafe.go b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
new file mode 100644
index 000000000..0983bf7d8
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/ioctl_unsafe.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+)
+
+func ioctlGetTermios(fd int) (*linux.Termios, error) {
+	var t linux.Termios
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TCGETS, uintptr(unsafe.Pointer(&t)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &t, nil
+}
+
+func ioctlSetTermios(fd int, req uint64, t *linux.Termios) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(unsafe.Pointer(t)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
+
+func ioctlGetWinsize(fd int) (*linux.Winsize, error) {
+	var w linux.Winsize
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCGWINSZ, uintptr(unsafe.Pointer(&w)))
+	if errno != 0 {
+		return nil, errno
+	}
+	return &w, nil
+}
+
+func ioctlSetWinsize(fd int, w *linux.Winsize) error {
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, uintptr(fd), linux.TIOCSWINSZ, uintptr(unsafe.Pointer(w)))
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
new file mode 100644
index 000000000..65d3af38c
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -0,0 +1,131 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// inodePlatformFile implements memmap.File. It exists solely because inode
+// cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
+//
+// inodePlatformFile should only be used if inode.canMap is true.
+type inodePlatformFile struct {
+	*inode
+
+	// fdRefsMu protects fdRefs.
+	fdRefsMu sync.Mutex
+
+	// fdRefs counts references on memmap.File offsets. It is used solely for
+	// memory accounting.
+	fdRefs fsutil.FrameRefSet
+
+	// fileMapper caches mappings of the host file represented by this inode.
+	fileMapper fsutil.HostFileMapper
+
+	// fileMapperInitOnce is used to lazily initialize fileMapper.
+	fileMapperInitOnce sync.Once
+}
+
+// IncRef implements memmap.File.IncRef.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) IncRef(fr memmap.FileRange) {
+	i.fdRefsMu.Lock()
+	i.fdRefs.IncRefAndAccount(fr)
+	i.fdRefsMu.Unlock()
+}
+
+// DecRef implements memmap.File.DecRef.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) DecRef(fr memmap.FileRange) {
+	i.fdRefsMu.Lock()
+	i.fdRefs.DecRefAndAccount(fr)
+	i.fdRefsMu.Unlock()
+}
+
+// MapInternal implements memmap.File.MapInternal.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inodePlatformFile) MapInternal(fr memmap.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
+	return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
+}
+
+// FD implements memmap.File.FD.
+func (i *inodePlatformFile) FD() int {
+	return i.hostFD
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	i.mapsMu.Lock()
+	mapped := i.mappings.AddMapping(ms, ar, offset, writable)
+	for _, r := range mapped {
+		i.pf.fileMapper.IncRefOn(r)
+	}
+	i.mapsMu.Unlock()
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	i.mapsMu.Lock()
+	unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable)
+	for _, r := range unmapped {
+		i.pf.fileMapper.DecRefOn(r)
+	}
+	i.mapsMu.Unlock()
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return i.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	mr := optional
+	return []memmap.Translation{
+		{
+			Source: mr,
+			File:   &i.pf,
+			Offset: mr.Start,
+			Perms:  usermem.AnyAccess,
+		},
+	}, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+//
+// Precondition: i.inode.canMap must be true.
+func (i *inode) InvalidateUnsavable(ctx context.Context) error {
+	// We expect the same host fd across save/restore, so all translations
+	// should be valid.
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
new file mode 100644
index 000000000..4979dd0a9
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -0,0 +1,385 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/socket/control"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserr"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip"
+	"gvisor.dev/gvisor/pkg/unet"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// Create a new host-backed endpoint from the given fd and its corresponding
+// notification queue.
+func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transport.Endpoint, error) {
+	// Set up an external transport.Endpoint using the host fd.
+	addr := fmt.Sprintf("hostfd:[%d]", hostFD)
+	e, err := NewConnectedEndpoint(ctx, hostFD, addr, true /* saveable */)
+	if err != nil {
+		return nil, err.ToError()
+	}
+	ep := transport.NewExternal(ctx, e.stype, uniqueid.GlobalProviderFromContext(ctx), queue, e, e)
+	return ep, nil
+}
+
+// ConnectedEndpoint is an implementation of transport.ConnectedEndpoint and
+// transport.Receiver. It is backed by a host fd that was imported at sentry
+// startup. This fd is shared with a hostfs inode, which retains ownership of
+// it.
+//
+// ConnectedEndpoint is saveable, since we expect that the host will provide
+// the same fd upon restore.
+//
+// As of this writing, we only allow Unix sockets to be imported.
+//
+// +stateify savable
+type ConnectedEndpoint struct {
+	// ref keeps track of references to a ConnectedEndpoint.
+	ref refs.AtomicRefCount
+
+	// mu protects fd below.
+	mu sync.RWMutex `state:"nosave"`
+
+	// fd is the host fd backing this endpoint.
+	fd int
+
+	// addr is the address at which this endpoint is bound.
+	addr string
+
+	// sndbuf is the size of the send buffer.
+	//
+	// N.B. When this is smaller than the host size, we present it via
+	// GetSockOpt and message splitting/rejection in SendMsg, but do not
+	// prevent lots of small messages from filling the real send buffer
+	// size on the host.
+	sndbuf int64 `state:"nosave"`
+
+	// stype is the type of Unix socket.
+	stype linux.SockType
+}
+
+// init performs initialization required for creating new ConnectedEndpoints and
+// for restoring them.
+func (c *ConnectedEndpoint) init() *syserr.Error {
+	family, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_DOMAIN)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+
+	if family != syscall.AF_UNIX {
+		// We only allow Unix sockets.
+		return syserr.ErrInvalidEndpointState
+	}
+
+	stype, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_TYPE)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+
+	if err := syscall.SetNonblock(c.fd, true); err != nil {
+		return syserr.FromError(err)
+	}
+
+	sndbuf, err := syscall.GetsockoptInt(c.fd, syscall.SOL_SOCKET, syscall.SO_SNDBUF)
+	if err != nil {
+		return syserr.FromError(err)
+	}
+
+	c.stype = linux.SockType(stype)
+	c.sndbuf = int64(sndbuf)
+
+	return nil
+}
+
+// NewConnectedEndpoint creates a new ConnectedEndpoint backed by a host fd
+// imported at sentry startup,
+//
+// The caller is responsible for calling Init(). Additionaly, Release needs to
+// be called twice because ConnectedEndpoint is both a transport.Receiver and
+// transport.ConnectedEndpoint.
+func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable bool) (*ConnectedEndpoint, *syserr.Error) {
+	e := ConnectedEndpoint{
+		fd:   hostFD,
+		addr: addr,
+	}
+
+	if err := e.init(); err != nil {
+		return nil, err
+	}
+
+	// AtomicRefCounters start off with a single reference. We need two.
+	e.ref.IncRef()
+	e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+	return &e, nil
+}
+
+// Send implements transport.ConnectedEndpoint.Send.
+func (c *ConnectedEndpoint) Send(ctx context.Context, data [][]byte, controlMessages transport.ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	if !controlMessages.Empty() {
+		return 0, false, syserr.ErrInvalidEndpointState
+	}
+
+	// Since stream sockets don't preserve message boundaries, we can write
+	// only as much of the message as fits in the send buffer.
+	truncate := c.stype == linux.SOCK_STREAM
+
+	n, totalLen, err := fdWriteVec(c.fd, data, c.sndbuf, truncate)
+	if n < totalLen && err == nil {
+		// The host only returns a short write if it would otherwise
+		// block (and only for stream sockets).
+		err = syserror.EAGAIN
+	}
+	if n > 0 && err != syserror.EAGAIN {
+		// The caller may need to block to send more data, but
+		// otherwise there isn't anything that can be done about an
+		// error with a partial write.
+		err = nil
+	}
+
+	// There is no need for the callee to call SendNotify because fdWriteVec
+	// uses the host's sendmsg(2) and the host kernel's queue.
+	return n, false, syserr.FromError(err)
+}
+
+// SendNotify implements transport.ConnectedEndpoint.SendNotify.
+func (c *ConnectedEndpoint) SendNotify() {}
+
+// CloseSend implements transport.ConnectedEndpoint.CloseSend.
+func (c *ConnectedEndpoint) CloseSend() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if err := syscall.Shutdown(c.fd, syscall.SHUT_WR); err != nil {
+		// A well-formed UDS shutdown can't fail. See
+		// net/unix/af_unix.c:unix_shutdown.
+		panic(fmt.Sprintf("failed write shutdown on host socket %+v: %v", c, err))
+	}
+}
+
+// CloseNotify implements transport.ConnectedEndpoint.CloseNotify.
+func (c *ConnectedEndpoint) CloseNotify() {}
+
+// Writable implements transport.ConnectedEndpoint.Writable.
+func (c *ConnectedEndpoint) Writable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventOut)&waiter.EventOut != 0
+}
+
+// Passcred implements transport.ConnectedEndpoint.Passcred.
+func (c *ConnectedEndpoint) Passcred() bool {
+	// We don't support credential passing for host sockets.
+	return false
+}
+
+// GetLocalAddress implements transport.ConnectedEndpoint.GetLocalAddress.
+func (c *ConnectedEndpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
+	return tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, nil
+}
+
+// EventUpdate implements transport.ConnectedEndpoint.EventUpdate.
+func (c *ConnectedEndpoint) EventUpdate() {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	if c.fd != -1 {
+		fdnotifier.UpdateFD(int32(c.fd))
+	}
+}
+
+// Recv implements transport.Receiver.Recv.
+func (c *ConnectedEndpoint) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, transport.ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	var cm unet.ControlMessage
+	if numRights > 0 {
+		cm.EnableFDs(int(numRights))
+	}
+
+	// N.B. Unix sockets don't have a receive buffer, the send buffer
+	// serves both purposes.
+	rl, ml, cl, cTrunc, err := fdReadVec(c.fd, data, []byte(cm), peek, c.sndbuf)
+	if rl > 0 && err != nil {
+		// We got some data, so all we need to do on error is return
+		// the data that we got. Short reads are fine, no need to
+		// block.
+		err = nil
+	}
+	if err != nil {
+		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+	}
+
+	// There is no need for the callee to call RecvNotify because fdReadVec uses
+	// the host's recvmsg(2) and the host kernel's queue.
+
+	// Trim the control data if we received less than the full amount.
+	if cl < uint64(len(cm)) {
+		cm = cm[:cl]
+	}
+
+	// Avoid extra allocations in the case where there isn't any control data.
+	if len(cm) == 0 {
+		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
+	}
+
+	fds, err := cm.ExtractFDs()
+	if err != nil {
+		return 0, 0, transport.ControlMessages{}, false, tcpip.FullAddress{}, false, syserr.FromError(err)
+	}
+
+	if len(fds) == 0 {
+		return rl, ml, transport.ControlMessages{}, cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
+	}
+	return rl, ml, control.NewVFS2(nil, nil, newSCMRights(fds)), cTrunc, tcpip.FullAddress{Addr: tcpip.Address(c.addr)}, false, nil
+}
+
+// RecvNotify implements transport.Receiver.RecvNotify.
+func (c *ConnectedEndpoint) RecvNotify() {}
+
+// CloseRecv implements transport.Receiver.CloseRecv.
+func (c *ConnectedEndpoint) CloseRecv() {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if err := syscall.Shutdown(c.fd, syscall.SHUT_RD); err != nil {
+		// A well-formed UDS shutdown can't fail. See
+		// net/unix/af_unix.c:unix_shutdown.
+		panic(fmt.Sprintf("failed read shutdown on host socket %+v: %v", c, err))
+	}
+}
+
+// Readable implements transport.Receiver.Readable.
+func (c *ConnectedEndpoint) Readable() bool {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	return fdnotifier.NonBlockingPoll(int32(c.fd), waiter.EventIn)&waiter.EventIn != 0
+}
+
+// SendQueuedSize implements transport.Receiver.SendQueuedSize.
+func (c *ConnectedEndpoint) SendQueuedSize() int64 {
+	// TODO(gvisor.dev/issue/273): SendQueuedSize isn't supported for host
+	// sockets because we don't allow the sentry to call ioctl(2).
+	return -1
+}
+
+// RecvQueuedSize implements transport.Receiver.RecvQueuedSize.
+func (c *ConnectedEndpoint) RecvQueuedSize() int64 {
+	// TODO(gvisor.dev/issue/273): RecvQueuedSize isn't supported for host
+	// sockets because we don't allow the sentry to call ioctl(2).
+	return -1
+}
+
+// SendMaxQueueSize implements transport.Receiver.SendMaxQueueSize.
+func (c *ConnectedEndpoint) SendMaxQueueSize() int64 {
+	return int64(c.sndbuf)
+}
+
+// RecvMaxQueueSize implements transport.Receiver.RecvMaxQueueSize.
+func (c *ConnectedEndpoint) RecvMaxQueueSize() int64 {
+	// N.B. Unix sockets don't use the receive buffer. We'll claim it is
+	// the same size as the send buffer.
+	return int64(c.sndbuf)
+}
+
+func (c *ConnectedEndpoint) destroyLocked() {
+	c.fd = -1
+}
+
+// Release implements transport.ConnectedEndpoint.Release and
+// transport.Receiver.Release.
+func (c *ConnectedEndpoint) Release(ctx context.Context) {
+	c.ref.DecRefWithDestructor(ctx, func(context.Context) {
+		c.mu.Lock()
+		c.destroyLocked()
+		c.mu.Unlock()
+	})
+}
+
+// CloseUnread implements transport.ConnectedEndpoint.CloseUnread.
+func (c *ConnectedEndpoint) CloseUnread() {}
+
+// SCMConnectedEndpoint represents an endpoint backed by a host fd that was
+// passed through a gofer Unix socket. It resembles ConnectedEndpoint, with the
+// following differences:
+// - SCMConnectedEndpoint is not saveable, because the host cannot guarantee
+// the same descriptor number across S/R.
+// - SCMConnectedEndpoint holds ownership of its fd and notification queue.
+type SCMConnectedEndpoint struct {
+	ConnectedEndpoint
+
+	queue *waiter.Queue
+}
+
+// Init will do the initialization required without holding other locks.
+func (e *SCMConnectedEndpoint) Init() error {
+	return fdnotifier.AddFD(int32(e.fd), e.queue)
+}
+
+// Release implements transport.ConnectedEndpoint.Release and
+// transport.Receiver.Release.
+func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
+	e.ref.DecRefWithDestructor(ctx, func(context.Context) {
+		e.mu.Lock()
+		if err := syscall.Close(e.fd); err != nil {
+			log.Warningf("Failed to close host fd %d: %v", err)
+		}
+		fdnotifier.RemoveFD(int32(e.fd))
+		e.destroyLocked()
+		e.mu.Unlock()
+	})
+}
+
+// NewSCMEndpoint creates a new SCMConnectedEndpoint backed by a host fd that
+// was passed through a Unix socket.
+//
+// The caller is responsible for calling Init(). Additionaly, Release needs to
+// be called twice because ConnectedEndpoint is both a transport.Receiver and
+// transport.ConnectedEndpoint.
+func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr string) (*SCMConnectedEndpoint, *syserr.Error) {
+	e := SCMConnectedEndpoint{
+		ConnectedEndpoint: ConnectedEndpoint{
+			fd:   hostFD,
+			addr: addr,
+		},
+		queue: queue,
+	}
+
+	if err := e.init(); err != nil {
+		return nil, err
+	}
+
+	// AtomicRefCounters start off with a single reference. We need two.
+	e.ref.IncRef()
+	e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+	return &e, nil
+}
diff --git a/pkg/sentry/fsimpl/host/socket_iovec.go b/pkg/sentry/fsimpl/host/socket_iovec.go
new file mode 100644
index 000000000..fc0d5fd38
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/socket_iovec.go
@@ -0,0 +1,110 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/iovec"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// copyToMulti copies as many bytes from src to dst as possible.
+func copyToMulti(dst [][]byte, src []byte) {
+	for _, d := range dst {
+		done := copy(d, src)
+		src = src[done:]
+		if len(src) == 0 {
+			break
+		}
+	}
+}
+
+// copyFromMulti copies as many bytes from src to dst as possible.
+func copyFromMulti(dst []byte, src [][]byte) {
+	for _, s := range src {
+		done := copy(dst, s)
+		dst = dst[done:]
+		if len(dst) == 0 {
+			break
+		}
+	}
+}
+
+// buildIovec builds an iovec slice from the given []byte slice.
+//
+// If truncate, truncate bufs > maxlen. Otherwise, immediately return an error.
+//
+// If length < the total length of bufs, err indicates why, even when returning
+// a truncated iovec.
+//
+// If intermediate != nil, iovecs references intermediate rather than bufs and
+// the caller must copy to/from bufs as necessary.
+func buildIovec(bufs [][]byte, maxlen int64, truncate bool) (length int64, iovecs []syscall.Iovec, intermediate []byte, err error) {
+	var iovsRequired int
+	for _, b := range bufs {
+		length += int64(len(b))
+		if len(b) > 0 {
+			iovsRequired++
+		}
+	}
+
+	stopLen := length
+	if length > maxlen {
+		if truncate {
+			stopLen = maxlen
+			err = syserror.EAGAIN
+		} else {
+			return 0, nil, nil, syserror.EMSGSIZE
+		}
+	}
+
+	if iovsRequired > iovec.MaxIovs {
+		// The kernel will reject our call if we pass this many iovs.
+		// Use a single intermediate buffer instead.
+		b := make([]byte, stopLen)
+
+		return stopLen, []syscall.Iovec{{
+			Base: &b[0],
+			Len:  uint64(stopLen),
+		}}, b, err
+	}
+
+	var total int64
+	iovecs = make([]syscall.Iovec, 0, iovsRequired)
+	for i := range bufs {
+		l := len(bufs[i])
+		if l == 0 {
+			continue
+		}
+
+		stop := int64(l)
+		if total+stop > stopLen {
+			stop = stopLen - total
+		}
+
+		iovecs = append(iovecs, syscall.Iovec{
+			Base: &bufs[i][0],
+			Len:  uint64(stop),
+		})
+
+		total += stop
+		if total >= stopLen {
+			break
+		}
+	}
+
+	return total, iovecs, nil, err
+}
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
new file mode 100644
index 000000000..35ded24bc
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -0,0 +1,101 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+// fdReadVec receives from fd to bufs.
+//
+// If the total length of bufs is > maxlen, fdReadVec will do a partial read
+// and err will indicate why the message was truncated.
+func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (readLen int64, msgLen int64, controlLen uint64, controlTrunc bool, err error) {
+	flags := uintptr(syscall.MSG_DONTWAIT | syscall.MSG_TRUNC)
+	if peek {
+		flags |= syscall.MSG_PEEK
+	}
+
+	// Always truncate the receive buffer. All socket types will truncate
+	// received messages.
+	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, true)
+	if err != nil && len(iovecs) == 0 {
+		// No partial write to do, return error immediately.
+		return 0, 0, 0, false, err
+	}
+
+	var msg syscall.Msghdr
+	if len(control) != 0 {
+		msg.Control = &control[0]
+		msg.Controllen = uint64(len(control))
+	}
+
+	if len(iovecs) != 0 {
+		msg.Iov = &iovecs[0]
+		msg.Iovlen = uint64(len(iovecs))
+	}
+
+	rawN, _, e := syscall.RawSyscall(syscall.SYS_RECVMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), flags)
+	if e != 0 {
+		// N.B. prioritize the syscall error over the buildIovec error.
+		return 0, 0, 0, false, e
+	}
+	n := int64(rawN)
+
+	// Copy data back to bufs.
+	if intermediate != nil {
+		copyToMulti(bufs, intermediate)
+	}
+
+	controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
+
+	if n > length {
+		return length, n, msg.Controllen, controlTrunc, err
+	}
+
+	return n, n, msg.Controllen, controlTrunc, err
+}
+
+// fdWriteVec sends from bufs to fd.
+//
+// If the total length of bufs is > maxlen && truncate, fdWriteVec will do a
+// partial write and err will indicate why the message was truncated.
+func fdWriteVec(fd int, bufs [][]byte, maxlen int64, truncate bool) (int64, int64, error) {
+	length, iovecs, intermediate, err := buildIovec(bufs, maxlen, truncate)
+	if err != nil && len(iovecs) == 0 {
+		// No partial write to do, return error immediately.
+		return 0, length, err
+	}
+
+	// Copy data to intermediate buf.
+	if intermediate != nil {
+		copyFromMulti(intermediate, bufs)
+	}
+
+	var msg syscall.Msghdr
+	if len(iovecs) > 0 {
+		msg.Iov = &iovecs[0]
+		msg.Iovlen = uint64(len(iovecs))
+	}
+
+	n, _, e := syscall.RawSyscall(syscall.SYS_SENDMSG, uintptr(fd), uintptr(unsafe.Pointer(&msg)), syscall.MSG_DONTWAIT|syscall.MSG_NOSIGNAL)
+	if e != 0 {
+		// N.B. prioritize the syscall error over the buildIovec error.
+		return 0, length, e
+	}
+
+	return int64(n), length, err
+}
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
new file mode 100644
index 000000000..d372c60cb
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -0,0 +1,390 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/unimpl"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// TTYFileDescription implements vfs.FileDescriptionImpl for a host file
+// descriptor that wraps a TTY FD.
+type TTYFileDescription struct {
+	fileDescription
+
+	// mu protects the fields below.
+	mu sync.Mutex `state:"nosave"`
+
+	// session is the session attached to this TTYFileDescription.
+	session *kernel.Session
+
+	// fgProcessGroup is the foreground process group that is currently
+	// connected to this TTY.
+	fgProcessGroup *kernel.ProcessGroup
+
+	// termios contains the terminal attributes for this TTY.
+	termios linux.KernelTermios
+}
+
+// InitForegroundProcessGroup sets the foreground process group and session for
+// the TTY. This should only be called once, after the foreground process group
+// has been created, but before it has started running.
+func (t *TTYFileDescription) InitForegroundProcessGroup(pg *kernel.ProcessGroup) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.fgProcessGroup != nil {
+		panic("foreground process group is already set")
+	}
+	t.fgProcessGroup = pg
+	t.session = pg.Session()
+}
+
+// ForegroundProcessGroup returns the foreground process for the TTY.
+func (t *TTYFileDescription) ForegroundProcessGroup() *kernel.ProcessGroup {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.fgProcessGroup
+}
+
+// Release implements fs.FileOperations.Release.
+func (t *TTYFileDescription) Release(ctx context.Context) {
+	t.mu.Lock()
+	t.fgProcessGroup = nil
+	t.mu.Unlock()
+
+	t.fileDescription.Release(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.
+//
+// Reading from a TTY is only allowed for foreground process groups. Background
+// process groups will either get EIO or a SIGTTIN.
+func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Are we allowed to do the read?
+	// drivers/tty/n_tty.c:n_tty_read()=>job_control()=>tty_check_change().
+	if err := t.checkChange(ctx, linux.SIGTTIN); err != nil {
+		return 0, err
+	}
+
+	// Do the read.
+	return t.fileDescription.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.
+func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.
+func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// Check whether TOSTOP is enabled. This corresponds to the check in
+	// drivers/tty/n_tty.c:n_tty_write().
+	if t.termios.LEnabled(linux.TOSTOP) {
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+	}
+	return t.fileDescription.Write(ctx, src, opts)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.
+func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	// Ignore arg[0]. This is the real FD:
+	fd := t.inode.hostFD
+	ioctl := args[1].Uint64()
+	switch ioctl {
+	case linux.TCGETS:
+		termios, err := ioctlGetTermios(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			return 0, err
+		}
+
+		var termios linux.Termios
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetTermios(fd, ioctl, &termios)
+		if err == nil {
+			t.termios.FromTermios(termios)
+		}
+		return 0, err
+
+	case linux.TIOCGPGRP:
+		// Args: pid_t *argp
+		// When successful, equivalent to *argp = tcgetpgrp(fd).
+		// Get the process group ID of the foreground process group on this
+		// terminal.
+
+		pidns := kernel.PIDNamespaceFromContext(ctx)
+		if pidns == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
+		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
+		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSPGRP:
+		// Args: const pid_t *argp
+		// Equivalent to tcsetpgrp(fd, *argp).
+		// Set the foreground process group ID of this terminal.
+
+		task := kernel.TaskFromContext(ctx)
+		if task == nil {
+			return 0, syserror.ENOTTY
+		}
+
+		t.mu.Lock()
+		defer t.mu.Unlock()
+
+		// Check that we are allowed to set the process group.
+		if err := t.checkChange(ctx, linux.SIGTTOU); err != nil {
+			// drivers/tty/tty_io.c:tiocspgrp() converts -EIO from tty_check_change()
+			// to -ENOTTY.
+			if err == syserror.EIO {
+				return 0, syserror.ENOTTY
+			}
+			return 0, err
+		}
+
+		// Check that calling task's process group is in the TTY session.
+		if task.ThreadGroup().Session() != t.session {
+			return 0, syserror.ENOTTY
+		}
+
+		var pgID kernel.ProcessGroupID
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+
+		// pgID must be non-negative.
+		if pgID < 0 {
+			return 0, syserror.EINVAL
+		}
+
+		// Process group with pgID must exist in this PID namespace.
+		pidns := task.PIDNamespace()
+		pg := pidns.ProcessGroupWithID(pgID)
+		if pg == nil {
+			return 0, syserror.ESRCH
+		}
+
+		// Check that new process group is in the TTY session.
+		if pg.Session() != t.session {
+			return 0, syserror.EPERM
+		}
+
+		t.fgProcessGroup = pg
+		return 0, nil
+
+	case linux.TIOCGWINSZ:
+		// Args: struct winsize *argp
+		// Get window size.
+		winsize, err := ioctlGetWinsize(fd)
+		if err != nil {
+			return 0, err
+		}
+		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		})
+		return 0, err
+
+	case linux.TIOCSWINSZ:
+		// Args: const struct winsize *argp
+		// Set window size.
+
+		// Unlike setting the termios, any process group (even background ones) can
+		// set the winsize.
+
+		var winsize linux.Winsize
+		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
+			AddressSpaceActive: true,
+		}); err != nil {
+			return 0, err
+		}
+		err := ioctlSetWinsize(fd, &winsize)
+		return 0, err
+
+	// Unimplemented commands.
+	case linux.TIOCSETD,
+		linux.TIOCSBRK,
+		linux.TIOCCBRK,
+		linux.TCSBRK,
+		linux.TCSBRKP,
+		linux.TIOCSTI,
+		linux.TIOCCONS,
+		linux.FIONBIO,
+		linux.TIOCEXCL,
+		linux.TIOCNXCL,
+		linux.TIOCGEXCL,
+		linux.TIOCNOTTY,
+		linux.TIOCSCTTY,
+		linux.TIOCGSID,
+		linux.TIOCGETD,
+		linux.TIOCVHANGUP,
+		linux.TIOCGDEV,
+		linux.TIOCMGET,
+		linux.TIOCMSET,
+		linux.TIOCMBIC,
+		linux.TIOCMBIS,
+		linux.TIOCGICOUNT,
+		linux.TCFLSH,
+		linux.TIOCSSERIAL,
+		linux.TIOCGPTPEER:
+
+		unimpl.EmitUnimplementedEvent(ctx)
+		fallthrough
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// checkChange checks that the process group is allowed to read, write, or
+// change the state of the TTY.
+//
+// This corresponds to Linux drivers/tty/tty_io.c:tty_check_change(). The logic
+// is a bit convoluted, but documented inline.
+//
+// Preconditions: t.mu must be held.
+func (t *TTYFileDescription) checkChange(ctx context.Context, sig linux.Signal) error {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		// No task? Linux does not have an analog for this case, but
+		// tty_check_change only blocks specific cases and is
+		// surprisingly permissive. Allowing the change seems
+		// appropriate.
+		return nil
+	}
+
+	tg := task.ThreadGroup()
+	pg := tg.ProcessGroup()
+
+	// If the session for the task is different than the session for the
+	// controlling TTY, then the change is allowed. Seems like a bad idea,
+	// but that's exactly what linux does.
+	if tg.Session() != t.fgProcessGroup.Session() {
+		return nil
+	}
+
+	// If we are the foreground process group, then the change is allowed.
+	if pg == t.fgProcessGroup {
+		return nil
+	}
+
+	// We are not the foreground process group.
+
+	// Is the provided signal blocked or ignored?
+	if (task.SignalMask()&linux.SignalSetOf(sig) != 0) || tg.SignalHandlers().IsIgnored(sig) {
+		// If the signal is SIGTTIN, then we are attempting to read
+		// from the TTY. Don't send the signal and return EIO.
+		if sig == linux.SIGTTIN {
+			return syserror.EIO
+		}
+
+		// Otherwise, we are writing or changing terminal state. This is allowed.
+		return nil
+	}
+
+	// If the process group is an orphan, return EIO.
+	if pg.IsOrphan() {
+		return syserror.EIO
+	}
+
+	// Otherwise, send the signal to the process group and return ERESTARTSYS.
+	//
+	// Note that Linux also unconditionally sets TIF_SIGPENDING on current,
+	// but this isn't necessary in gVisor because the rationale given in
+	// 040b6362d58f "tty: fix leakage of -ERESTARTSYS to userland" doesn't
+	// apply: the sentry will handle -ERESTARTSYS in
+	// kernel.runApp.execute() even if the kernel.Task isn't interrupted.
+	//
+	// Linux ignores the result of kill_pgrp().
+	_ = pg.SendSignal(kernel.SignalInfoPriv(sig))
+	return kernel.ERESTARTSYS
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (t *TTYFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, typ fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return t.Locks().LockPOSIX(ctx, &t.vfsfd, uid, typ, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (t *TTYFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return t.Locks().UnlockPOSIX(ctx, &t.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
new file mode 100644
index 000000000..412bdb2eb
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func toTimespec(ts linux.StatxTimestamp, omit bool) syscall.Timespec {
+	if omit {
+		return syscall.Timespec{
+			Sec:  0,
+			Nsec: unix.UTIME_OMIT,
+		}
+	}
+	return syscall.Timespec{
+		Sec:  ts.Sec,
+		Nsec: int64(ts.Nsec),
+	}
+}
+
+func unixToLinuxStatxTimestamp(ts unix.StatxTimestamp) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: ts.Sec, Nsec: ts.Nsec}
+}
+
+func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
+	return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)}
+}
+
+// wouldBlock returns true for file types that can return EWOULDBLOCK
+// for blocking operations, e.g. pipes, character devices, and sockets.
+func wouldBlock(fileType uint32) bool {
+	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
+}
+
+// isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
+// If so, they can be transformed into syserror.ErrWouldBlock.
+func isBlockError(err error) bool {
+	return err == syserror.EAGAIN || err == syserror.EWOULDBLOCK
+}
diff --git a/pkg/sentry/fsimpl/proc/proc.go b/pkg/sentry/fsimpl/host/util_unsafe.go
index 31dec36de..5136ac844 100644
--- a/pkg/sentry/fsimpl/proc/proc.go
+++ b/pkg/sentry/fsimpl/host/util_unsafe.go
@@ -1,4 +1,4 @@
-// Copyright 2019 The gVisor Authors.
+// Copyright 2020 The gVisor Authors.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,5 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Package proc implements a partial in-memory file system for procfs.
-package proc
+package host
+
+import (
+	"syscall"
+	"unsafe"
+)
+
+func setTimestamps(fd int, ts *[2]syscall.Timespec) error {
+	_, _, errno := syscall.Syscall6(
+		syscall.SYS_UTIMENSAT,
+		uintptr(fd),
+		0, /* path */
+		uintptr(unsafe.Pointer(ts)),
+		0, /* flags */
+		0, 0)
+	if errno != 0 {
+		return errno
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
new file mode 100644
index 000000000..3835557fe
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -0,0 +1,75 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "kernfs",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "Dentry",
+    },
+)
+
+go_template_instance(
+    name = "slot_list",
+    out = "slot_list.go",
+    package = "kernfs",
+    prefix = "slot",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*slot",
+        "Linker": "*slot",
+    },
+)
+
+go_library(
+    name = "kernfs",
+    srcs = [
+        "dynamic_bytes_file.go",
+        "fd_impl_util.go",
+        "filesystem.go",
+        "fstree.go",
+        "inode_impl_util.go",
+        "kernfs.go",
+        "slot_list.go",
+        "symlink.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
+
+go_test(
+    name = "kernfs_test",
+    size = "small",
+    srcs = ["kernfs_test.go"],
+    deps = [
+        ":kernfs",
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
new file mode 100644
index 000000000..12adf727a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -0,0 +1,147 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// DynamicBytesFile implements kernfs.Inode and represents a read-only
+// file whose contents are backed by a vfs.DynamicBytesSource.
+//
+// Must be instantiated with NewDynamicBytesFile or initialized with Init
+// before first use.
+//
+// +stateify savable
+type DynamicBytesFile struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeNotDirectory
+	InodeNotSymlink
+
+	locks vfs.FileLocks
+	data  vfs.DynamicBytesSource
+}
+
+var _ Inode = (*DynamicBytesFile)(nil)
+
+// Init initializes a dynamic bytes file.
+func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+	f.data = data
+}
+
+// Open implements Inode.Open.
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &DynamicBytesFD{}
+	if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// SetStat implements Inode.SetStat. By default DynamicBytesFile doesn't allow
+// inode attributes to be changed. Override SetStat() making it call
+// f.InodeAttrs to allow it.
+func (*DynamicBytesFile) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// DynamicBytesFD implements vfs.FileDescriptionImpl for an FD backed by a
+// DynamicBytesFile.
+//
+// Must be initialized with Init before first use.
+//
+// +stateify savable
+type DynamicBytesFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DynamicBytesFileDescriptionImpl
+	vfs.LockFD
+
+	vfsfd vfs.FileDescription
+	inode Inode
+}
+
+// Init initializes a DynamicBytesFD.
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+	fd.LockFD.Init(locks)
+	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return err
+	}
+	fd.inode = d.Impl().(*Dentry).inode
+	fd.SetDataSource(data)
+	return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *DynamicBytesFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Seek(ctx, offset, whence)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *DynamicBytesFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Read(ctx, dst, opts)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *DynamicBytesFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *DynamicBytesFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *DynamicBytesFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.DynamicBytesFileDescriptionImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *DynamicBytesFD) Release(context.Context) {}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *DynamicBytesFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
+	// DynamicBytesFiles are immutable.
+	return syserror.EPERM
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *DynamicBytesFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *DynamicBytesFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
new file mode 100644
index 000000000..fcee6200a
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -0,0 +1,252 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"math"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
+// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
+// compatible with dynamic directories.
+//
+// Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
+// IterDirents callback. The IterDirents callback therefore cannot hash or
+// unhash children, or recursively call IterDirents on the same underlying
+// inode.
+//
+// Must be initialize with Init before first use.
+//
+// Lock ordering: mu => children.mu.
+type GenericDirectoryFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.LockFD
+
+	vfsfd    vfs.FileDescription
+	children *OrderedChildren
+
+	// mu protects the fields below.
+	mu sync.Mutex
+
+	// off is the current directory offset. Protected by "mu".
+	off int64
+}
+
+// NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
+// dentry.
+func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+	fd := &GenericDirectoryFD{}
+	if err := fd.Init(children, locks, opts); err != nil {
+		return nil, err
+	}
+	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return fd, nil
+}
+
+// Init initializes a GenericDirectoryFD. Use it when overriding
+// GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
+// correct implementation.
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
+	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
+		// Can't open directories for writing.
+		return syserror.EISDIR
+	}
+	fd.LockFD.Init(locks)
+	fd.children = children
+	return nil
+}
+
+// VFSFileDescription returns a pointer to the vfs.FileDescription representing
+// this object.
+func (fd *GenericDirectoryFD) VFSFileDescription() *vfs.FileDescription {
+	return &fd.vfsfd
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *GenericDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.FileDescriptionDefaultImpl.ConfigureMMap(ctx, opts)
+}
+
+// Read implmenets vfs.FileDescriptionImpl.Read.
+func (fd *GenericDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Read(ctx, dst, opts)
+}
+
+// PRead implmenets vfs.FileDescriptionImpl.PRead.
+func (fd *GenericDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PRead(ctx, dst, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *GenericDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.Write(ctx, src, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *GenericDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return fd.DirectoryFileDescriptionDefaultImpl.PWrite(ctx, src, offset, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *GenericDirectoryFD) Release(context.Context) {}
+
+func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *GenericDirectoryFD) inode() Inode {
+	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
+// o.mu when calling cb.
+func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	opts := vfs.StatOptions{Mask: linux.STATX_INO}
+	// Handle ".".
+	if fd.off == 0 {
+		stat, err := fd.inode().Stat(ctx, fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
+		dirent := vfs.Dirent{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     stat.Ino,
+			NextOff: 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return err
+		}
+		fd.off++
+	}
+
+	// Handle "..".
+	if fd.off == 1 {
+		vfsd := fd.vfsfd.VirtualDentry().Dentry()
+		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
+		stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
+		dirent := vfs.Dirent{
+			Name:    "..",
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: 2,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return err
+		}
+		fd.off++
+	}
+
+	// Handle static children.
+	fd.children.mu.RLock()
+	defer fd.children.mu.RUnlock()
+	// fd.off accounts for "." and "..", but fd.children do not track
+	// these.
+	childIdx := fd.off - 2
+	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
+		inode := it.Dentry.Impl().(*Dentry).inode
+		stat, err := inode.Stat(ctx, fd.filesystem(), opts)
+		if err != nil {
+			return err
+		}
+		dirent := vfs.Dirent{
+			Name:    it.Name,
+			Type:    linux.FileMode(stat.Mode).DirentType(),
+			Ino:     stat.Ino,
+			NextOff: fd.off + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return err
+		}
+		fd.off++
+	}
+
+	var err error
+	relOffset := fd.off - int64(len(fd.children.set)) - 2
+	fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+	return err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		// TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
+		// if they are added after SEEK_END.
+		offset = math.MaxInt64
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.filesystem()
+	inode := fd.inode()
+	return inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *GenericDirectoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return fd.DirectoryFileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *GenericDirectoryFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *GenericDirectoryFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
new file mode 100644
index 000000000..d7edb6342
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -0,0 +1,840 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+// This file implements vfs.FilesystemImpl for kernfs.
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// stepExistingLocked resolves rp.Component() in parent directory vfsd.
+//
+// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postcondition: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	// Directory searchable?
+	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	// Revalidation must be skipped if name is "." or ".."; d or its parent
+	// respectively can't be expected to transition from invalidated back to
+	// valid, so detecting invalidation and retrying would loop forever. This
+	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
+	// calls d_revalidate(), but walk_component() => handle_dots() does not.
+	if name == "." {
+		rp.Advance()
+		return vfsd, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return vfsd, nil
+		}
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return &d.parent.vfsd, nil
+	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
+	d.dirMu.Lock()
+	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, d.children[name])
+	d.dirMu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(ctx, &next.vfsd); err != nil {
+		return nil, err
+	}
+	// Resolve any symlink at current path component.
+	if mayFollowSymlinks && rp.ShouldFollowSymlink() && next.isSymlink() {
+		targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if targetVD.Ok() {
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef(ctx)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
+				return nil, err
+			}
+		}
+		goto afterSymlink
+	}
+	rp.Advance()
+	return &next.vfsd, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
+	if child != nil {
+		// Cached dentry exists, revalidate.
+		if !child.inode.Valid(ctx) {
+			delete(parent.children, name)
+			vfsObj.InvalidateDentry(ctx, &child.vfsd)
+			fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+			child = nil
+		}
+	}
+	if child == nil {
+		// Dentry isn't cached; it either doesn't exist or failed
+		// revalidation. Attempt to resolve it via Lookup.
+		//
+		// FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
+		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
+		// that all dentries in the filesystem are (kernfs.)Dentry and performs
+		// vfs.DentryImpl casts accordingly.
+		childVFSD, err := parent.inode.Lookup(ctx, name)
+		if err != nil {
+			return nil, err
+		}
+		// Reference on childVFSD dropped by a corresponding Valid.
+		child = childVFSD.Impl().(*Dentry)
+		parent.insertChildLocked(name, child)
+	}
+	return child, nil
+}
+
+// walkExistingLocked resolves rp to an existing file.
+//
+// walkExistingLocked is loosely analogous to Linux's
+// fs/namei.c:path_lookupat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Done() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory. It does not check that the returned directory is
+// searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
+	vfsd := rp.Start()
+	for !rp.Final() {
+		var err error
+		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		if err != nil {
+			return nil, nil, err
+		}
+	}
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return nil, nil, syserror.ENOTDIR
+	}
+	return vfsd, d.inode, nil
+}
+
+// checkCreateLocked checks that a file named rp.Component() may be created in
+// directory parentVFSD, then returns rp.Component().
+//
+// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
+// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
+	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return "", err
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return "", syserror.EEXIST
+	}
+	if len(pc) > linux.NAME_MAX {
+		return "", syserror.ENAMETOOLONG
+	}
+	// FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
+	if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
+		return "", syserror.EEXIST
+	}
+	if parentVFSD.IsDead() {
+		return "", syserror.ENOENT
+	}
+	return pc, nil
+}
+
+// checkDeleteLocked checks that the file represented by vfsd may be deleted.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
+	parent := vfsd.Impl().(*Dentry).parent
+	if parent == nil {
+		return syserror.EBUSY
+	}
+	if parent.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	return nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *Filesystem) Release(context.Context) {
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *Filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err != nil {
+		return err
+	}
+	return inode.CheckPermissions(ctx, creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+
+	if opts.CheckSearchable {
+		d := vfsd.Impl().(*Dentry)
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	vfsd.IncRef() // Ownership transferred to caller.
+	return vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	vfsd.IncRef() // Ownership transferred to caller.
+	return vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if rp.Mount() != vd.Mount() {
+		return syserror.EXDEV
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+
+	d := vd.Dentry().Impl().(*Dentry)
+	if d.isDir() {
+		return syserror.EPERM
+	}
+
+	childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	childVFSD, err := parentInode.NewDir(ctx, pc, opts)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	newVFSD, err := parentInode.NewNode(ctx, pc, opts)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Filter out flags that are not supported by kernfs. O_DIRECTORY and
+	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
+	// appropriate bits in rp), but are returned by
+	// FileDescriptionImpl.StatusFlags().
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
+	ats := vfs.AccessTypesForOpenFlags(&opts)
+
+	// Do not create new file.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.processDeferredDecRefs(ctx)
+		defer fs.mu.RUnlock()
+		vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+		if err != nil {
+			return nil, err
+		}
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(ctx, rp, vfsd, opts)
+	}
+
+	// May create new file.
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	vfsd := rp.Start()
+	inode := vfsd.Impl().(*Dentry).inode
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() {
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+		return inode.Open(ctx, rp, vfsd, opts)
+	}
+afterTrailingSymlink:
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	pc := rp.Component()
+	if pc == "." || pc == ".." {
+		return nil, syserror.EISDIR
+	}
+	if len(pc) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
+	// Determine whether or not we need to create a file.
+	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
+	if err == syserror.ENOENT {
+		// Already checked for searchability above; now check for writability.
+		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		childVFSD, err = parentInode.NewFile(ctx, pc, opts)
+		if err != nil {
+			return nil, err
+		}
+		child := childVFSD.Impl().(*Dentry)
+		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
+		return child.inode.Open(ctx, rp, childVFSD, opts)
+	}
+	if err != nil {
+		return nil, err
+	}
+	// Open existing file or follow symlink.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	child := childVFSD.Impl().(*Dentry)
+	if rp.ShouldFollowSymlink() && child.isSymlink() {
+		targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
+		if err != nil {
+			return nil, err
+		}
+		if targetVD.Ok() {
+			err := rp.HandleJump(targetVD)
+			targetVD.DecRef(ctx)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			if err := rp.HandleSymlink(targetPathname); err != nil {
+				return nil, err
+			}
+		}
+		// rp.Final() may no longer be true since we now need to resolve the
+		// symlink target.
+		goto afterTrailingSymlink
+	}
+	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	return child.inode.Open(ctx, rp, &child.vfsd, opts)
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	d, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return "", err
+	}
+	if !d.Impl().(*Dentry).isSymlink() {
+		return "", syserror.EINVAL
+	}
+	return inode.Readlink(ctx)
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// Only RENAME_NOREPLACE is supported.
+	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
+		return syserror.EINVAL
+	}
+	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
+
+	fs.mu.Lock()
+	defer fs.processDeferredDecRefsLocked(ctx)
+	defer fs.mu.Unlock()
+
+	// Resolve the destination directory first to verify that it's on this
+	// Mount.
+	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	if err != nil {
+		return err
+	}
+	dstDir := dstDirVFSD.Impl().(*Dentry)
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	srcDirVFSD := oldParentVD.Dentry()
+	srcDir := srcDirVFSD.Impl().(*Dentry)
+	srcDir.dirMu.Lock()
+	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDir.children[oldName])
+	srcDir.dirMu.Unlock()
+	if err != nil {
+		return err
+	}
+	srcVFSD := &src.vfsd
+
+	// Can we remove the src dentry?
+	if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil {
+		return err
+	}
+
+	// Can we create the dst dentry?
+	var dst *Dentry
+	pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
+	switch err {
+	case nil:
+		// Ok, continue with rename as replacement.
+	case syserror.EEXIST:
+		if noReplace {
+			// Won't overwrite existing node since RENAME_NOREPLACE was requested.
+			return syserror.EEXIST
+		}
+		dst = dstDir.children[pc]
+		if dst == nil {
+			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+		}
+	default:
+		return err
+	}
+	var dstVFSD *vfs.Dentry
+	if dst != nil {
+		dstVFSD = &dst.vfsd
+	}
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	virtfs := rp.VirtualFilesystem()
+
+	// We can't deadlock here due to lock ordering because we're protected from
+	// concurrent renames by fs.mu held for writing.
+	srcDir.dirMu.Lock()
+	defer srcDir.dirMu.Unlock()
+	if srcDir != dstDir {
+		dstDir.dirMu.Lock()
+		defer dstDir.dirMu.Unlock()
+	}
+
+	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
+		return err
+	}
+	replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
+	if err != nil {
+		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
+		return err
+	}
+	delete(srcDir.children, src.name)
+	if srcDir != dstDir {
+		fs.deferDecRef(srcDirVFSD)
+		dstDir.IncRef()
+	}
+	src.parent = dstDir
+	src.name = pc
+	if dstDir.children == nil {
+		dstDir.children = make(map[string]*Dentry)
+	}
+	dstDir.children[pc] = src
+	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+		return err
+	}
+	d := vfsd.Impl().(*Dentry)
+	if !d.isDir() {
+		return syserror.ENOTDIR
+	}
+	if inode.HasChildren() {
+		return syserror.ENOTEMPTY
+	}
+	virtfs := rp.VirtualFilesystem()
+	parentDentry := d.parent
+	parentDentry.dirMu.Lock()
+	defer parentDentry.dirMu.Unlock()
+
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
+		return err
+	}
+	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+		virtfs.AbortDeleteDentry(vfsd)
+		return err
+	}
+	virtfs.CommitDeleteDentry(ctx, vfsd)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return err
+	}
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	return inode.Stat(ctx, fs.VFSFilesystem(), opts)
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// TODO(gvisor.dev/issue/1193): actually implement statfs.
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	if rp.Done() {
+		return syserror.EEXIST
+	}
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
+	if err != nil {
+		return err
+	}
+	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	return nil
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	vfsd, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+		return err
+	}
+	d := vfsd.Impl().(*Dentry)
+	if d.isDir() {
+		return syserror.EISDIR
+	}
+	virtfs := rp.VirtualFilesystem()
+	parentDentry := d.parent
+	parentDentry.dirMu.Lock()
+	defer parentDentry.dirMu.Unlock()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
+		return err
+	}
+	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+		virtfs.AbortDeleteDentry(vfsd)
+		return err
+	}
+	virtfs.CommitDeleteDentry(ctx, vfsd)
+	return nil
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	fs.mu.RLock()
+	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return nil, err
+	}
+	if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return nil, err
+	}
+	// kernfs currently does not support extended attributes.
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return "", err
+	}
+	// kernfs currently does not support extended attributes.
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs(ctx)
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
new file mode 100644
index 000000000..c3efcf3ec
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -0,0 +1,613 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// InodeNoopRefCount partially implements the Inode interface, specifically the
+// inodeRefs sub interface. InodeNoopRefCount implements a simple reference
+// count for inodes, performing no extra actions when references are obtained or
+// released. This is suitable for simple file inodes that don't reference any
+// resources.
+type InodeNoopRefCount struct {
+}
+
+// IncRef implements Inode.IncRef.
+func (InodeNoopRefCount) IncRef() {
+}
+
+// DecRef implements Inode.DecRef.
+func (InodeNoopRefCount) DecRef(context.Context) {
+}
+
+// TryIncRef implements Inode.TryIncRef.
+func (InodeNoopRefCount) TryIncRef() bool {
+	return true
+}
+
+// Destroy implements Inode.Destroy.
+func (InodeNoopRefCount) Destroy(context.Context) {
+}
+
+// InodeDirectoryNoNewChildren partially implements the Inode interface.
+// InodeDirectoryNoNewChildren represents a directory inode which does not
+// support creation of new children.
+type InodeDirectoryNoNewChildren struct{}
+
+// NewFile implements Inode.NewFile.
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewLink implements Inode.NewLink.
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+// InodeNotDirectory partially implements the Inode interface, specifically the
+// inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
+// represent directories can embed this to provide no-op implementations for
+// directory-related functions.
+type InodeNotDirectory struct {
+}
+
+// HasChildren implements Inode.HasChildren.
+func (InodeNotDirectory) HasChildren() bool {
+	return false
+}
+
+// NewFile implements Inode.NewFile.
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+	panic("NewFile called on non-directory inode")
+}
+
+// NewDir implements Inode.NewDir.
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+	panic("NewDir called on non-directory inode")
+}
+
+// NewLink implements Inode.NewLinkink.
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+	panic("NewLink called on non-directory inode")
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	panic("NewSymlink called on non-directory inode")
+}
+
+// NewNode implements Inode.NewNode.
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	panic("NewNode called on non-directory inode")
+}
+
+// Unlink implements Inode.Unlink.
+func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+	panic("Unlink called on non-directory inode")
+}
+
+// RmDir implements Inode.RmDir.
+func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+	panic("RmDir called on non-directory inode")
+}
+
+// Rename implements Inode.Rename.
+func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+	panic("Rename called on non-directory inode")
+}
+
+// Lookup implements Inode.Lookup.
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	panic("Lookup called on non-directory inode")
+}
+
+// IterDirents implements Inode.IterDirents.
+func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	panic("IterDirents called on non-directory inode")
+}
+
+// Valid implements Inode.Valid.
+func (InodeNotDirectory) Valid(context.Context) bool {
+	return true
+}
+
+// InodeNoDynamicLookup partially implements the Inode interface, specifically
+// the inodeDynamicLookup sub interface. Directory inodes that do not support
+// dymanic entries (i.e. entries that are not "hashed" into the
+// vfs.Dentry.children) can embed this to provide no-op implementations for
+// functions related to dynamic entries.
+type InodeNoDynamicLookup struct{}
+
+// Lookup implements Inode.Lookup.
+func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	return nil, syserror.ENOENT
+}
+
+// IterDirents implements Inode.IterDirents.
+func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return offset, nil
+}
+
+// Valid implements Inode.Valid.
+func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
+	return true
+}
+
+// InodeNotSymlink partially implements the Inode interface, specifically the
+// inodeSymlink sub interface. All inodes that are not symlinks may embed this
+// to return the appropriate errors from symlink-related functions.
+type InodeNotSymlink struct{}
+
+// Readlink implements Inode.Readlink.
+func (InodeNotSymlink) Readlink(context.Context) (string, error) {
+	return "", syserror.EINVAL
+}
+
+// Getlink implements Inode.Getlink.
+func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, "", syserror.EINVAL
+}
+
+// InodeAttrs partially implements the Inode interface, specifically the
+// inodeMetadata sub interface. InodeAttrs provides functionality related to
+// inode attributes.
+//
+// Must be initialized by Init prior to first use.
+type InodeAttrs struct {
+	devMajor uint32
+	devMinor uint32
+	ino      uint64
+	mode     uint32
+	uid      uint32
+	gid      uint32
+	nlink    uint32
+}
+
+// Init initializes this InodeAttrs.
+func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
+	}
+
+	nlink := uint32(1)
+	if mode.FileType() == linux.ModeDirectory {
+		nlink = 2
+	}
+	a.devMajor = devMajor
+	a.devMinor = devMinor
+	atomic.StoreUint64(&a.ino, ino)
+	atomic.StoreUint32(&a.mode, uint32(mode))
+	atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
+	atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
+	atomic.StoreUint32(&a.nlink, nlink)
+}
+
+// DevMajor returns the device major number.
+func (a *InodeAttrs) DevMajor() uint32 {
+	return a.devMajor
+}
+
+// DevMinor returns the device minor number.
+func (a *InodeAttrs) DevMinor() uint32 {
+	return a.devMinor
+}
+
+// Ino returns the inode id.
+func (a *InodeAttrs) Ino() uint64 {
+	return atomic.LoadUint64(&a.ino)
+}
+
+// Mode implements Inode.Mode.
+func (a *InodeAttrs) Mode() linux.FileMode {
+	return linux.FileMode(atomic.LoadUint32(&a.mode))
+}
+
+// Stat partially implements Inode.Stat. Note that this function doesn't provide
+// all the stat fields, and the embedder should consider extending the result
+// with filesystem-specific fields.
+func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+	stat.DevMajor = a.devMajor
+	stat.DevMinor = a.devMinor
+	stat.Ino = atomic.LoadUint64(&a.ino)
+	stat.Mode = uint16(a.Mode())
+	stat.UID = atomic.LoadUint32(&a.uid)
+	stat.GID = atomic.LoadUint32(&a.gid)
+	stat.Nlink = atomic.LoadUint32(&a.nlink)
+
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+
+	return stat, nil
+}
+
+// SetStat implements Inode.SetStat.
+func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+		return syserror.EPERM
+	}
+	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
+		return err
+	}
+
+	stat := opts.Stat
+	if stat.Mask&linux.STATX_MODE != 0 {
+		for {
+			old := atomic.LoadUint32(&a.mode)
+			new := old | uint32(stat.Mode & ^uint16(linux.S_IFMT))
+			if swapped := atomic.CompareAndSwapUint32(&a.mode, old, new); swapped {
+				break
+			}
+		}
+	}
+
+	if stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&a.uid, stat.UID)
+	}
+	if stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&a.gid, stat.GID)
+	}
+
+	// Note that not all fields are modifiable. For example, the file type and
+	// inode numbers are immutable after node creation.
+
+	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
+	// Also, STATX_SIZE will need some special handling, because read-only static
+	// files should return EIO for truncate operations.
+
+	return nil
+}
+
+// CheckPermissions implements Inode.CheckPermissions.
+func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(
+		creds,
+		ats,
+		a.Mode(),
+		auth.KUID(atomic.LoadUint32(&a.uid)),
+		auth.KGID(atomic.LoadUint32(&a.gid)),
+	)
+}
+
+// IncLinks implements Inode.IncLinks.
+func (a *InodeAttrs) IncLinks(n uint32) {
+	if atomic.AddUint32(&a.nlink, n) <= n {
+		panic("InodeLink.IncLinks called with no existing links")
+	}
+}
+
+// DecLinks implements Inode.DecLinks.
+func (a *InodeAttrs) DecLinks() {
+	if nlink := atomic.AddUint32(&a.nlink, ^uint32(0)); nlink == ^uint32(0) {
+		// Negative overflow
+		panic("Inode.DecLinks called at 0 links")
+	}
+}
+
+type slot struct {
+	Name   string
+	Dentry *vfs.Dentry
+	slotEntry
+}
+
+// OrderedChildrenOptions contains initialization options for OrderedChildren.
+type OrderedChildrenOptions struct {
+	// Writable indicates whether vfs.FilesystemImpl methods implemented by
+	// OrderedChildren may modify the tracked children. This applies to
+	// operations related to rename, unlink and rmdir. If an OrderedChildren is
+	// not writable, these operations all fail with EPERM.
+	Writable bool
+}
+
+// OrderedChildren partially implements the Inode interface. OrderedChildren can
+// be embedded in directory inodes to keep track of the children in the
+// directory, and can then be used to implement a generic directory FD -- see
+// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
+// directories.
+//
+// Must be initialize with Init before first use.
+type OrderedChildren struct {
+	refs.AtomicRefCount
+
+	// Can children be modified by user syscalls? It set to false, interface
+	// methods that would modify the children return EPERM. Immutable.
+	writable bool
+
+	mu    sync.RWMutex
+	order slotList
+	set   map[string]*slot
+}
+
+// Init initializes an OrderedChildren.
+func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
+	o.writable = opts.Writable
+	o.set = make(map[string]*slot)
+}
+
+// DecRef implements Inode.DecRef.
+func (o *OrderedChildren) DecRef(ctx context.Context) {
+	o.AtomicRefCount.DecRefWithDestructor(ctx, o.Destroy)
+}
+
+// Destroy cleans up resources referenced by this OrderedChildren.
+func (o *OrderedChildren) Destroy(context.Context) {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	o.order.Reset()
+	o.set = nil
+}
+
+// Populate inserts children into this OrderedChildren, and d's dentry
+// cache. Populate returns the number of directories inserted, which the caller
+// may use to update the link count for the parent directory.
+//
+// Precondition: d must represent a directory inode. children must not contain
+// any conflicting entries already in o.
+func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+	var links uint32
+	for name, child := range children {
+		if child.isDir() {
+			links++
+		}
+		if err := o.Insert(name, child.VFSDentry()); err != nil {
+			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+		}
+		d.InsertChild(name, child)
+	}
+	return links
+}
+
+// HasChildren implements Inode.HasChildren.
+func (o *OrderedChildren) HasChildren() bool {
+	o.mu.RLock()
+	defer o.mu.RUnlock()
+	return len(o.set) > 0
+}
+
+// Insert inserts child into o. This ignores the writability of o, as this is
+// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if _, ok := o.set[name]; ok {
+		return syserror.EEXIST
+	}
+	s := &slot{
+		Name:   name,
+		Dentry: child,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return nil
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) removeLocked(name string) {
+	if s, ok := o.set[name]; ok {
+		delete(o.set, name)
+		o.order.Remove(s)
+	}
+}
+
+// Precondition: caller must hold o.mu for writing.
+func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+	if s, ok := o.set[name]; ok {
+		// Existing slot with given name, simply replace the dentry.
+		var old *vfs.Dentry
+		old, s.Dentry = s.Dentry, new
+		return old
+	}
+
+	// No existing slot with given name, create and hash new slot.
+	s := &slot{
+		Name:   name,
+		Dentry: new,
+	}
+	o.order.PushBack(s)
+	o.set[name] = s
+	return nil
+}
+
+// Precondition: caller must hold o.mu for reading or writing.
+func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+	s, ok := o.set[name]
+	if !ok {
+		return syserror.ENOENT
+	}
+	if s.Dentry != child {
+		panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+	}
+	return nil
+}
+
+// Unlink implements Inode.Unlink.
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+	if !o.writable {
+		return syserror.EPERM
+	}
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if err := o.checkExistingLocked(name, child); err != nil {
+		return err
+	}
+
+	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
+	o.removeLocked(name)
+	return nil
+}
+
+// Rmdir implements Inode.Rmdir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+	// We're not responsible for checking that child is a directory, that it's
+	// empty, or updating any link counts; so this is the same as unlink.
+	return o.Unlink(ctx, name, child)
+}
+
+type renameAcrossDifferentImplementationsError struct{}
+
+func (renameAcrossDifferentImplementationsError) Error() string {
+	return "rename across inodes with different implementations"
+}
+
+// Rename implements Inode.Rename.
+//
+// Precondition: Rename may only be called across two directory inodes with
+// identical implementations of Rename. Practically, this means filesystems that
+// implement Rename by embedding OrderedChildren for any directory
+// implementation must use OrderedChildren for all directory implementations
+// that will support Rename.
+//
+// Postcondition: reference on any replaced dentry transferred to caller.
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
+	dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+	if !ok {
+		return nil, renameAcrossDifferentImplementationsError{}
+	}
+	if !o.writable || !dst.writable {
+		return nil, syserror.EPERM
+	}
+	// Note: There's a potential deadlock below if concurrent calls to Rename
+	// refer to the same src and dst directories in reverse. We avoid any
+	// ordering issues because the caller is required to serialize concurrent
+	// calls to Rename in accordance with the interface declaration.
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	if dst != o {
+		dst.mu.Lock()
+		defer dst.mu.Unlock()
+	}
+	if err := o.checkExistingLocked(oldname, child); err != nil {
+		return nil, err
+	}
+
+	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
+	replaced := dst.replaceChildLocked(newname, child)
+	return replaced, nil
+}
+
+// nthLocked returns an iterator to the nth child tracked by this object. The
+// iterator is valid until the caller releases o.mu. Returns nil if the
+// requested index falls out of bounds.
+//
+// Preconditon: Caller must hold o.mu for reading.
+func (o *OrderedChildren) nthLocked(i int64) *slot {
+	for it := o.order.Front(); it != nil && i >= 0; it = it.Next() {
+		if i == 0 {
+			return it
+		}
+		i--
+	}
+	return nil
+}
+
+// InodeSymlink partially implements Inode interface for symlinks.
+type InodeSymlink struct {
+	InodeNotDirectory
+}
+
+// Open implements Inode.Open.
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return nil, syserror.ELOOP
+}
+
+// StaticDirectory is a standard implementation of a directory with static
+// contents.
+//
+// +stateify savable
+type StaticDirectory struct {
+	InodeNotSymlink
+	InodeDirectoryNoNewChildren
+	InodeAttrs
+	InodeNoDynamicLookup
+	OrderedChildren
+
+	locks vfs.FileLocks
+}
+
+var _ Inode = (*StaticDirectory)(nil)
+
+// NewStaticDir creates a new static directory and returns its dentry.
+func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+	inode := &StaticDirectory{}
+	inode.Init(creds, devMajor, devMinor, ino, perm)
+
+	dentry := &Dentry{}
+	dentry.Init(inode)
+
+	inode.OrderedChildren.Init(OrderedChildrenOptions{})
+	links := inode.OrderedChildren.Populate(dentry, children)
+	inode.IncLinks(links)
+
+	return dentry
+}
+
+// Init initializes StaticDirectory.
+func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
+}
+
+// Open implements kernfs.Inode.
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// AlwaysValid partially implements kernfs.inodeDynamicLookup.
+type AlwaysValid struct{}
+
+// Valid implements kernfs.inodeDynamicLookup.
+func (*AlwaysValid) Valid(context.Context) bool {
+	return true
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
new file mode 100644
index 000000000..080118841
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -0,0 +1,456 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kernfs provides the tools to implement inode-based filesystems.
+// Kernfs has two main features:
+//
+// 1. The Inode interface, which maps VFS2's path-based filesystem operations to
+//    specific filesystem nodes. Kernfs uses the Inode interface to provide a
+//    blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
+//    the synchronization mechanism for all filesystem operations by holding a
+//    filesystem-wide lock across all operations.
+//
+// 2. Various utility types which provide generic implementations for various
+//    parts of the Inode and vfs.FileDescription interfaces. Client filesystems
+//    based on kernfs can embed the appropriate set of these to avoid having to
+//    reimplement common filesystem operations. See inode_impl_util.go and
+//    fd_impl_util.go.
+//
+// Reference Model:
+//
+// Kernfs dentries represents named pointers to inodes. Dentries and inode have
+// independent lifetimes and reference counts. A child dentry unconditionally
+// holds a reference on its parent directory's dentry. A dentry also holds a
+// reference on the inode it points to. Multiple dentries can point to the same
+// inode (for example, in the case of hardlinks). File descriptors hold a
+// reference to the dentry they're opened on.
+//
+// Dentries are guaranteed to exist while holding Filesystem.mu for
+// reading. Dropping dentries require holding Filesystem.mu for writing. To
+// queue dentries for destruction from a read critical section, see
+// Filesystem.deferDecRef.
+//
+// Lock ordering:
+//
+// kernfs.Filesystem.mu
+//   kernfs.Dentry.dirMu
+//     vfs.VirtualFilesystem.mountMu
+//       vfs.Dentry.mu
+//   kernfs.Filesystem.droppedDentriesMu
+//   (inode implementation locks, if any)
+package kernfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+)
+
+// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
+// filesystem. Concrete implementations are expected to embed this in their own
+// Filesystem type.
+type Filesystem struct {
+	vfsfs vfs.Filesystem
+
+	droppedDentriesMu sync.Mutex
+
+	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
+	// used to defer dentry destruction until mu can be acquired for
+	// writing. Protected by droppedDentriesMu.
+	droppedDentries []*vfs.Dentry
+
+	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
+	// for reading guarantees continued existence of any resolved dentries, but
+	// the dentry tree may be modified.
+	//
+	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
+	// example:
+	//
+	//   fs.mu.Lock()
+	//   defer fs.mu.Unlock()
+	//   ...
+	//   dentry1.DecRef()
+	//   defer dentry2.DecRef() // Ok, will run before Unlock.
+	//
+	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
+	// example:
+	//
+	//   fs.mu.RLock()
+	//   fs.mu.processDeferredDecRefs()
+	//   defer fs.mu.RUnlock()
+	//   ...
+	//   fs.deferDecRef(dentry)
+	mu sync.RWMutex
+
+	// nextInoMinusOne is used to to allocate inode numbers on this
+	// filesystem. Must be accessed by atomic operations.
+	nextInoMinusOne uint64
+}
+
+// deferDecRef defers dropping a dentry ref until the next call to
+// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
+//
+// Precondition: d must not already be pending destruction.
+func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+	fs.droppedDentriesMu.Lock()
+	fs.droppedDentries = append(fs.droppedDentries, d)
+	fs.droppedDentriesMu.Unlock()
+}
+
+// processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
+// droppedDentries list. See comment on Filesystem.mu.
+func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
+	fs.mu.Lock()
+	fs.processDeferredDecRefsLocked(ctx)
+	fs.mu.Unlock()
+}
+
+// Precondition: fs.mu must be held for writing.
+func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) {
+	fs.droppedDentriesMu.Lock()
+	for _, d := range fs.droppedDentries {
+		d.DecRef(ctx)
+	}
+	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
+	fs.droppedDentriesMu.Unlock()
+}
+
+// VFSFilesystem returns the generic vfs filesystem object.
+func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
+	return &fs.vfsfs
+}
+
+// NextIno allocates a new inode number on this filesystem.
+func (fs *Filesystem) NextIno() uint64 {
+	return atomic.AddUint64(&fs.nextInoMinusOne, 1)
+}
+
+// These consts are used in the Dentry.flags field.
+const (
+	// Dentry points to a directory inode.
+	dflagsIsDir = 1 << iota
+
+	// Dentry points to a symlink inode.
+	dflagsIsSymlink
+)
+
+// Dentry implements vfs.DentryImpl.
+//
+// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
+// named reference to an inode. A dentry generally lives as long as it's part of
+// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
+// to them are removed. Dentries hold a single reference to the inode they point
+// to, and child dentries hold a reference on their parent.
+//
+// Must be initialized by Init prior to first use.
+type Dentry struct {
+	vfsd vfs.Dentry
+
+	refs.AtomicRefCount
+
+	// flags caches useful information about the dentry from the inode. See the
+	// dflags* consts above. Must be accessed by atomic ops.
+	flags uint32
+
+	parent *Dentry
+	name   string
+
+	// dirMu protects children and the names of child Dentries.
+	dirMu    sync.Mutex
+	children map[string]*Dentry
+
+	inode Inode
+}
+
+// Init initializes this dentry.
+//
+// Precondition: Caller must hold a reference on inode.
+//
+// Postcondition: Caller's reference on inode is transferred to the dentry.
+func (d *Dentry) Init(inode Inode) {
+	d.vfsd.Init(d)
+	d.inode = inode
+	ftype := inode.Mode().FileType()
+	if ftype == linux.ModeDirectory {
+		d.flags |= dflagsIsDir
+	}
+	if ftype == linux.ModeSymlink {
+		d.flags |= dflagsIsSymlink
+	}
+}
+
+// VFSDentry returns the generic vfs dentry for this kernfs dentry.
+func (d *Dentry) VFSDentry() *vfs.Dentry {
+	return &d.vfsd
+}
+
+// isDir checks whether the dentry points to a directory inode.
+func (d *Dentry) isDir() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
+}
+
+// isSymlink checks whether the dentry points to a symlink inode.
+func (d *Dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef(ctx context.Context) {
+	d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy)
+}
+
+// Precondition: Dentry must be removed from VFS' dentry cache.
+func (d *Dentry) destroy(ctx context.Context) {
+	d.inode.DecRef(ctx) // IncRef from Init.
+	d.inode = nil
+	if d.parent != nil {
+		d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+//
+// Although Linux technically supports inotify on pseudo filesystems (inotify
+// is implemented at the vfs layer), it is not particularly useful. It is left
+// unimplemented until someone actually needs it.
+func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *Dentry) Watches() *vfs.Watches {
+	return nil
+}
+
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+func (d *Dentry) OnZeroWatches(context.Context) {}
+
+// InsertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on
+// its own isn't sufficient to insert a child into a directory. InsertChild
+// updates the link count on d if required.
+//
+// Precondition: d must represent a directory inode.
+func (d *Dentry) InsertChild(name string, child *Dentry) {
+	d.dirMu.Lock()
+	d.insertChildLocked(name, child)
+	d.dirMu.Unlock()
+}
+
+// insertChildLocked is equivalent to InsertChild, with additional
+// preconditions.
+//
+// Precondition: d.dirMu must be locked.
+func (d *Dentry) insertChildLocked(name string, child *Dentry) {
+	if !d.isDir() {
+		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+	}
+	d.IncRef() // DecRef in child's Dentry.destroy.
+	child.parent = d
+	child.name = name
+	if d.children == nil {
+		d.children = make(map[string]*Dentry)
+	}
+	d.children[name] = child
+}
+
+// Inode returns the dentry's inode.
+func (d *Dentry) Inode() Inode {
+	return d.inode
+}
+
+// The Inode interface maps filesystem-level operations that operate on paths to
+// equivalent operations on specific filesystem nodes.
+//
+// The interface methods are groups into logical categories as sub interfaces
+// below. Generally, an implementation for each sub interface can be provided by
+// embedding an appropriate type from inode_impl_utils.go. The sub interfaces
+// are purely organizational. Methods declared directly in the main interface
+// have no generic implementations, and should be explicitly provided by the
+// client filesystem.
+//
+// Generally, implementations are not responsible for tasks that are common to
+// all filesystems. These include:
+//
+// - Checking that dentries passed to methods are of the appropriate file type.
+// - Checking permissions.
+// - Updating link and reference counts.
+//
+// Specific responsibilities of implementations are documented below.
+type Inode interface {
+	// Methods related to reference counting. A generic implementation is
+	// provided by InodeNoopRefCount. These methods are generally called by the
+	// equivalent Dentry methods.
+	inodeRefs
+
+	// Methods related to node metadata. A generic implementation is provided by
+	// InodeAttrs.
+	inodeMetadata
+
+	// Method for inodes that represent symlink. InodeNotSymlink provides a
+	// blanket implementation for all non-symlink inodes.
+	inodeSymlink
+
+	// Method for inodes that represent directories. InodeNotDirectory provides
+	// a blanket implementation for all non-directory inodes.
+	inodeDirectory
+
+	// Method for inodes that represent dynamic directories and their
+	// children. InodeNoDynamicLookup provides a blanket implementation for all
+	// non-dynamic-directory inodes.
+	inodeDynamicLookup
+
+	// Open creates a file description for the filesystem object represented by
+	// this inode. The returned file description should hold a reference on the
+	// inode for its lifetime.
+	//
+	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
+	// the inode on which Open() is being called.
+	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+}
+
+type inodeRefs interface {
+	IncRef()
+	DecRef(ctx context.Context)
+	TryIncRef() bool
+	// Destroy is called when the inode reaches zero references. Destroy release
+	// all resources (references) on objects referenced by the inode, including
+	// any child dentries.
+	Destroy(ctx context.Context)
+}
+
+type inodeMetadata interface {
+	// CheckPermissions checks that creds may access this inode for the
+	// requested access type, per the the rules of
+	// fs/namei.c:generic_permission().
+	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error
+
+	// Mode returns the (struct stat)::st_mode value for this inode. This is
+	// separated from Stat for performance.
+	Mode() linux.FileMode
+
+	// Stat returns the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.StatAt.
+	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)
+
+	// SetStat updates the metadata for this inode. This corresponds to
+	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
+	// if the operation can be performed (see vfs.CheckSetStat() for common
+	// checks).
+	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
+}
+
+// Precondition: All methods in this interface may only be called on directory
+// inodes.
+type inodeDirectory interface {
+	// The New{File,Dir,Node,Symlink} methods below should return a new inode
+	// hashed into this inode.
+	//
+	// These inode constructors are inode-level operations rather than
+	// filesystem-level operations to allow client filesystems to mix different
+	// implementations based on the new node's location in the
+	// filesystem.
+
+	// HasChildren returns true if the directory inode has any children.
+	HasChildren() bool
+
+	// NewFile creates a new regular file inode.
+	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+
+	// NewDir creates a new directory inode.
+	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+
+	// NewLink creates a new hardlink to a specified inode in this
+	// directory. Implementations should create a new kernfs Dentry pointing to
+	// target, and update target's link count.
+	NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+
+	// NewSymlink creates a new symbolic link inode.
+	NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+
+	// NewNode creates a new filesystem node for a mknod syscall.
+	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+
+	// Unlink removes a child dentry from this directory inode.
+	Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// RmDir removes an empty child directory from this directory
+	// inode. Implementations must update the parent directory's link count,
+	// if required. Implementations are not responsible for checking that child
+	// is a directory, checking for an empty directory.
+	RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+
+	// Rename is called on the source directory containing an inode being
+	// renamed. child should point to the resolved child in the source
+	// directory. If Rename replaces a dentry in the destination directory, it
+	// should return the replaced dentry or nil otherwise.
+	//
+	// Precondition: Caller must serialize concurrent calls to Rename.
+	Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
+}
+
+type inodeDynamicLookup interface {
+	// Lookup should return an appropriate dentry if name should resolve to a
+	// child of this dynamic directory inode. This gives the directory an
+	// opportunity on every lookup to resolve additional entries that aren't
+	// hashed into the directory. This is only called when the inode is a
+	// directory. If the inode is not a directory, or if the directory only
+	// contains a static set of children, the implementer can unconditionally
+	// return an appropriate error (ENOTDIR and ENOENT respectively).
+	//
+	// The child returned by Lookup will be hashed into the VFS dentry tree. Its
+	// lifetime can be controlled by the filesystem implementation with an
+	// appropriate implementation of Valid.
+	//
+	// Lookup returns the child with an extra reference and the caller owns this
+	// reference.
+	Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
+
+	// Valid should return true if this inode is still valid, or needs to
+	// be resolved again by a call to Lookup.
+	Valid(ctx context.Context) bool
+
+	// IterDirents is used to iterate over dynamically created entries. It invokes
+	// cb on each entry in the directory represented by the FileDescription.
+	// 'offset' is the offset for the entire IterDirents call, which may include
+	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
+	// inside the entries returned by this IterDirents invocation. In other words,
+	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
+	// the return value, while 'relOffset' is the place to start iteration.
+	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
+}
+
+type inodeSymlink interface {
+	// Readlink returns the target of a symbolic link. If an inode is not a
+	// symlink, the implementation should return EINVAL.
+	Readlink(ctx context.Context) (string, error)
+
+	// Getlink returns the target of a symbolic link, as used by path
+	// resolution:
+	//
+	// - If the inode is a "magic link" (a link whose target is most accurately
+	// represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
+	// nil). A reference is taken on the returned VirtualDentry.
+	//
+	// - If the inode is an ordinary symlink, Getlink returns (zero-value
+	// VirtualDentry, symlink target, nil).
+	//
+	// - If the inode is not a symlink, Getlink returns (zero-value
+	// VirtualDentry, "", EINVAL).
+	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
new file mode 100644
index 000000000..c5d5afedf
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -0,0 +1,330 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs_test
+
+import (
+	"bytes"
+	"fmt"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+const defaultMode linux.FileMode = 01777
+const staticFileContent = "This is sample content for a static test file."
+
+// RootDentryFn is a generator function for creating the root dentry of a test
+// filesystem. See newTestSystem.
+type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+
+// newTestSystem sets up a minimal environment for running a test, including an
+// instance of a test filesystem. Tests can control the contents of the
+// filesystem by providing an appropriate rootFn, which should return a
+// pre-populated root dentry.
+func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+	v := &vfs.VirtualFilesystem{}
+	if err := v.Init(ctx); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create testfs root mount: %v", err)
+	}
+	return testutil.NewSystem(ctx, t, v, mns)
+}
+
+type fsType struct {
+	rootFn RootDentryFn
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+}
+
+type file struct {
+	kernfs.DynamicBytesFile
+	content string
+}
+
+func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+	f := &file{}
+	f.content = content
+	f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(f)
+	return d
+}
+
+func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%s", f.content)
+	return nil
+}
+
+type attrs struct {
+	kernfs.InodeAttrs
+}
+
+func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+type readonlyDir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &readonlyDir{}
+	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+type dir struct {
+	attrs
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoDynamicLookup
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
+	fs     *filesystem
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &dir{}
+	dir.fs = fs
+	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
+	dir.dentry.Init(dir)
+
+	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
+
+	return &dir.dentry
+}
+
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	dir := d.fs.newDir(creds, opts.Mode, nil)
+	dirVFSD := dir.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+		dir.DecRef(ctx)
+		return nil, err
+	}
+	d.IncLinks(1)
+	return dirVFSD, nil
+}
+
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	f := d.fs.newFile(creds, "")
+	fVFSD := f.VFSDentry()
+	if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+		f.DecRef(ctx)
+		return nil, err
+	}
+	return fVFSD, nil
+}
+
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+	return nil, syserror.EPERM
+}
+
+func (fsType) Name() string {
+	return "kernfs"
+}
+
+func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	fs := &filesystem{}
+	fs.VFSFilesystem().Init(vfsObj, &fst, fs)
+	root := fst.rootFn(creds, fs)
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+// -------------------- Remainder of the file are test cases --------------------
+
+func TestBasic(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	defer sys.Destroy()
+	sys.GetDentryOrDie(sys.PathOpAtRoot("file1")).DecRef(sys.Ctx)
+}
+
+func TestMkdirGetDentry(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("dir1/a new directory")
+	if err := sys.VFS.MkdirAt(sys.Ctx, sys.Creds, pop, &vfs.MkdirOptions{Mode: 0755}); err != nil {
+		t.Fatalf("MkdirAt for PathOperation %+v failed: %v", pop, err)
+	}
+	sys.GetDentryOrDie(pop).DecRef(sys.Ctx)
+}
+
+func TestReadStaticFile(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("file1")
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef(sys.Ctx)
+
+	content, err := sys.ReadToEnd(fd)
+	if err != nil {
+		t.Fatalf("Read failed: %v", err)
+	}
+	if diff := cmp.Diff(staticFileContent, content); diff != "" {
+		t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+func TestCreateNewFileInStaticDir(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			"dir1": fs.newDir(creds, 0755, nil),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("dir1/newfile")
+	opts := &vfs.OpenOptions{Flags: linux.O_CREAT | linux.O_EXCL, Mode: defaultMode}
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, opts)
+	if err != nil {
+		t.Fatalf("OpenAt(pop:%+v, opts:%+v) failed: %v", pop, opts, err)
+	}
+
+	// Close the file. The file should persist.
+	fd.DecRef(sys.Ctx)
+
+	fd, err = sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+	}
+	fd.DecRef(sys.Ctx)
+}
+
+func TestDirFDReadWrite(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, nil)
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("/")
+	fd, err := sys.VFS.OpenAt(sys.Ctx, sys.Creds, pop, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef(sys.Ctx)
+
+	// Read/Write should fail for directory FDs.
+	if _, err := fd.Read(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.ReadOptions{}); err != syserror.EISDIR {
+		t.Fatalf("Read for directory FD failed with unexpected error: %v", err)
+	}
+	if _, err := fd.Write(sys.Ctx, usermem.BytesIOSequence([]byte{}), vfs.WriteOptions{}); err != syserror.EBADF {
+		t.Fatalf("Write for directory FD failed with unexpected error: %v", err)
+	}
+}
+
+func TestDirFDIterDirents(t *testing.T) {
+	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
+		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+			// Fill root with nodes backed by various inode implementations.
+			"dir1": fs.newReadonlyDir(creds, 0755, nil),
+			"dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
+				"dir3": fs.newDir(creds, 0755, nil),
+			}),
+			"file1": fs.newFile(creds, staticFileContent),
+		})
+	})
+	defer sys.Destroy()
+
+	pop := sys.PathOpAtRoot("/")
+	sys.AssertAllDirentTypes(sys.ListDirents(pop), map[string]testutil.DirentType{
+		"dir1":  linux.DT_DIR,
+		"dir2":  linux.DT_DIR,
+		"file1": linux.DT_REG,
+	})
+}
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
new file mode 100644
index 000000000..2ab3f53fd
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// StaticSymlink provides an Inode implementation for symlinks that point to
+// a immutable target.
+type StaticSymlink struct {
+	InodeAttrs
+	InodeNoopRefCount
+	InodeSymlink
+
+	target string
+}
+
+var _ Inode = (*StaticSymlink)(nil)
+
+// NewStaticSymlink creates a new symlink file pointing to 'target'.
+func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry {
+	inode := &StaticSymlink{}
+	inode.Init(creds, devMajor, devMinor, ino, target)
+
+	d := &Dentry{}
+	d.Init(inode)
+	return d
+}
+
+// Init initializes the instance.
+func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
+	s.target = target
+	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
+}
+
+// Readlink implements Inode.
+func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
+	return s.target, nil
+}
+
+// Getlink implements Inode.Getlink.
+func (s *StaticSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	return vfs.VirtualDentry{}, s.target, nil
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*StaticSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
deleted file mode 100644
index 04d667273..000000000
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ /dev/null
@@ -1,76 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-go_template_instance(
-    name = "dentry_list",
-    out = "dentry_list.go",
-    package = "memfs",
-    prefix = "dentry",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*dentry",
-        "Linker": "*dentry",
-    },
-)
-
-go_library(
-    name = "memfs",
-    srcs = [
-        "dentry_list.go",
-        "directory.go",
-        "filesystem.go",
-        "memfs.go",
-        "named_pipe.go",
-        "regular_file.go",
-        "symlink.go",
-    ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/amutex",
-        "//pkg/sentry/arch",
-        "//pkg/sentry/context",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/kernel/pipe",
-        "//pkg/sentry/usermem",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "benchmark_test",
-    size = "small",
-    srcs = ["benchmark_test.go"],
-    deps = [
-        ":memfs",
-        "//pkg/abi/linux",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/fs",
-        "//pkg/sentry/fs/tmpfs",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "memfs_test",
-    size = "small",
-    srcs = ["pipe_test.go"],
-    embed = [":memfs"],
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
deleted file mode 100644
index f006c40cd..000000000
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ /dev/null
@@ -1,579 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"fmt"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// stepLocked resolves rp.Component() in parent directory vfsd.
-//
-// stepLocked is loosely analogous to fs/namei.c:walk_component().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
-	}
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, nil, err
-	}
-afterSymlink:
-	nextVFSD, err := rp.ResolveComponent(vfsd)
-	if err != nil {
-		return nil, nil, err
-	}
-	if nextVFSD == nil {
-		// Since the Dentry tree is the sole source of truth for memfs, if it's
-		// not in the Dentry tree, it doesn't exist.
-		return nil, nil, syserror.ENOENT
-	}
-	nextInode := nextVFSD.Impl().(*dentry).inode
-	if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, nil, err
-		}
-		goto afterSymlink // don't check the current directory again
-	}
-	rp.Advance()
-	return nextVFSD, nextInode, nil
-}
-
-// walkExistingLocked resolves rp to an existing file.
-//
-// walkExistingLocked is loosely analogous to Linux's
-// fs/namei.c:path_lookupat().
-//
-// Preconditions: filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Done() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, nil, err
-		}
-	}
-	if rp.MustBeDir() && !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
-	}
-	return vfsd, inode, nil
-}
-
-// walkParentDirLocked resolves all but the last path component of rp to an
-// existing directory. It does not check that the returned directory is
-// searchable by the provider of rp.
-//
-// walkParentDirLocked is loosely analogous to Linux's
-// fs/namei.c:path_parentat().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, nil, err
-		}
-	}
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
-	}
-	return vfsd, inode, nil
-}
-
-// checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
-//
-// Preconditions: filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
-	if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
-		return "", err
-	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
-		return "", syserror.EEXIST
-	}
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
-	if err != nil {
-		return "", err
-	}
-	if childVFSD != nil {
-		return "", syserror.EEXIST
-	}
-	if parentVFSD.IsDisowned() {
-		return "", syserror.ENOENT
-	}
-	return pc, nil
-}
-
-// checkDeleteLocked checks that the file represented by vfsd may be deleted.
-func checkDeleteLocked(vfsd *vfs.Dentry) error {
-	parentVFSD := vfsd.Parent()
-	if parentVFSD == nil {
-		return syserror.EBUSY
-	}
-	if parentVFSD.IsDisowned() {
-		return syserror.ENOENT
-	}
-	return nil
-}
-
-// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	vfsd, inode, err := walkExistingLocked(rp)
-	if err != nil {
-		return nil, err
-	}
-	if opts.CheckSearchable {
-		if !inode.isDir() {
-			return nil, syserror.ENOTDIR
-		}
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-			return nil, err
-		}
-	}
-	inode.incRef() // vfsd.IncRef(&fs.vfsfs)
-	return vfsd, nil
-}
-
-// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if rp.Mount() != vd.Mount() {
-		return syserror.EXDEV
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	d := vd.Dentry().Impl().(*dentry)
-	if d.inode.isDir() {
-		return syserror.EPERM
-	}
-	d.inode.incLinksLocked()
-	child := fs.newDentry(d.inode)
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
-}
-
-// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	parentInode.incLinksLocked() // from child's ".."
-	return nil
-}
-
-// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-
-	switch opts.Mode.FileType() {
-	case 0:
-		// "Zero file type is equivalent to type S_IFREG." - mknod(2)
-		fallthrough
-	case linux.ModeRegular:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeNamedPipe:
-		child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
-		parentVFSD.InsertChild(&child.vfsd, pc)
-		parentInode.impl.(*directory).childList.PushBack(child)
-		return nil
-
-	case linux.ModeSocket:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeCharacterDevice:
-		fallthrough
-	case linux.ModeBlockDevice:
-		// TODO(b/72101894): We don't support creating block or character
-		// devices at the moment.
-		//
-		// When we start supporting block and character devices, we'll
-		// need to check for CAP_MKNOD here.
-		return syserror.EPERM
-
-	default:
-		// "EINVAL - mode requested creation of something other than a
-		// regular file, device special file, FIFO or socket." - mknod(2)
-		return syserror.EINVAL
-	}
-}
-
-// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	// Filter out flags that are not supported by memfs. O_DIRECTORY and
-	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
-	// appropriate bits in rp), but are returned by
-	// FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by
-	// pipes.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
-
-	if opts.Flags&linux.O_CREAT == 0 {
-		fs.mu.RLock()
-		defer fs.mu.RUnlock()
-		vfsd, inode, err := walkExistingLocked(rp)
-		if err != nil {
-			return nil, err
-		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
-	}
-
-	mustCreate := opts.Flags&linux.O_EXCL != 0
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	if rp.Done() {
-		if rp.MustBeDir() {
-			return nil, syserror.EISDIR
-		}
-		if mustCreate {
-			return nil, syserror.EEXIST
-		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
-	}
-afterTrailingSymlink:
-	// Walk to the parent directory of the last path component.
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, err
-		}
-	}
-	if !inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	// Check for search permission in the parent directory.
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, err
-	}
-	// Reject attempts to open directories with O_CREAT.
-	if rp.MustBeDir() {
-		return nil, syserror.EISDIR
-	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
-		return nil, syserror.EISDIR
-	}
-	// Determine whether or not we need to create a file.
-	childVFSD, err := rp.ResolveChild(vfsd, pc)
-	if err != nil {
-		return nil, err
-	}
-	if childVFSD == nil {
-		// Already checked for searchability above; now check for writability.
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
-			return nil, err
-		}
-		if err := rp.Mount().CheckBeginWrite(); err != nil {
-			return nil, err
-		}
-		defer rp.Mount().EndWrite()
-		// Create and open the child.
-		childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
-		child := fs.newDentry(childInode)
-		vfsd.InsertChild(&child.vfsd, pc)
-		inode.impl.(*directory).childList.PushBack(child)
-		return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true)
-	}
-	// Open existing file or follow symlink.
-	if mustCreate {
-		return nil, syserror.EEXIST
-	}
-	childInode := childVFSD.Impl().(*dentry).inode
-	if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, err
-		}
-		// rp.Final() may no longer be true since we now need to resolve the
-		// symlink target.
-		goto afterTrailingSymlink
-	}
-	return childInode.open(ctx, rp, childVFSD, opts.Flags, false)
-}
-
-func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
-	if !afterCreate {
-		if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
-			return nil, err
-		}
-	}
-	switch impl := i.impl.(type) {
-	case *regularFile:
-		var fd regularFileFD
-		fd.flags = flags
-		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-		if fd.writable {
-			if err := rp.Mount().CheckBeginWrite(); err != nil {
-				return nil, err
-			}
-			// Mount.EndWrite() is called by regularFileFD.Release().
-		}
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
-		if flags&linux.O_TRUNC != 0 {
-			impl.mu.Lock()
-			impl.data = impl.data[:0]
-			atomic.StoreInt64(&impl.dataLen, 0)
-			impl.mu.Unlock()
-		}
-		return &fd.vfsfd, nil
-	case *directory:
-		// Can't open directories writably.
-		if ats&vfs.MayWrite != 0 {
-			return nil, syserror.EISDIR
-		}
-		var fd directoryFD
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
-		fd.flags = flags
-		return &fd.vfsfd, nil
-	case *symlink:
-		// Can't open symlinks without O_PATH (which is unimplemented).
-		return nil, syserror.ELOOP
-	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, vfsd, flags)
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
-	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return "", err
-	}
-	symlink, ok := inode.impl.(*symlink)
-	if !ok {
-		return "", syserror.EINVAL
-	}
-	return symlink.target, nil
-}
-
-// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	if rp.Done() {
-		return syserror.ENOENT
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	_, err = checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	// TODO: actually implement RenameAt
-	return syserror.EPERM
-}
-
-// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
-	}
-	if !inode.isDir() {
-		return syserror.ENOTDIR
-	}
-	if vfsd.HasChildren() {
-		return syserror.ENOTEMPTY
-	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
-		return err
-	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decRef()
-	return nil
-}
-
-// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
-	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return err
-	}
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
-}
-
-// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
-	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	var stat linux.Statx
-	inode.statTo(&stat)
-	return stat, nil
-}
-
-// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
-	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
-	if err != nil {
-		return linux.Statfs{}, err
-	}
-	// TODO: actually implement statfs
-	return linux.Statfs{}, syserror.ENOSYS
-}
-
-// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
-}
-
-// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
-	}
-	if inode.isDir() {
-		return syserror.EISDIR
-	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
-		return err
-	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decLinksLocked()
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
deleted file mode 100644
index 64c851c1a..000000000
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ /dev/null
@@ -1,302 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package memfs provides a filesystem implementation that behaves like tmpfs:
-// the Dentry tree is the sole source of truth for the state of the filesystem.
-//
-// memfs is intended primarily to demonstrate filesystem implementation
-// patterns. Real uses cases for an in-memory filesystem should use tmpfs
-// instead.
-//
-// Lock order:
-//
-// filesystem.mu
-//   regularFileFD.offMu
-//     regularFile.mu
-//   inode.mu
-package memfs
-
-import (
-	"fmt"
-	"sync"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
-
-// filesystem implements vfs.FilesystemImpl.
-type filesystem struct {
-	vfsfs vfs.Filesystem
-
-	// mu serializes changes to the Dentry tree.
-	mu sync.RWMutex
-
-	nextInoMinusOne uint64 // accessed using atomic memory operations
-}
-
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	var fs filesystem
-	fs.vfsfs.Init(&fs)
-	root := fs.newDentry(fs.newDirectory(creds, 01777))
-	return &fs.vfsfs, &root.vfsd, nil
-}
-
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
-}
-
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
-	// All filesystem state is in-memory.
-	return nil
-}
-
-// dentry implements vfs.DentryImpl.
-type dentry struct {
-	vfsd vfs.Dentry
-
-	// inode is the inode represented by this dentry. Multiple Dentries may
-	// share a single non-directory inode (with hard links). inode is
-	// immutable.
-	inode *inode
-
-	// memfs doesn't count references on dentries; because the dentry tree is
-	// the sole source of truth, it is by definition always consistent with the
-	// state of the filesystem. However, it does count references on inodes,
-	// because inode resources are released when all references are dropped.
-	// (memfs doesn't really have resources to release, but we implement
-	// reference counting because tmpfs regular files will.)
-
-	// dentryEntry (ugh) links dentries into their parent directory.childList.
-	dentryEntry
-}
-
-func (fs *filesystem) newDentry(inode *inode) *dentry {
-	d := &dentry{
-		inode: inode,
-	}
-	d.vfsd.Init(d)
-	return d
-}
-
-// IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
-	d.inode.incRef()
-}
-
-// TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
-	return d.inode.tryIncRef()
-}
-
-// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
-	d.inode.decRef()
-}
-
-// inode represents a filesystem object.
-type inode struct {
-	// refs is a reference count. refs is accessed using atomic memory
-	// operations.
-	//
-	// A reference is held on all inodes that are reachable in the filesystem
-	// tree. For non-directories (which may have multiple hard links), this
-	// means that a reference is dropped when nlink reaches 0. For directories,
-	// nlink never reaches 0 due to the "." entry; instead,
-	// filesystem.RmdirAt() drops the reference.
-	refs int64
-
-	// Inode metadata; protected by mu and accessed using atomic memory
-	// operations unless otherwise specified.
-	mu    sync.RWMutex
-	mode  uint32 // excluding file type bits, which are based on impl
-	nlink uint32 // protected by filesystem.mu instead of inode.mu
-	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid   uint32 // auth.KGID, but ...
-	ino   uint64 // immutable
-
-	impl interface{} // immutable
-}
-
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
-	i.refs = 1
-	i.mode = uint32(mode)
-	i.uid = uint32(creds.EffectiveKUID)
-	i.gid = uint32(creds.EffectiveKGID)
-	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
-	// i.nlink initialized by caller
-	i.impl = impl
-}
-
-// Preconditions: filesystem.mu must be locked for writing.
-func (i *inode) incLinksLocked() {
-	if atomic.AddUint32(&i.nlink, 1) <= 1 {
-		panic("memfs.inode.incLinksLocked() called with no existing links")
-	}
-}
-
-// Preconditions: filesystem.mu must be locked for writing.
-func (i *inode) decLinksLocked() {
-	if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
-		i.decRef()
-	} else if nlink == ^uint32(0) { // negative overflow
-		panic("memfs.inode.decLinksLocked() called with no existing links")
-	}
-}
-
-func (i *inode) incRef() {
-	if atomic.AddInt64(&i.refs, 1) <= 1 {
-		panic("memfs.inode.incRef() called without holding a reference")
-	}
-}
-
-func (i *inode) tryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&i.refs)
-		if refs == 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
-func (i *inode) decRef() {
-	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
-		// This is unnecessary; it's mostly to simulate what tmpfs would do.
-		if regfile, ok := i.impl.(*regularFile); ok {
-			regfile.mu.Lock()
-			regfile.data = nil
-			atomic.StoreInt64(&regfile.dataLen, 0)
-			regfile.mu.Unlock()
-		}
-	} else if refs < 0 {
-		panic("memfs.inode.decRef() called without holding a reference")
-	}
-}
-
-func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
-}
-
-// Go won't inline this function, and returning linux.Statx (which is quite
-// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
-// output parameter.
-func (i *inode) statTo(stat *linux.Statx) {
-	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
-	stat.Blksize = 1 // usermem.PageSize in tmpfs
-	stat.Nlink = atomic.LoadUint32(&i.nlink)
-	stat.UID = atomic.LoadUint32(&i.uid)
-	stat.GID = atomic.LoadUint32(&i.gid)
-	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
-	stat.Ino = i.ino
-	// TODO: device number
-	switch impl := i.impl.(type) {
-	case *regularFile:
-		stat.Mode |= linux.S_IFREG
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
-		stat.Size = uint64(atomic.LoadInt64(&impl.dataLen))
-		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
-		// a uint64 accessed using atomic memory operations to avoid taking
-		// locks).
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *directory:
-		stat.Mode |= linux.S_IFDIR
-	case *symlink:
-		stat.Mode |= linux.S_IFLNK
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
-		stat.Size = uint64(len(impl.target))
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *namedPipe:
-		stat.Mode |= linux.S_IFIFO
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// allocatedBlocksForSize returns the number of 512B blocks needed to
-// accommodate the given size in bytes, as appropriate for struct
-// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
-// size is independent of the "preferred block size for I/O", struct
-// stat::st_blksize and struct statx::stx_blksize.)
-func allocatedBlocksForSize(size uint64) uint64 {
-	return (size + 511) / 512
-}
-
-func (i *inode) direntType() uint8 {
-	switch i.impl.(type) {
-	case *regularFile:
-		return linux.DT_REG
-	case *directory:
-		return linux.DT_DIR
-	case *symlink:
-		return linux.DT_LNK
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// fileDescription is embedded by memfs implementations of
-// vfs.FileDescriptionImpl.
-type fileDescription struct {
-	vfsfd vfs.FileDescription
-	vfs.FileDescriptionDefaultImpl
-
-	flags uint32 // status flags; immutable
-}
-
-func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
-}
-
-func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
-}
-
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	var stat linux.Statx
-	fd.inode().statTo(&stat)
-	return stat, nil
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
-}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
deleted file mode 100644
index b7f4853b3..000000000
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"io"
-	"sync"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-type regularFile struct {
-	inode inode
-
-	mu   sync.RWMutex
-	data []byte
-	// dataLen is len(data), but accessed using atomic memory operations to
-	// avoid locking in inode.stat().
-	dataLen int64
-}
-
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &regularFile{}
-	file.inode.init(file, fs, creds, mode)
-	file.inode.nlink = 1 // from parent directory
-	return &file.inode
-}
-
-type regularFileFD struct {
-	fileDescription
-
-	// These are immutable.
-	readable bool
-	writable bool
-
-	// off is the file offset. off is accessed using atomic memory operations.
-	// offMu serializes operations that may mutate off.
-	off   int64
-	offMu sync.Mutex
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.RLock()
-	if offset >= int64(len(f.data)) {
-		f.mu.RUnlock()
-		return 0, io.EOF
-	}
-	n, err := dst.CopyOut(ctx, f.data[offset:])
-	f.mu.RUnlock()
-	return int64(n), err
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PRead(ctx, dst, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	srclen := src.NumBytes()
-	if srclen == 0 {
-		return 0, nil
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.Lock()
-	end := offset + srclen
-	if end < offset {
-		// Overflow.
-		f.mu.Unlock()
-		return 0, syserror.EFBIG
-	}
-	if end > f.dataLen {
-		f.data = append(f.data, make([]byte, end-f.dataLen)...)
-		atomic.StoreInt64(&f.dataLen, end)
-	}
-	n, err := src.CopyIn(ctx, f.data[offset:end])
-	f.mu.Unlock()
-	return int64(n), err
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fd.offMu.Lock()
-	defer fd.offMu.Unlock()
-	switch whence {
-	case linux.SEEK_SET:
-		// use offset as specified
-	case linux.SEEK_CUR:
-		offset += fd.off
-	case linux.SEEK_END:
-		offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
-	default:
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	fd.off = offset
-	return offset, nil
-}
-
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
new file mode 100644
index 000000000..8cf5b35d3
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -0,0 +1,41 @@
+load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "overlay",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
+go_library(
+    name = "overlay",
+    srcs = [
+        "copy_up.go",
+        "directory.go",
+        "filesystem.go",
+        "fstree.go",
+        "non_directory.go",
+        "overlay.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
new file mode 100644
index 000000000..b3d19ff82
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -0,0 +1,262 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isCopiedUp() bool {
+	return atomic.LoadUint32(&d.copiedUp) != 0
+}
+
+// copyUpLocked ensures that d exists on the upper layer, i.e. d.upperVD.Ok().
+//
+// Preconditions: filesystem.renameMu must be locked.
+func (d *dentry) copyUpLocked(ctx context.Context) error {
+	// Fast path.
+	if d.isCopiedUp() {
+		return nil
+	}
+
+	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+	switch ftype {
+	case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
+		// Can be copied-up.
+	default:
+		// Can't be copied-up.
+		return syserror.EPERM
+	}
+
+	// Ensure that our parent directory is copied-up.
+	if d.parent == nil {
+		// d is a filesystem root with no upper layer.
+		return syserror.EROFS
+	}
+	if err := d.parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	if d.upperVD.Ok() {
+		// Raced with another call to d.copyUpLocked().
+		return nil
+	}
+	if d.vfsd.IsDead() {
+		// Raced with deletion of d.
+		return syserror.ENOENT
+	}
+
+	// Perform copy-up.
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	newpop := vfs.PathOperation{
+		Root:  d.parent.upperVD,
+		Start: d.parent.upperVD,
+		Path:  fspath.Parse(d.name),
+	}
+	cleanupUndoCopyUp := func() {
+		var err error
+		if ftype == linux.S_IFDIR {
+			err = vfsObj.RmdirAt(ctx, d.fs.creds, &newpop)
+		} else {
+			err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
+		}
+		if err != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+		}
+	}
+	switch ftype {
+	case linux.S_IFREG:
+		oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY,
+		})
+		if err != nil {
+			return err
+		}
+		defer oldFD.DecRef(ctx)
+		newFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &newpop, &vfs.OpenOptions{
+			Flags: linux.O_WRONLY | linux.O_CREAT | linux.O_EXCL,
+			Mode:  linux.FileMode(d.mode &^ linux.S_IFMT),
+		})
+		if err != nil {
+			return err
+		}
+		defer newFD.DecRef(ctx)
+		bufIOSeq := usermem.BytesIOSequence(make([]byte, 32*1024)) // arbitrary buffer size
+		for {
+			readN, readErr := oldFD.Read(ctx, bufIOSeq, vfs.ReadOptions{})
+			if readErr != nil && readErr != io.EOF {
+				cleanupUndoCopyUp()
+				return readErr
+			}
+			total := int64(0)
+			for total < readN {
+				writeN, writeErr := newFD.Write(ctx, bufIOSeq.DropFirst64(total), vfs.WriteOptions{})
+				total += writeN
+				if writeErr != nil {
+					cleanupUndoCopyUp()
+					return writeErr
+				}
+			}
+			if readErr == io.EOF {
+				break
+			}
+		}
+		if err := newFD.SetStat(ctx, vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = newFD.VirtualDentry()
+		d.upperVD.IncRef()
+
+	case linux.S_IFDIR:
+		if err := vfsObj.MkdirAt(ctx, d.fs.creds, &newpop, &vfs.MkdirOptions{
+			Mode: linux.FileMode(d.mode &^ linux.S_IFMT),
+		}); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	case linux.S_IFLNK:
+		target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		})
+		if err != nil {
+			return err
+		}
+		if err := vfsObj.SymlinkAt(ctx, d.fs.creds, &newpop, target); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
+				Mode: uint16(d.mode),
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	case linux.S_IFBLK, linux.S_IFCHR:
+		lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerVDs[0],
+			Start: d.lowerVDs[0],
+		}, &vfs.StatOptions{})
+		if err != nil {
+			return err
+		}
+		if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
+			Mode:     linux.FileMode(d.mode),
+			DevMajor: lowerStat.RdevMajor,
+			DevMinor: lowerStat.RdevMinor,
+		}); err != nil {
+			return err
+		}
+		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  d.uid,
+				GID:  d.gid,
+			},
+		}); err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		upperVD, err := vfsObj.GetDentryAt(ctx, d.fs.creds, &newpop, &vfs.GetDentryOptions{})
+		if err != nil {
+			cleanupUndoCopyUp()
+			return err
+		}
+		d.upperVD = upperVD
+
+	default:
+		// Should have rejected this at the beginning of this function?
+		panic(fmt.Sprintf("unexpected file type %o", ftype))
+	}
+
+	// TODO(gvisor.dev/issue/1199): copy up xattrs
+
+	// Update the dentry's device and inode numbers (except for directories,
+	// for which these remain overlay-assigned).
+	if ftype != linux.S_IFDIR {
+		upperStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.upperVD,
+			Start: d.upperVD,
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_INO,
+		})
+		if err != nil {
+			d.upperVD.DecRef(ctx)
+			d.upperVD = vfs.VirtualDentry{}
+			cleanupUndoCopyUp()
+			return err
+		}
+		if upperStat.Mask&linux.STATX_INO == 0 {
+			d.upperVD.DecRef(ctx)
+			d.upperVD = vfs.VirtualDentry{}
+			cleanupUndoCopyUp()
+			return syserror.EREMOTE
+		}
+		atomic.StoreUint32(&d.devMajor, upperStat.DevMajor)
+		atomic.StoreUint32(&d.devMinor, upperStat.DevMinor)
+		atomic.StoreUint64(&d.ino, upperStat.Ino)
+	}
+
+	atomic.StoreUint32(&d.copiedUp, 1)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
new file mode 100644
index 000000000..6a79f7ffe
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -0,0 +1,289 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func (d *dentry) isDir() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+// Preconditions: d.dirMu must be locked. d.isDir().
+func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	var readdirErr error
+	whiteouts := make(map[string]bool)
+	var maybeWhiteouts []string
+	d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+		layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  layerVD,
+			Start: layerVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+		})
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+		defer layerFD.DecRef(ctx)
+
+		// Reuse slice allocated for maybeWhiteouts from a previous layer to
+		// reduce allocations.
+		maybeWhiteouts = maybeWhiteouts[:0]
+		err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name == "." || dirent.Name == ".." {
+				return nil
+			}
+			if _, ok := whiteouts[dirent.Name]; ok {
+				// This file has been whited-out in a previous layer.
+				return nil
+			}
+			if dirent.Type == linux.DT_CHR {
+				// We have to determine if this is a whiteout, which doesn't
+				// count against the directory's emptiness. However, we can't
+				// do so while holding locks held by layerFD.IterDirents().
+				maybeWhiteouts = append(maybeWhiteouts, dirent.Name)
+				return nil
+			}
+			// Non-whiteout file in the directory prevents rmdir.
+			return syserror.ENOTEMPTY
+		}))
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+
+		for _, maybeWhiteoutName := range maybeWhiteouts {
+			stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  layerVD,
+				Start: layerVD,
+				Path:  fspath.Parse(maybeWhiteoutName),
+			}, &vfs.StatOptions{})
+			if err != nil {
+				readdirErr = err
+				return false
+			}
+			if stat.RdevMajor != 0 || stat.RdevMinor != 0 {
+				// This file is a real character device, not a whiteout.
+				readdirErr = syserror.ENOTEMPTY
+				return false
+			}
+			whiteouts[maybeWhiteoutName] = isUpper
+		}
+		// Continue iteration since we haven't found any non-whiteout files in
+		// this directory yet.
+		return true
+	})
+	return whiteouts, readdirErr
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+
+	mu      sync.Mutex
+	off     int64
+	dirents []vfs.Dirent
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release(ctx context.Context) {
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	d := fd.dentry()
+	if fd.dirents == nil {
+		ds, err := d.getDirents(ctx)
+		if err != nil {
+			return err
+		}
+		fd.dirents = ds
+	}
+
+	for fd.off < int64(len(fd.dirents)) {
+		if err := cb.Handle(fd.dirents[fd.off]); err != nil {
+			return err
+		}
+		fd.off++
+	}
+	return nil
+}
+
+// Preconditions: d.isDir().
+func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
+	d.fs.renameMu.RLock()
+	defer d.fs.renameMu.RUnlock()
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+
+	if d.dirents != nil {
+		return d.dirents, nil
+	}
+
+	parent := genericParentOrSelf(d)
+	dirents := []vfs.Dirent{
+		{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     d.ino,
+			NextOff: 1,
+		},
+		{
+			Name:    "..",
+			Type:    uint8(atomic.LoadUint32(&parent.mode) >> 12),
+			Ino:     parent.ino,
+			NextOff: 2,
+		},
+	}
+
+	// Merge dirents from all layers comprising this directory.
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	var readdirErr error
+	prevDirents := make(map[string]struct{})
+	var maybeWhiteouts []vfs.Dirent
+	d.iterLayers(func(layerVD vfs.VirtualDentry, isUpper bool) bool {
+		layerFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  layerVD,
+			Start: layerVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+		})
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+		defer layerFD.DecRef(ctx)
+
+		// Reuse slice allocated for maybeWhiteouts from a previous layer to
+		// reduce allocations.
+		maybeWhiteouts = maybeWhiteouts[:0]
+		err = layerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
+			if dirent.Name == "." || dirent.Name == ".." {
+				return nil
+			}
+			if _, ok := prevDirents[dirent.Name]; ok {
+				// This file is hidden by, or merged with, another file with
+				// the same name in a previous layer.
+				return nil
+			}
+			prevDirents[dirent.Name] = struct{}{}
+			if dirent.Type == linux.DT_CHR {
+				// We can't determine if this file is a whiteout while holding
+				// locks held by layerFD.IterDirents().
+				maybeWhiteouts = append(maybeWhiteouts, dirent)
+				return nil
+			}
+			dirent.NextOff = int64(len(dirents) + 1)
+			dirents = append(dirents, dirent)
+			return nil
+		}))
+		if err != nil {
+			readdirErr = err
+			return false
+		}
+
+		for _, dirent := range maybeWhiteouts {
+			stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  layerVD,
+				Start: layerVD,
+				Path:  fspath.Parse(dirent.Name),
+			}, &vfs.StatOptions{})
+			if err != nil {
+				readdirErr = err
+				return false
+			}
+			if stat.RdevMajor == 0 && stat.RdevMinor == 0 {
+				// This file is a whiteout; don't emit a dirent for it.
+				continue
+			}
+			dirent.NextOff = int64(len(dirents) + 1)
+			dirents = append(dirents, dirent)
+		}
+		return true
+	})
+	if readdirErr != nil {
+		return nil, readdirErr
+	}
+
+	// Cache dirents for future directoryFDs.
+	d.dirents = dirents
+	return dirents, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		if offset == 0 {
+			// Ensure that the next call to fd.IterDirents() calls
+			// fd.dentry().getDirents().
+			fd.dirents = nil
+		}
+		fd.off = offset
+		return fd.off, nil
+	case linux.SEEK_CUR:
+		offset += fd.off
+		if offset < 0 {
+			return 0, syserror.EINVAL
+		}
+		// Don't clear fd.dirents in this case, even if offset == 0.
+		fd.off = offset
+		return fd.off, nil
+	default:
+		return 0, syserror.EINVAL
+	}
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. Forwards sync to the upper
+// layer, if there is one. The lower layer doesn't need to sync because it
+// never changes.
+func (fd *directoryFD) Sync(ctx context.Context) error {
+	d := fd.dentry()
+	if !d.isCopiedUp() {
+		return nil
+	}
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	pop := vfs.PathOperation{
+		Root:  d.upperVD,
+		Start: d.upperVD,
+	}
+	upperFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
+	if err != nil {
+		return err
+	}
+	err = upperFD.Sync(ctx)
+	upperFD.DecRef(ctx)
+	return err
+}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
new file mode 100644
index 000000000..986b36ead
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -0,0 +1,1364 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
+// opaque directories.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
+const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+
+func isWhiteout(stat *linux.Statx) bool {
+	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
+}
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	if fs.opts.UpperRoot.Ok() {
+		return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx)
+	}
+	return nil
+}
+
+var dentrySlicePool = sync.Pool{
+	New: func() interface{} {
+		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+		return &ds
+	},
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil {
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	// Allow dentries to be GC'd.
+	for i := range *ds {
+		(*ds)[i] = nil
+	}
+	*ds = (*ds)[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil {
+		return
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock()
+		for _, d := range **ds {
+			d.checkDropLocked(ctx)
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+	if *ds == nil {
+		fs.renameMu.Unlock()
+		return
+	}
+	for _, d := range **ds {
+		d.checkDropLocked(ctx)
+	}
+	fs.renameMu.Unlock()
+	putDentrySlice(*ds)
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	child, err := fs.getChildLocked(ctx, d, name, ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	if child, ok := parent.children[name]; ok {
+		return child, nil
+	}
+	child, err := fs.lookupLocked(ctx, parent, name)
+	if err != nil {
+		return nil, err
+	}
+	if parent.children == nil {
+		parent.children = make(map[string]*dentry)
+	}
+	parent.children[name] = child
+	// child's refcount is initially 0, so it may be dropped after traversal.
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+	childPath := fspath.Parse(name)
+	child := fs.newDentry()
+	existsOnAnyLayer := false
+	var lookupErr error
+
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+		childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parentVD,
+			Start: parentVD,
+			Path:  childPath,
+		}, &vfs.GetDentryOptions{})
+		if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+			// The file doesn't exist on this layer. Proceed to the next one.
+			return true
+		}
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+
+		mask := uint32(linux.STATX_TYPE)
+		if !existsOnAnyLayer {
+			// Mode, UID, GID, and (for non-directories) inode number come from
+			// the topmost layer on which the file exists.
+			mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+		}
+		stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  childVD,
+			Start: childVD,
+		}, &vfs.StatOptions{
+			Mask: mask,
+		})
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		if stat.Mask&mask != mask {
+			lookupErr = syserror.EREMOTE
+			return false
+		}
+
+		if isWhiteout(&stat) {
+			// This is a whiteout, so it "doesn't exist" on this layer, and
+			// layers below this one are ignored.
+			return false
+		}
+		isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
+		if existsOnAnyLayer && !isDir {
+			// Directories are not merged with non-directory files from lower
+			// layers; instead, layers including and below the first
+			// non-directory file are ignored. (This file must be a directory
+			// on previous layers, since lower layers aren't searched for
+			// non-directory files.)
+			return false
+		}
+
+		// Update child to include this layer.
+		if isUpper {
+			child.upperVD = childVD
+			child.copiedUp = 1
+		} else {
+			child.lowerVDs = append(child.lowerVDs, childVD)
+		}
+		if !existsOnAnyLayer {
+			existsOnAnyLayer = true
+			child.mode = uint32(stat.Mode)
+			child.uid = stat.UID
+			child.gid = stat.GID
+			child.devMajor = stat.DevMajor
+			child.devMinor = stat.DevMinor
+			child.ino = stat.Ino
+		}
+
+		// For non-directory files, only the topmost layer that contains a file
+		// matters.
+		if !isDir {
+			return false
+		}
+
+		// Directories are merged with directories from lower layers if they
+		// are not explicitly opaque.
+		opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  childVD,
+			Start: childVD,
+		}, &vfs.GetxattrOptions{
+			Name: _OVL_XATTR_OPAQUE,
+			Size: 1,
+		})
+		return !(err == nil && opaqueVal == "y")
+	})
+
+	if lookupErr != nil {
+		child.destroyLocked(ctx)
+		return nil, lookupErr
+	}
+	if !existsOnAnyLayer {
+		child.destroyLocked(ctx)
+		return nil, syserror.ENOENT
+	}
+
+	// Device and inode numbers were copied from the topmost layer above;
+	// override them if necessary.
+	if child.isDir() {
+		child.devMajor = linux.UNNAMED_MAJOR
+		child.devMinor = fs.dirDevMinor
+		child.ino = fs.newDirIno()
+	} else if !child.upperVD.Ok() {
+		child.devMajor = linux.UNNAMED_MAJOR
+		child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()]
+	}
+
+	parent.IncRef()
+	child.parent = parent
+	child.name = name
+	return child, nil
+}
+
+// lookupLayerLocked is similar to lookupLocked, but only returns information
+// about the file rather than a dentry.
+//
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
+	childPath := fspath.Parse(name)
+	lookupLayer := lookupLayerNone
+	var lookupErr error
+
+	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
+		stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  parentVD,
+			Start: parentVD,
+			Path:  childPath,
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_TYPE,
+		})
+		if err == syserror.ENOENT || err == syserror.ENAMETOOLONG {
+			// The file doesn't exist on this layer. Proceed to the next
+			// one.
+			return true
+		}
+		if err != nil {
+			lookupErr = err
+			return false
+		}
+		if stat.Mask&linux.STATX_TYPE == 0 {
+			// Linux's overlayfs tends to return EREMOTE in cases where a file
+			// is unusable for reasons that are not better captured by another
+			// errno.
+			lookupErr = syserror.EREMOTE
+			return false
+		}
+		if isWhiteout(&stat) {
+			// This is a whiteout, so it "doesn't exist" on this layer, and
+			// layers below this one are ignored.
+			if isUpper {
+				lookupLayer = lookupLayerUpperWhiteout
+			}
+			return false
+		}
+		// The file exists; we can stop searching.
+		if isUpper {
+			lookupLayer = lookupLayerUpper
+		} else {
+			lookupLayer = lookupLayerLower
+		}
+		return false
+	})
+
+	return lookupLayer, lookupErr
+}
+
+type lookupLayer int
+
+const (
+	// lookupLayerNone indicates that no file exists at the given path on the
+	// upper layer, and is either whited out or does not exist on lower layers.
+	// Therefore, the file does not exist in the overlay filesystem, and file
+	// creation may proceed normally (if an upper layer exists).
+	lookupLayerNone lookupLayer = iota
+
+	// lookupLayerLower indicates that no file exists at the given path on the
+	// upper layer, but exists on a lower layer. Therefore, the file exists in
+	// the overlay filesystem, but must be copied-up before mutation.
+	lookupLayerLower
+
+	// lookupLayerUpper indicates that a non-whiteout file exists at the given
+	// path on the upper layer. Therefore, the file exists in the overlay
+	// filesystem, and is already copied-up.
+	lookupLayerUpper
+
+	// lookupLayerUpperWhiteout indicates that a whiteout exists at the given
+	// path on the upper layer. Therefore, the file does not exist in the
+	// overlay filesystem, and file creation must remove the whiteout before
+	// proceeding.
+	lookupLayerUpperWhiteout
+)
+
+func (ll lookupLayer) existsInOverlay() bool {
+	return ll == lookupLayerLower || ll == lookupLayerUpper
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
+	if parent.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Determine if a file already exists at name.
+	if _, ok := parent.children[name]; ok {
+		return syserror.EEXIST
+	}
+	childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
+	if err != nil {
+		return err
+	}
+	if childLayer.existsInOverlay() {
+		return syserror.EEXIST
+	}
+
+	// Ensure that the parent directory is copied-up so that we can create the
+	// new file in the upper layer.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// Finally create the new file.
+	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
+		return err
+	}
+	parent.dirents = nil
+	return nil
+}
+
+// Preconditions: pop's parent directory has been copied up.
+func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) error {
+	return vfsObj.MknodAt(ctx, fs.creds, pop, &vfs.MknodOptions{
+		Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0
+		// DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV
+	})
+}
+
+func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
+	if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+	}
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.checkPermissions(creds, ats)
+}
+
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	layerVD := d.topLayer()
+	return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	}, &opts)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		old := vd.Dentry().Impl().(*dentry)
+		if old.isDir() {
+			return syserror.EPERM
+		}
+		if err := old.copyUpLocked(ctx); err != nil {
+			return err
+		}
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		newpop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout {
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  old.upperVD,
+			Start: old.upperVD,
+		}, &newpop); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+			}
+			return err
+		}
+		creds := rp.Credentials()
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
+			}
+			return err
+		}
+		return nil
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		pop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout {
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		creds := rp.Credentials()
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		if haveUpperWhiteout {
+			// There may be directories on lower layers (previously hidden by
+			// the whiteout) that the new directory should not be merged with.
+			// Mark it opaque to prevent merging.
+			if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+				Name:  _OVL_XATTR_OPAQUE,
+				Value: "y",
+			}); err != nil {
+				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
+					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+				} else {
+					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+				}
+				return err
+			}
+		}
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		// Disallow attempts to create whiteouts.
+		if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
+			return syserror.EPERM
+		}
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		pop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout {
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		creds := rp.Credentials()
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		return nil
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	mayCreate := opts.Flags&linux.O_CREAT != 0
+	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if rp.Done() {
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.openLocked(ctx, rp, &opts)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Determine whether or not we need to create a file.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
+	if err == syserror.ENOENT && mayCreate {
+		fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds)
+		parent.dirMu.Unlock()
+		return fd, err
+	}
+	if err != nil {
+		parent.dirMu.Unlock()
+		return nil, err
+	}
+	// Open existing child or follow symlink.
+	parent.dirMu.Unlock()
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent
+		goto afterTrailingSymlink
+	}
+	return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+	if ats.MayWrite() {
+		if err := d.copyUpLocked(ctx); err != nil {
+			return nil, err
+		}
+	}
+	mnt := rp.Mount()
+
+	// Directory FDs open FDs from each layer when directory entries are read,
+	// so they don't require opening an FD from d.topLayer() up front.
+	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
+	if ftype == linux.S_IFDIR {
+		// Can't open directories with O_CREAT.
+		if opts.Flags&linux.O_CREAT != 0 {
+			return nil, syserror.EISDIR
+		}
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		if opts.Flags&linux.O_DIRECT != 0 {
+			return nil, syserror.EINVAL
+		}
+		fd := &directoryFD{}
+		fd.LockFD.Init(&d.locks)
+		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
+			UseDentryMetadata: true,
+		}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	}
+
+	layerVD, isUpper := d.topLayerInfo()
+	layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	}, opts)
+	if err != nil {
+		return nil, err
+	}
+	layerFlags := layerFD.StatusFlags()
+	fd := &nonDirectoryFD{
+		copiedUp:    isUpper,
+		cachedFD:    layerFD,
+		cachedFlags: layerFlags,
+	}
+	fd.LockFD.Init(&d.locks)
+	layerFDOpts := layerFD.Options()
+	if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil {
+		layerFD.DecRef(ctx)
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Preconditions: parent.dirMu must be locked. parent does not already contain
+// a child named rp.Component().
+func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
+	creds := rp.Credentials()
+	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	if parent.vfsd.IsDead() {
+		return nil, syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return nil, err
+	}
+	defer mnt.EndWrite()
+
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return nil, err
+	}
+
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	childName := rp.Component()
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(childName),
+	}
+	// We don't know if a whiteout exists on the upper layer; speculatively
+	// unlink it.
+	//
+	// TODO(gvisor.dev/issue/1199): Modify OpenAt => stepLocked so that we do
+	// know whether a whiteout exists.
+	var haveUpperWhiteout bool
+	switch err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err {
+	case nil:
+		haveUpperWhiteout = true
+	case syserror.ENOENT:
+		haveUpperWhiteout = false
+	default:
+		return nil, err
+	}
+	// Create the file on the upper layer, and get an FD representing it.
+	upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
+		Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL,
+		Mode:  opts.Mode,
+	})
+	if err != nil {
+		if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// Change the file's owner to the caller. We can't use upperFD.SetStat()
+	// because it will pick up creds from ctx.
+	if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_UID | linux.STATX_GID,
+			UID:  uint32(creds.EffectiveKUID),
+			GID:  uint32(creds.EffectiveKGID),
+		},
+	}); err != nil {
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+		} else if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// Re-lookup to get a dentry representing the new file, which is needed for
+	// the returned FD.
+	child, err := fs.getChildLocked(ctx, parent, childName, ds)
+	if err != nil {
+		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+		} else if haveUpperWhiteout {
+			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+		}
+		return nil, err
+	}
+	// Finally construct the overlay FD.
+	upperFlags := upperFD.StatusFlags()
+	fd := &nonDirectoryFD{
+		copiedUp:    true,
+		cachedFD:    upperFD,
+		cachedFlags: upperFlags,
+	}
+	fd.LockFD.Init(&child.locks)
+	upperFDOpts := upperFD.Options()
+	if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil {
+		upperFD.DecRef(ctx)
+		// Don't bother with cleanup; the file was created successfully, we
+		// just can't open it anymore for some reason.
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	layerVD := d.topLayer()
+	return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	})
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		return syserror.EINVAL
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.Lock()
+	defer fs.renameMuUnlockAndCheckDrop(ctx, &ds)
+	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	// FIXME(gvisor.dev/issue/1199): Actually implement rename.
+	_ = newParent
+	return syserror.EXDEV
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
+	}
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Ensure that parent is copied-up before potentially holding child.copyMu
+	// below.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	// Unlike UnlinkAt, we need a dentry representing the child directory being
+	// removed in order to verify that it's empty.
+	child, err := fs.getChildLocked(ctx, parent, name, &ds)
+	if err != nil {
+		return err
+	}
+	if !child.isDir() {
+		return syserror.ENOTDIR
+	}
+	child.dirMu.Lock()
+	defer child.dirMu.Unlock()
+	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
+	if err != nil {
+		return err
+	}
+	child.copyMu.RLock()
+	defer child.copyMu.RUnlock()
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+		return err
+	}
+
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(name),
+	}
+	if child.upperVD.Ok() {
+		cleanupRecreateWhiteouts := func() {
+			if !child.upperVD.Ok() {
+				return
+			}
+			for whiteoutName, whiteoutUpper := range whiteouts {
+				if !whiteoutUpper {
+					continue
+				}
+				if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+					Root:  child.upperVD,
+					Start: child.upperVD,
+					Path:  fspath.Parse(whiteoutName),
+				}); err != nil && err != syserror.EEXIST {
+					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+				}
+			}
+		}
+		// Remove existing whiteouts on the upper layer.
+		for whiteoutName, whiteoutUpper := range whiteouts {
+			if !whiteoutUpper {
+				continue
+			}
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+				Root:  child.upperVD,
+				Start: child.upperVD,
+				Path:  fspath.Parse(whiteoutName),
+			}); err != nil {
+				cleanupRecreateWhiteouts()
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+				return err
+			}
+		}
+		// Remove the existing directory on the upper layer.
+		if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
+			cleanupRecreateWhiteouts()
+			vfsObj.AbortDeleteDentry(&child.vfsd)
+			return err
+		}
+	}
+	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+		// Don't attempt to recover from this: the original directory is
+		// already gone, so any dentries representing it are invalid, and
+		// creating a new directory won't undo that.
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
+		vfsObj.AbortDeleteDentry(&child.vfsd)
+		return err
+	}
+
+	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+	delete(parent.children, name)
+	ds = appendDentry(ds, child)
+	parent.dirents = nil
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// Changes to d's attributes are serialized by d.copyMu.
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.upperVD,
+		Start: d.upperVD,
+	}, &opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	var stat linux.Statx
+	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+		layerVD := d.topLayer()
+		stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  layerVD,
+			Start: layerVD,
+		}, &vfs.StatOptions{
+			Mask: layerMask,
+			Sync: opts.Sync,
+		})
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	d.statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	_, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	return fs.statFS(ctx)
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
+		vfsObj := fs.vfsfs.VirtualFilesystem()
+		pop := vfs.PathOperation{
+			Root:  parent.upperVD,
+			Start: parent.upperVD,
+			Path:  fspath.Parse(childName),
+		}
+		if haveUpperWhiteout {
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+				return err
+			}
+		}
+		if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil {
+			if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		creds := rp.Credentials()
+		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
+			Stat: linux.Statx{
+				Mask: linux.STATX_UID | linux.STATX_GID,
+				UID:  uint32(creds.EffectiveKUID),
+				GID:  uint32(creds.EffectiveKGID),
+			},
+		}); err != nil {
+			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
+				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+			} else if haveUpperWhiteout {
+				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
+			}
+			return err
+		}
+		return nil
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return err
+	}
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	if err := rp.Mount().CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer rp.Mount().EndWrite()
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	if rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	// Ensure that parent is copied-up before potentially holding child.copyMu
+	// below.
+	if err := parent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+
+	child := parent.children[name]
+	var childLayer lookupLayer
+	if child != nil {
+		if child.isDir() {
+			return syserror.EISDIR
+		}
+		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+			return err
+		}
+		// Hold child.copyMu to prevent it from being copied-up during
+		// deletion.
+		child.copyMu.RLock()
+		defer child.copyMu.RUnlock()
+		if child.upperVD.Ok() {
+			childLayer = lookupLayerUpper
+		} else {
+			childLayer = lookupLayerLower
+		}
+	} else {
+		// Determine if the file being unlinked actually exists. Holding
+		// parent.dirMu prevents a dentry from being instantiated for the file,
+		// which in turn prevents it from being copied-up, so this result is
+		// stable.
+		childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+		if err != nil {
+			return err
+		}
+		if !childLayer.existsInOverlay() {
+			return syserror.ENOENT
+		}
+	}
+
+	pop := vfs.PathOperation{
+		Root:  parent.upperVD,
+		Start: parent.upperVD,
+		Path:  fspath.Parse(name),
+	}
+	if childLayer == lookupLayerUpper {
+		// Remove the existing file on the upper layer.
+		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
+			if child != nil {
+				vfsObj.AbortDeleteDentry(&child.vfsd)
+			}
+			return err
+		}
+	}
+	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
+		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
+		if child != nil {
+			vfsObj.AbortDeleteDentry(&child.vfsd)
+		}
+		return err
+	}
+
+	if child != nil {
+		vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+		delete(parent.children, name)
+		ds = appendDentry(ds, child)
+	}
+	parent.dirents = nil
+	return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	_, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	// TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
+	// but not any other xattr syscalls. For now we just reject all of them.
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	_, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	_, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	_, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
+}
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
new file mode 100644
index 000000000..d3060a481
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/non_directory.go
@@ -0,0 +1,266 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (d *dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+	layerVD := d.topLayer()
+	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	})
+}
+
+type nonDirectoryFD struct {
+	fileDescription
+
+	// If copiedUp is false, cachedFD represents
+	// fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
+	// fileDescription.dentry().upperVD. cachedFlags is the last known value of
+	// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
+	// protected by mu.
+	mu          sync.Mutex
+	copiedUp    bool
+	cachedFD    *vfs.FileDescription
+	cachedFlags uint32
+}
+
+func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return nil, err
+	}
+	wrappedFD.IncRef()
+	return wrappedFD, nil
+}
+
+func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+	d := fd.dentry()
+	statusFlags := fd.vfsfd.StatusFlags()
+	if !fd.copiedUp && d.isCopiedUp() {
+		// Switch to the copied-up file.
+		upperVD := d.topLayer()
+		upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  upperVD,
+			Start: upperVD,
+		}, &vfs.OpenOptions{
+			Flags: statusFlags,
+		})
+		if err != nil {
+			return nil, err
+		}
+		oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR)
+		if oldOffErr == nil {
+			if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
+				upperFD.DecRef(ctx)
+				return nil, err
+			}
+		}
+		fd.cachedFD.DecRef(ctx)
+		fd.copiedUp = true
+		fd.cachedFD = upperFD
+		fd.cachedFlags = statusFlags
+	} else if fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
+			return nil, err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	return fd.cachedFD, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *nonDirectoryFD) Release(ctx context.Context) {
+	fd.cachedFD.DecRef(ctx)
+	fd.cachedFD = nil
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
+	// Linux doesn't define ovl_file_operations.flush at all (i.e. its
+	// equivalent to OnClose is a no-op). We pass through to
+	// fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
+	// copied-up, since OnClose is mostly used to define post-close writeback,
+	// and if fd.cachedFD hasn't been updated then it can't have been used to
+	// mutate fd.dentry() anyway.
+	fd.mu.Lock()
+	if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
+			fd.mu.Unlock()
+			return err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	wrappedFD := fd.cachedFD
+	defer wrappedFD.IncRef()
+	fd.mu.Unlock()
+	return wrappedFD.OnClose(ctx)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+		wrappedFD, err := fd.getCurrentFD(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+		stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{
+			Mask: layerMask,
+			Sync: opts.Sync,
+		})
+		wrappedFD.DecRef(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	fd.dentry().statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	d := fd.dentry()
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	mnt := fd.vfsfd.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// Changes to d's attributes are serialized by d.copyMu.
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := wrappedFD.SetStat(ctx, opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	return nil
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return fd.filesystem().statFS(ctx)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Hold fd.mu during the read to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// Hold fd.mu during the write to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Write(ctx, src, opts)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Hold fd.mu during the seek to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Seek(ctx, offset, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
+	fd.mu.Lock()
+	if !fd.dentry().isCopiedUp() {
+		fd.mu.Unlock()
+		return nil
+	}
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		fd.mu.Unlock()
+		return err
+	}
+	wrappedFD.IncRef()
+	defer wrappedFD.DecRef(ctx)
+	fd.mu.Unlock()
+	return wrappedFD.Sync(ctx)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.ConfigureMMap(ctx, opts)
+}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
new file mode 100644
index 000000000..75cc006bf
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -0,0 +1,627 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package overlay provides an overlay filesystem implementation, which
+// synthesizes a filesystem by composing one or more immutable filesystems
+// ("lower layers") with an optional mutable filesystem ("upper layer").
+//
+// Lock order:
+//
+// directoryFD.mu / nonDirectoryFD.mu
+//   filesystem.renameMu
+//     dentry.dirMu
+//       dentry.copyMu
+//
+// Locking dentry.dirMu in multiple dentries requires that parent dentries are
+// locked before child dentries, and that filesystem.renameMu is locked to
+// stabilize this relationship.
+package overlay
+
+import (
+	"strings"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "overlay"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
+// FilesystemType.GetFilesystem.
+type FilesystemOptions struct {
+	// Callers passing FilesystemOptions to
+	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
+	// the vfs.Mounts comprising the layers of the overlay filesystem do not
+	// contain submounts.
+
+	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
+	// overlay.
+	UpperRoot vfs.VirtualDentry
+
+	// LowerRoots contains the roots of the immutable lower layers of the
+	// overlay. LowerRoots is immutable.
+	LowerRoots []vfs.VirtualDentry
+}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// Immutable options.
+	opts FilesystemOptions
+
+	// creds is a copy of the filesystem's creator's credentials, which are
+	// used for accesses to the filesystem's layers. creds is immutable.
+	creds *auth.Credentials
+
+	// dirDevMinor is the device minor number used for directories. dirDevMinor
+	// is immutable.
+	dirDevMinor uint32
+
+	// lowerDevMinors maps lower layer filesystems to device minor numbers
+	// assigned to non-directory files originating from that filesystem.
+	// lowerDevMinors is immutable.
+	lowerDevMinors map[*vfs.Filesystem]uint32
+
+	// renameMu synchronizes renaming with non-renaming operations in order to
+	// ensure consistent lock ordering between dentry.dirMu in different
+	// dentries.
+	renameMu sync.RWMutex
+
+	// lastDirIno is the last inode number assigned to a directory. lastDirIno
+	// is accessed using atomic memory operations.
+	lastDirIno uint64
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	fsoptsRaw := opts.InternalData
+	fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
+	if fsoptsRaw != nil && !haveFSOpts {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+		return nil, nil, syserror.EINVAL
+	}
+	if haveFSOpts {
+		if len(fsopts.LowerRoots) == 0 {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+			return nil, nil, syserror.EINVAL
+		}
+		if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+			return nil, nil, syserror.EINVAL
+		}
+		// We don't enforce a maximum number of lower layers when not
+		// configured by applications; the sandbox owner can have an overlay
+		// filesystem with any number of lower layers.
+	} else {
+		vfsroot := vfs.RootFromContext(ctx)
+		defer vfsroot.DecRef(ctx)
+		upperPathname, ok := mopts["upperdir"]
+		if ok {
+			delete(mopts, "upperdir")
+			// Linux overlayfs also requires a workdir when upperdir is
+			// specified; we don't, so silently ignore this option.
+			delete(mopts, "workdir")
+			upperPath := fspath.Parse(upperPathname)
+			if !upperPath.Absolute {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
+				return nil, nil, syserror.EINVAL
+			}
+			upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+				Root:               vfsroot,
+				Start:              vfsroot,
+				Path:               upperPath,
+				FollowFinalSymlink: true,
+			}, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+				return nil, nil, err
+			}
+			defer upperRoot.DecRef(ctx)
+			privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+				return nil, nil, err
+			}
+			defer privateUpperRoot.DecRef(ctx)
+			fsopts.UpperRoot = privateUpperRoot
+		}
+		lowerPathnamesStr, ok := mopts["lowerdir"]
+		if !ok {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+			return nil, nil, syserror.EINVAL
+		}
+		delete(mopts, "lowerdir")
+		lowerPathnames := strings.Split(lowerPathnamesStr, ":")
+		const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
+		if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
+			return nil, nil, syserror.EINVAL
+		}
+		if len(lowerPathnames) > maxLowerLayers {
+			ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
+			return nil, nil, syserror.EINVAL
+		}
+		for _, lowerPathname := range lowerPathnames {
+			lowerPath := fspath.Parse(lowerPathname)
+			if !lowerPath.Absolute {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+				return nil, nil, syserror.EINVAL
+			}
+			lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+				Root:               vfsroot,
+				Start:              vfsroot,
+				Path:               lowerPath,
+				FollowFinalSymlink: true,
+			}, &vfs.GetDentryOptions{
+				CheckSearchable: true,
+			})
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+				return nil, nil, err
+			}
+			defer lowerRoot.DecRef(ctx)
+			privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
+			if err != nil {
+				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+				return nil, nil, err
+			}
+			defer privateLowerRoot.DecRef(ctx)
+			fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
+		}
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	// Allocate device numbers.
+	dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+	lowerDevMinors := make(map[*vfs.Filesystem]uint32)
+	for _, lowerRoot := range fsopts.LowerRoots {
+		lowerFS := lowerRoot.Mount().Filesystem()
+		if _, ok := lowerDevMinors[lowerFS]; !ok {
+			devMinor, err := vfsObj.GetAnonBlockDevMinor()
+			if err != nil {
+				vfsObj.PutAnonBlockDevMinor(dirDevMinor)
+				for _, lowerDevMinor := range lowerDevMinors {
+					vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+				}
+				return nil, nil, err
+			}
+			lowerDevMinors[lowerFS] = devMinor
+		}
+	}
+
+	// Take extra references held by the filesystem.
+	if fsopts.UpperRoot.Ok() {
+		fsopts.UpperRoot.IncRef()
+	}
+	for _, lowerRoot := range fsopts.LowerRoots {
+		lowerRoot.IncRef()
+	}
+
+	fs := &filesystem{
+		opts:           fsopts,
+		creds:          creds.Fork(),
+		dirDevMinor:    dirDevMinor,
+		lowerDevMinors: lowerDevMinors,
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry.
+	root := fs.newDentry()
+	root.refs = 1
+	if fs.opts.UpperRoot.Ok() {
+		fs.opts.UpperRoot.IncRef()
+		root.copiedUp = 1
+		root.upperVD = fs.opts.UpperRoot
+	}
+	for _, lowerRoot := range fs.opts.LowerRoots {
+		lowerRoot.IncRef()
+		root.lowerVDs = append(root.lowerVDs, lowerRoot)
+	}
+	rootTopVD := root.topLayer()
+	// Get metadata from the topmost layer. See fs.lookupLocked().
+	const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  rootTopVD,
+		Start: rootTopVD,
+	}, &vfs.StatOptions{
+		Mask: rootStatMask,
+	})
+	if err != nil {
+		root.destroyLocked(ctx)
+		fs.vfsfs.DecRef(ctx)
+		return nil, nil, err
+	}
+	if rootStat.Mask&rootStatMask != rootStatMask {
+		root.destroyLocked(ctx)
+		fs.vfsfs.DecRef(ctx)
+		return nil, nil, syserror.EREMOTE
+	}
+	if isWhiteout(&rootStat) {
+		ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+		root.destroyLocked(ctx)
+		fs.vfsfs.DecRef(ctx)
+		return nil, nil, syserror.EINVAL
+	}
+	root.mode = uint32(rootStat.Mode)
+	root.uid = rootStat.UID
+	root.gid = rootStat.GID
+	if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR {
+		root.devMajor = linux.UNNAMED_MAJOR
+		root.devMinor = fs.dirDevMinor
+		root.ino = fs.newDirIno()
+	} else if !root.upperVD.Ok() {
+		root.devMajor = linux.UNNAMED_MAJOR
+		root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()]
+		root.ino = rootStat.Ino
+	} else {
+		root.devMajor = rootStat.DevMajor
+		root.devMinor = rootStat.DevMinor
+		root.ino = rootStat.Ino
+	}
+
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// clonePrivateMount creates a non-recursive bind mount rooted at vd, not
+// associated with any MountNamespace, and returns the root of the new mount.
+// (This is required to ensure that each layer of an overlay comprises only a
+// single mount, and therefore can't cross into e.g. the overlay filesystem
+// itself, risking lock recursion.) A reference is held on the returned
+// VirtualDentry.
+func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) {
+	oldmnt := vd.Mount()
+	opts := oldmnt.Options()
+	if forceReadOnly {
+		opts.ReadOnly = true
+	}
+	newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts)
+	if err != nil {
+		return vfs.VirtualDentry{}, err
+	}
+	return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+	vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor)
+	for _, lowerDevMinor := range fs.lowerDevMinors {
+		vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
+	}
+	if fs.opts.UpperRoot.Ok() {
+		fs.opts.UpperRoot.DecRef(ctx)
+	}
+	for _, lowerRoot := range fs.opts.LowerRoots {
+		lowerRoot.DecRef(ctx)
+	}
+}
+
+func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
+	// Always statfs the root of the topmost layer. Compare Linux's
+	// fs/overlayfs/super.c:ovl_statfs().
+	var rootVD vfs.VirtualDentry
+	if fs.opts.UpperRoot.Ok() {
+		rootVD = fs.opts.UpperRoot
+	} else {
+		rootVD = fs.opts.LowerRoots[0]
+	}
+	fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  rootVD,
+		Start: rootVD,
+	})
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC
+	return fsstat, nil
+}
+
+func (fs *filesystem) newDirIno() uint64 {
+	return atomic.AddUint64(&fs.lastDirIno, 1)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// mode, uid, and gid are the file mode, owner, and group of the file in
+	// the topmost layer (and therefore the overlay file as well), and are used
+	// for permission checks on this dentry. These fields are protected by
+	// copyMu and accessed using atomic memory operations.
+	mode uint32
+	uid  uint32
+	gid  uint32
+
+	// copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and
+	// 0 otherwise. copiedUp is accessed using atomic memory operations.
+	copiedUp uint32
+
+	// parent is the dentry corresponding to this dentry's parent directory.
+	// name is this dentry's name in parent. If this dentry is a filesystem
+	// root, parent is nil and name is the empty string. parent and name are
+	// protected by fs.renameMu.
+	parent *dentry
+	name   string
+
+	// If this dentry represents a directory, children maps the names of
+	// children for which dentries have been instantiated to those dentries,
+	// and dirents (if not nil) is a cache of dirents as returned by
+	// directoryFDs representing this directory. children is protected by
+	// dirMu.
+	dirMu    sync.Mutex
+	children map[string]*dentry
+	dirents  []vfs.Dirent
+
+	// upperVD and lowerVDs are the files from the overlay filesystem's layers
+	// that comprise the file on the overlay filesystem.
+	//
+	// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
+	// be copied up) with copyMu locked for writing; otherwise, it is
+	// immutable. lowerVDs is always immutable.
+	copyMu   sync.RWMutex
+	upperVD  vfs.VirtualDentry
+	lowerVDs []vfs.VirtualDentry
+
+	// inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <=
+	// len(inlineLowerVDs).
+	inlineLowerVDs [1]vfs.VirtualDentry
+
+	// devMajor, devMinor, and ino are the device major/minor and inode numbers
+	// used by this dentry. These fields are protected by copyMu and accessed
+	// using atomic memory operations.
+	devMajor uint32
+	devMinor uint32
+	ino      uint64
+
+	locks vfs.FileLocks
+}
+
+// newDentry creates a new dentry. The dentry initially has no references; it
+// is the caller's responsibility to set the dentry's reference count and/or
+// call dentry.destroy() as appropriate. The dentry is initially invalid in
+// that it contains no layers; the caller is responsible for setting them.
+func (fs *filesystem) newDentry() *dentry {
+	d := &dentry{
+		fs: fs,
+	}
+	d.lowerVDs = d.inlineLowerVDs[:0]
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
+	// d.checkDropLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked(ctx)
+		d.fs.renameMu.Unlock()
+	} else if refs < 0 {
+		panic("overlay.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or it
+// becomes deleted.
+//
+// Preconditions: d.fs.renameMu must be locked for writing.
+func (d *dentry) checkDropLocked(ctx context.Context) {
+	// Dentries with a positive reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires renameMu, so if d.refs is zero then it will
+	// remain zero while we hold renameMu for writing.) Dentries with a
+	// negative reference count have already been destroyed.
+	if atomic.LoadInt64(&d.refs) != 0 {
+		return
+	}
+	// Refs is still zero; destroy it.
+	d.destroyLocked(ctx)
+	return
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+func (d *dentry) destroyLocked(ctx context.Context) {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("overlay.dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("overlay.dentry.destroyLocked() called with references on the dentry")
+	}
+
+	if d.upperVD.Ok() {
+		d.upperVD.DecRef(ctx)
+	}
+	for _, lowerVD := range d.lowerVDs {
+		lowerVD.DecRef(ctx)
+	}
+
+	if d.parent != nil {
+		d.parent.dirMu.Lock()
+		if !d.vfsd.IsDead() {
+			delete(d.parent.children, d.name)
+		}
+		d.parent.dirMu.Unlock()
+		// Drop the reference held by d on its parent without recursively
+		// locking d.fs.renameMu.
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.checkDropLocked(ctx)
+		} else if refs < 0 {
+			panic("overlay.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
+	// TODO(gvisor.dev/issue/1479): Implement inotify.
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	// TODO(gvisor.dev/issue/1479): Implement inotify.
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+//
+// TODO(gvisor.dev/issue/1479): Implement inotify.
+func (d *dentry) OnZeroWatches(context.Context) {}
+
+// iterLayers invokes yield on each layer comprising d, from top to bottom. If
+// any call to yield returns false, iterLayer stops iteration.
+func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) {
+	if d.isCopiedUp() {
+		if !yield(d.upperVD, true) {
+			return
+		}
+	}
+	for _, lowerVD := range d.lowerVDs {
+		if !yield(lowerVD, false) {
+			return
+		}
+	}
+}
+
+func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) {
+	if d.isCopiedUp() {
+		return d.upperVD, true
+	}
+	return d.lowerVDs[0], false
+}
+
+func (d *dentry) topLayer() vfs.VirtualDentry {
+	vd, _ := d.topLayerInfo()
+	return vd
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+// statInternalMask is the set of stat fields that is set by
+// dentry.statInternalTo().
+const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+
+// statInternalTo writes fields to stat that are stored in d, and therefore do
+// not requiring invoking StatAt on the overlay's layers.
+func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) {
+	stat.Mask |= statInternalMask
+	if d.isDir() {
+		// Linux sets nlink to 1 for merged directories
+		// (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is
+		// correct more often ("." and the directory's entry in its parent),
+		// and some of our tests expect this.
+		stat.Nlink = 2
+	}
+	stat.UID = atomic.LoadUint32(&d.uid)
+	stat.GID = atomic.LoadUint32(&d.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
+	stat.Ino = atomic.LoadUint64(&d.ino)
+	stat.DevMajor = atomic.LoadUint32(&d.devMajor)
+	stat.DevMinor = atomic.LoadUint32(&d.devMinor)
+}
+
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
+	if opts.Stat.Mask&linux.STATX_MODE != 0 {
+		atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT))
+	}
+	if opts.Stat.Mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&d.uid, opts.Stat.UID)
+	}
+	if opts.Stat.Mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&d.gid, opts.Stat.GID)
+	}
+}
+
+// fileDescription is embedded by overlay implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/pipefs/BUILD b/pkg/sentry/fsimpl/pipefs/BUILD
new file mode 100644
index 000000000..5950a2d59
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/BUILD
@@ -0,0 +1,21 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "pipefs",
+    srcs = ["pipefs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
new file mode 100644
index 000000000..2ca793db9
--- /dev/null
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -0,0 +1,165 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pipefs provides the filesystem implementation backing
+// Kernel.PipeMount.
+package pipefs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type filesystemType struct{}
+
+// Name implements vfs.FilesystemType.Name.
+func (filesystemType) Name() string {
+	return "pipefs"
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("pipefs.filesystemType.GetFilesystem should never be called")
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// NewFilesystem sets up and returns a new vfs.Filesystem implemented by pipefs.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, err
+	}
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
+	return fs.Filesystem.VFSFilesystem(), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release(ctx)
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode)
+	b.PrependComponent(fmt.Sprintf("pipe:[%d]", inode.ino))
+	return vfs.PrependPathSyntheticError{}
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeNoopRefCount
+
+	locks vfs.FileLocks
+	pipe  *pipe.VFSPipe
+
+	ino uint64
+	uid auth.KUID
+	gid auth.KGID
+	// We use the creation timestamp for all of atime, mtime, and ctime.
+	ctime ktime.Time
+}
+
+func newInode(ctx context.Context, fs *filesystem) *inode {
+	creds := auth.CredentialsFromContext(ctx)
+	return &inode{
+		pipe:  pipe.NewVFSPipe(false /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+		ino:   fs.Filesystem.NextIno(),
+		uid:   creds.EffectiveKUID,
+		gid:   creds.EffectiveKGID,
+		ctime: ktime.NowFromContext(ctx),
+	}
+}
+
+const pipeMode = 0600 | linux.S_IFIFO
+
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
+func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, pipeMode, i.uid, i.gid)
+}
+
+// Mode implements kernfs.Inode.Mode.
+func (i *inode) Mode() linux.FileMode {
+	return pipeMode
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(_ context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	ts := linux.NsecToStatxTimestamp(i.ctime.Nanoseconds())
+	return linux.Statx{
+		Mask:     linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS,
+		Blksize:  usermem.PageSize,
+		Nlink:    1,
+		UID:      uint32(i.uid),
+		GID:      uint32(i.gid),
+		Mode:     pipeMode,
+		Ino:      i.ino,
+		Size:     0,
+		Blocks:   0,
+		Atime:    ts,
+		Ctime:    ts,
+		Mtime:    ts,
+		DevMajor: linux.UNNAMED_MAJOR,
+		DevMinor: vfsfs.Impl().(*filesystem).devMinor,
+	}, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	return syserror.EPERM
+}
+
+// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement
+// statfs, from which we should indicate PIPEFS_MAGIC.
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
+}
+
+// NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
+// and write ends of a newly-created pipe, as for pipe(2) and pipe2(2).
+//
+// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
+func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, *vfs.FileDescription) {
+	fs := mnt.Filesystem().Impl().(*filesystem)
+	inode := newInode(ctx, fs)
+	var d kernfs.Dentry
+	d.Init(inode)
+	defer d.DecRef(ctx)
+	return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
+}
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index ade6ac946..f074e6056 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,50 +1,68 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools:defs.bzl", "go_library", "go_test")
 
-package(licenses = ["notice"])
+licenses(["notice"])
 
 go_library(
     name = "proc",
     srcs = [
-        "filesystems.go",
-        "loadavg.go",
-        "meminfo.go",
-        "mounts.go",
-        "net.go",
-        "proc.go",
-        "stat.go",
-        "sys.go",
+        "filesystem.go",
+        "subtasks.go",
         "task.go",
-        "version.go",
+        "task_fds.go",
+        "task_files.go",
+        "task_net.go",
+        "tasks.go",
+        "tasks_files.go",
+        "tasks_sys.go",
     ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc",
+    visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
+        "//pkg/context",
         "//pkg/log",
-        "//pkg/sentry/context",
-        "//pkg/sentry/fs",
+        "//pkg/refs",
+        "//pkg/safemem",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/inet",
         "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
         "//pkg/sentry/limits",
         "//pkg/sentry/mm",
         "//pkg/sentry/socket",
         "//pkg/sentry/socket/unix",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
-        "//pkg/sentry/usermem",
         "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/tcpip/header",
+        "//pkg/tcpip/network/ipv4",
+        "//pkg/usermem",
     ],
 )
 
 go_test(
     name = "proc_test",
     size = "small",
-    srcs = ["net_test.go"],
-    embed = [":proc"],
+    srcs = [
+        "tasks_sys_test.go",
+        "tasks_test.go",
+    ],
+    library = ":proc",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/sentry/context/contexttest",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/fsimpl/tmpfs",
         "//pkg/sentry/inet",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
new file mode 100644
index 000000000..2463d51cd
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -0,0 +1,117 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package proc implements a partial in-memory file system for procfs.
+package proc
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// Name is the default filesystem name.
+const Name = "proc"
+
+// FilesystemType is the factory class for procfs.
+//
+// +stateify savable
+type FilesystemType struct{}
+
+var _ vfs.FilesystemType = (*FilesystemType)(nil)
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		return nil, nil, fmt.Errorf("procfs requires a kernel")
+	}
+	pidns := kernel.PIDNamespaceFromContext(ctx)
+	if pidns == nil {
+		return nil, nil, fmt.Errorf("procfs requires a PID namespace")
+	}
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+	procfs := &filesystem{
+		devMinor: devMinor,
+	}
+	procfs.VFSFilesystem().Init(vfsObj, &ft, procfs)
+
+	var cgroups map[string]string
+	if opts.InternalData != nil {
+		data := opts.InternalData.(*InternalData)
+		cgroups = data.Cgroups
+	}
+
+	_, dentry := procfs.newTasksInode(k, pidns, cgroups)
+	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release(ctx)
+}
+
+// dynamicInode is an overfitted interface for common Inodes with
+// dynamicByteSource types used in procfs.
+type dynamicInode interface {
+	kernfs.Inode
+	vfs.DynamicBytesSource
+
+	Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+}
+
+func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+type staticFile struct {
+	kernfs.DynamicBytesFile
+	vfs.StaticData
+}
+
+var _ dynamicInode = (*staticFile)(nil)
+
+func newStaticFile(data string) *staticFile {
+	return &staticFile{StaticData: vfs.StaticData{Data: data}}
+}
+
+// InternalData contains internal data passed in to the procfs mount via
+// vfs.GetFilesystemOptions.InternalData.
+type InternalData struct {
+	Cgroups map[string]string
+}
diff --git a/pkg/sentry/fsimpl/proc/filesystems.go b/pkg/sentry/fsimpl/proc/filesystems.go
deleted file mode 100644
index c36c4aff5..000000000
--- a/pkg/sentry/fsimpl/proc/filesystems.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-// filesystemsData implements vfs.DynamicBytesSource for /proc/filesystems.
-//
-// +stateify savable
-type filesystemsData struct{}
-
-// TODO(b/138862512): Implement vfs.DynamicBytesSource.Generate for
-// filesystemsData. We would need to retrive filesystem names from
-// vfs.VirtualFilesystem. Also needs vfs replacement for
-// fs.Filesystem.AllowUserList() and fs.FilesystemRequiresDev.
diff --git a/pkg/sentry/fsimpl/proc/loadavg.go b/pkg/sentry/fsimpl/proc/loadavg.go
deleted file mode 100644
index 9135afef1..000000000
--- a/pkg/sentry/fsimpl/proc/loadavg.go
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// loadavgData backs /proc/loadavg.
-//
-// +stateify savable
-type loadavgData struct{}
-
-var _ vfs.DynamicBytesSource = (*loadavgData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	// TODO(b/62345059): Include real data in fields.
-	// Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods.
-	// Column 4-5: currently running processes and the total number of processes.
-	// Column 6: the last process ID used.
-	fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0)
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/meminfo.go b/pkg/sentry/fsimpl/proc/meminfo.go
deleted file mode 100644
index 9a827cd66..000000000
--- a/pkg/sentry/fsimpl/proc/meminfo.go
+++ /dev/null
@@ -1,77 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
-//
-// +stateify savable
-type meminfoData struct {
-	// k is the owning Kernel.
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*meminfoData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	mf := d.k.MemoryFile()
-	mf.UpdateUsage()
-	snapshot, totalUsage := usage.MemoryAccounting.Copy()
-	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
-	anon := snapshot.Anonymous + snapshot.Tmpfs
-	file := snapshot.PageCache + snapshot.Mapped
-	// We don't actually have active/inactive LRUs, so just make up numbers.
-	activeFile := (file / 2) &^ (usermem.PageSize - 1)
-	inactiveFile := file - activeFile
-
-	fmt.Fprintf(buf, "MemTotal:       %8d kB\n", totalSize/1024)
-	memFree := (totalSize - totalUsage) / 1024
-	// We use MemFree as MemAvailable because we don't swap.
-	// TODO(rahat): When reclaim is implemented the value of MemAvailable
-	// should change.
-	fmt.Fprintf(buf, "MemFree:        %8d kB\n", memFree)
-	fmt.Fprintf(buf, "MemAvailable:   %8d kB\n", memFree)
-	fmt.Fprintf(buf, "Buffers:               0 kB\n") // memory usage by block devices
-	fmt.Fprintf(buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024)
-	// Emulate a system with no swap, which disables inactivation of anon pages.
-	fmt.Fprintf(buf, "SwapCache:             0 kB\n")
-	fmt.Fprintf(buf, "Active:         %8d kB\n", (anon+activeFile)/1024)
-	fmt.Fprintf(buf, "Inactive:       %8d kB\n", inactiveFile/1024)
-	fmt.Fprintf(buf, "Active(anon):   %8d kB\n", anon/1024)
-	fmt.Fprintf(buf, "Inactive(anon):        0 kB\n")
-	fmt.Fprintf(buf, "Active(file):   %8d kB\n", activeFile/1024)
-	fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024)
-	fmt.Fprintf(buf, "Unevictable:           0 kB\n") // TODO(b/31823263)
-	fmt.Fprintf(buf, "Mlocked:               0 kB\n") // TODO(b/31823263)
-	fmt.Fprintf(buf, "SwapTotal:             0 kB\n")
-	fmt.Fprintf(buf, "SwapFree:              0 kB\n")
-	fmt.Fprintf(buf, "Dirty:                 0 kB\n")
-	fmt.Fprintf(buf, "Writeback:             0 kB\n")
-	fmt.Fprintf(buf, "AnonPages:      %8d kB\n", anon/1024)
-	fmt.Fprintf(buf, "Mapped:         %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know
-	fmt.Fprintf(buf, "Shmem:          %8d kB\n", snapshot.Tmpfs/1024)
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/net.go b/pkg/sentry/fsimpl/proc/net.go
deleted file mode 100644
index fd46eebf8..000000000
--- a/pkg/sentry/fsimpl/proc/net.go
+++ /dev/null
@@ -1,338 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
-	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	"gvisor.dev/gvisor/pkg/sentry/inet"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/socket"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
-	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
-//
-// +stateify savable
-type ifinet6 struct {
-	s inet.Stack
-}
-
-var _ vfs.DynamicBytesSource = (*ifinet6)(nil)
-
-func (n *ifinet6) contents() []string {
-	var lines []string
-	nics := n.s.Interfaces()
-	for id, naddrs := range n.s.InterfaceAddrs() {
-		nic, ok := nics[id]
-		if !ok {
-			// NIC was added after NICNames was called. We'll just
-			// ignore it.
-			continue
-		}
-
-		for _, a := range naddrs {
-			// IPv6 only.
-			if a.Family != linux.AF_INET6 {
-				continue
-			}
-
-			// Fields:
-			// IPv6 address displayed in 32 hexadecimal chars without colons
-			// Netlink device number (interface index) in hexadecimal (use nic id)
-			// Prefix length in hexadecimal
-			// Scope value (use 0)
-			// Interface flags
-			// Device name
-			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
-		}
-	}
-	return lines
-}
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	for _, l := range n.contents() {
-		buf.WriteString(l)
-	}
-	return nil
-}
-
-// netDev implements vfs.DynamicBytesSource for /proc/net/dev.
-//
-// +stateify savable
-type netDev struct {
-	s inet.Stack
-}
-
-var _ vfs.DynamicBytesSource = (*netDev)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netDev) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	interfaces := n.s.Interfaces()
-	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
-	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
-
-	for _, i := range interfaces {
-		// Implements the same format as
-		// net/core/net-procfs.c:dev_seq_printf_stats.
-		var stats inet.StatDev
-		if err := n.s.Statistics(&stats, i.Name); err != nil {
-			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
-			continue
-		}
-		fmt.Fprintf(
-			buf,
-			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
-			i.Name,
-			// Received
-			stats[0], // bytes
-			stats[1], // packets
-			stats[2], // errors
-			stats[3], // dropped
-			stats[4], // fifo
-			stats[5], // frame
-			stats[6], // compressed
-			stats[7], // multicast
-			// Transmitted
-			stats[8],  // bytes
-			stats[9],  // packets
-			stats[10], // errors
-			stats[11], // dropped
-			stats[12], // fifo
-			stats[13], // frame
-			stats[14], // compressed
-			stats[15], // multicast
-		)
-	}
-
-	return nil
-}
-
-// netUnix implements vfs.DynamicBytesSource for /proc/net/unix.
-//
-// +stateify savable
-type netUnix struct {
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*netUnix)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (n *netUnix) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
-	for _, se := range n.k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %v in socket table, racing with destruction?", se.Sock)
-			continue
-		}
-		sfile := s.(*fs.File)
-		if family, _, _ := sfile.FileOperations.(socket.Socket).Type(); family != linux.AF_UNIX {
-			s.DecRef()
-			// Not a unix socket.
-			continue
-		}
-		sops := sfile.FileOperations.(*unix.SocketOperations)
-
-		addr, err := sops.Endpoint().GetLocalAddress()
-		if err != nil {
-			log.Warningf("Failed to retrieve socket name from %+v: %v", sfile, err)
-			addr.Addr = "<unknown>"
-		}
-
-		sockFlags := 0
-		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
-			if ce.Listening() {
-				// For unix domain sockets, linux reports a single flag
-				// value if the socket is listening, of __SO_ACCEPTCON.
-				sockFlags = linux.SO_ACCEPTCON
-			}
-		}
-
-		// In the socket entry below, the value for the 'Num' field requires
-		// some consideration. Linux prints the address to the struct
-		// unix_sock representing a socket in the kernel, but may redact the
-		// value for unprivileged users depending on the kptr_restrict
-		// sysctl.
-		//
-		// One use for this field is to allow a privileged user to
-		// introspect into the kernel memory to determine information about
-		// a socket not available through procfs, such as the socket's peer.
-		//
-		// In gvisor, returning a pointer to our internal structures would
-		// be pointless, as it wouldn't match the memory layout for struct
-		// unix_sock, making introspection difficult. We could populate a
-		// struct unix_sock with the appropriate data, but even that
-		// requires consideration for which kernel version to emulate, as
-		// the definition of this struct changes over time.
-		//
-		// For now, we always redact this pointer.
-		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %5d",
-			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			sfile.ReadRefs()-1,            // RefCount, don't count our own ref.
-			0,                             // Protocol, always 0 for UDS.
-			sockFlags,                     // Flags.
-			sops.Endpoint().Type(),        // Type.
-			sops.State(),                  // State.
-			sfile.InodeID(),               // Inode.
-		)
-
-		// Path
-		if len(addr.Addr) != 0 {
-			if addr.Addr[0] == 0 {
-				// Abstract path.
-				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
-			} else {
-				fmt.Fprintf(buf, " %s", string(addr.Addr))
-			}
-		}
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-	return nil
-}
-
-// netTCP implements vfs.DynamicBytesSource for /proc/net/tcp.
-//
-// +stateify savable
-type netTCP struct {
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*netTCP)(nil)
-
-func (n *netTCP) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	t := kernel.TaskFromContext(ctx)
-	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
-	for _, se := range n.k.ListSockets() {
-		s := se.Sock.Get()
-		if s == nil {
-			log.Debugf("Couldn't resolve weakref %+v in socket table, racing with destruction?", se.Sock)
-			continue
-		}
-		sfile := s.(*fs.File)
-		sops, ok := sfile.FileOperations.(socket.Socket)
-		if !ok {
-			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", sfile))
-		}
-		if family, stype, _ := sops.Type(); !(family == linux.AF_INET && stype == linux.SOCK_STREAM) {
-			s.DecRef()
-			// Not tcp4 sockets.
-			continue
-		}
-
-		// Linux's documentation for the fields below can be found at
-		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
-		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
-		// Note that the header doesn't contain labels for all the fields.
-
-		// Field: sl; entry number.
-		fmt.Fprintf(buf, "%4d: ", se.ID)
-
-		portBuf := make([]byte, 2)
-
-		// Field: local_adddress.
-		var localAddr linux.SockAddrInet
-		if local, _, err := sops.GetSockName(t); err == nil {
-			localAddr = *local.(*linux.SockAddrInet)
-		}
-		binary.LittleEndian.PutUint16(portBuf, localAddr.Port)
-		fmt.Fprintf(buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(localAddr.Addr[:]),
-			portBuf)
-
-		// Field: rem_address.
-		var remoteAddr linux.SockAddrInet
-		if remote, _, err := sops.GetPeerName(t); err == nil {
-			remoteAddr = *remote.(*linux.SockAddrInet)
-		}
-		binary.LittleEndian.PutUint16(portBuf, remoteAddr.Port)
-		fmt.Fprintf(buf, "%08X:%04X ",
-			binary.LittleEndian.Uint32(remoteAddr.Addr[:]),
-			portBuf)
-
-		// Field: state; socket state.
-		fmt.Fprintf(buf, "%02X ", sops.State())
-
-		// Field: tx_queue, rx_queue; number of packets in the transmit and
-		// receive queue. Unimplemented.
-		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
-
-		// Field: tr, tm->when; timer active state and number of jiffies
-		// until timer expires. Unimplemented.
-		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
-
-		// Field: retrnsmt; number of unrecovered RTO timeouts.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%08X ", 0)
-
-		// Field: uid.
-		uattr, err := sfile.Dirent.Inode.UnstableAttr(ctx)
-		if err != nil {
-			log.Warningf("Failed to retrieve unstable attr for socket file: %v", err)
-			fmt.Fprintf(buf, "%5d ", 0)
-		} else {
-			fmt.Fprintf(buf, "%5d ", uint32(uattr.Owner.UID.In(t.UserNamespace()).OrOverflow()))
-		}
-
-		// Field: timeout; number of unanswered 0-window probes.
-		// Unimplemented.
-		fmt.Fprintf(buf, "%8d ", 0)
-
-		// Field: inode.
-		fmt.Fprintf(buf, "%8d ", sfile.InodeID())
-
-		// Field: refcount. Don't count the ref we obtain while deferencing
-		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", sfile.ReadRefs()-1)
-
-		// Field: Socket struct address. Redacted due to the same reason as
-		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
-		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
-
-		// Field: retransmit timeout. Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: predicted tick of soft clock (delayed ACK control data).
-		// Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: sending congestion window, Unimplemented.
-		fmt.Fprintf(buf, "%d ", 0)
-
-		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
-		// Unimplemented, report as large threshold.
-		fmt.Fprintf(buf, "%d", -1)
-
-		fmt.Fprintf(buf, "\n")
-
-		s.DecRef()
-	}
-
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/stat.go b/pkg/sentry/fsimpl/proc/stat.go
deleted file mode 100644
index 720db3828..000000000
--- a/pkg/sentry/fsimpl/proc/stat.go
+++ /dev/null
@@ -1,127 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// cpuStats contains the breakdown of CPU time for /proc/stat.
-type cpuStats struct {
-	// user is time spent in userspace tasks with non-positive niceness.
-	user uint64
-
-	// nice is time spent in userspace tasks with positive niceness.
-	nice uint64
-
-	// system is time spent in non-interrupt kernel context.
-	system uint64
-
-	// idle is time spent idle.
-	idle uint64
-
-	// ioWait is time spent waiting for IO.
-	ioWait uint64
-
-	// irq is time spent in interrupt context.
-	irq uint64
-
-	// softirq is time spent in software interrupt context.
-	softirq uint64
-
-	// steal is involuntary wait time.
-	steal uint64
-
-	// guest is time spent in guests with non-positive niceness.
-	guest uint64
-
-	// guestNice is time spent in guests with positive niceness.
-	guestNice uint64
-}
-
-// String implements fmt.Stringer.
-func (c cpuStats) String() string {
-	return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice)
-}
-
-// statData implements vfs.DynamicBytesSource for /proc/stat.
-//
-// +stateify savable
-type statData struct {
-	// k is the owning Kernel.
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*statData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	// TODO(b/37226836): We currently export only zero CPU stats. We could
-	// at least provide some aggregate stats.
-	var cpu cpuStats
-	fmt.Fprintf(buf, "cpu  %s\n", cpu)
-
-	for c, max := uint(0), s.k.ApplicationCores(); c < max; c++ {
-		fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
-	}
-
-	// The total number of interrupts is dependent on the CPUs and PCI
-	// devices on the system. See arch_probe_nr_irqs.
-	//
-	// Since we don't report real interrupt stats, just choose an arbitrary
-	// value from a representative VM.
-	const numInterrupts = 256
-
-	// The Kernel doesn't handle real interrupts, so report all zeroes.
-	// TODO(b/37226836): We could count page faults as #PF.
-	fmt.Fprintf(buf, "intr 0") // total
-	for i := 0; i < numInterrupts; i++ {
-		fmt.Fprintf(buf, " 0")
-	}
-	fmt.Fprintf(buf, "\n")
-
-	// Total number of context switches.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "ctxt 0\n")
-
-	// CLOCK_REALTIME timestamp from boot, in seconds.
-	fmt.Fprintf(buf, "btime %d\n", s.k.Timekeeper().BootTime().Seconds())
-
-	// Total number of clones.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "processes 0\n")
-
-	// Number of runnable tasks.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "procs_running 0\n")
-
-	// Number of tasks waiting on IO.
-	// TODO(b/37226836): Count this.
-	fmt.Fprintf(buf, "procs_blocked 0\n")
-
-	// Number of each softirq handled.
-	fmt.Fprintf(buf, "softirq 0") // total
-	for i := 0; i < linux.NumSoftIRQ; i++ {
-		fmt.Fprintf(buf, " 0")
-	}
-	fmt.Fprintf(buf, "\n")
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
new file mode 100644
index 000000000..79c2725f3
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -0,0 +1,182 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// subtasksInode represents the inode for /proc/[pid]/task/ directory.
+//
+// +stateify savable
+type subtasksInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+
+	locks vfs.FileLocks
+
+	fs                *filesystem
+	task              *kernel.Task
+	pidns             *kernel.PIDNamespace
+	cgroupControllers map[string]string
+}
+
+var _ kernfs.Inode = (*subtasksInode)(nil)
+
+func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry {
+	subInode := &subtasksInode{
+		fs:                fs,
+		task:              task,
+		pidns:             pidns,
+		cgroupControllers: cgroupControllers,
+	}
+	// Note: credentials are overridden by taskOwnedInode.
+	subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	inode := &taskOwnedInode{Inode: subInode, owner: task}
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+
+	return dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	tid, err := strconv.ParseUint(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+
+	subTask := i.pidns.TaskWithID(kernel.ThreadID(tid))
+	if subTask == nil {
+		return nil, syserror.ENOENT
+	}
+	if subTask.ThreadGroup() != i.task.ThreadGroup() {
+		return nil, syserror.ENOENT
+	}
+
+	subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
+	return subTaskDentry.VFSDentry(), nil
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
+	if len(tasks) == 0 {
+		return offset, syserror.ENOENT
+	}
+	if relOffset >= int64(len(tasks)) {
+		return offset, nil
+	}
+
+	tids := make([]int, 0, len(tasks))
+	for _, tid := range tasks {
+		tids = append(tids, int(tid))
+	}
+
+	sort.Ints(tids)
+	for _, tid := range tids[relOffset:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(tid), 10),
+			Type:    linux.DT_DIR,
+			Ino:     i.fs.NextIno(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
+
+type subtasksFD struct {
+	kernfs.GenericDirectoryFD
+
+	task *kernel.Task
+}
+
+func (fd *subtasksFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.IterDirents(ctx, cb)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *subtasksFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return 0, syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.Seek(ctx, offset, whence)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *subtasksFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return linux.Statx{}, syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.Stat(ctx, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if fd.task.ExitState() >= kernel.TaskExitZombie {
+		return syserror.ENOENT
+	}
+	return fd.GenericDirectoryFD.SetStat(ctx, opts)
+}
+
+// Open implements kernfs.Inode.
+func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &subtasksFD{task: i.task}
+	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
+		return nil, err
+	}
+	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// Stat implements kernfs.Inode.
+func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		stat.Nlink += uint32(i.task.ThreadGroup().Count())
+	}
+	return stat, nil
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
diff --git a/pkg/sentry/fsimpl/proc/sys.go b/pkg/sentry/fsimpl/proc/sys.go
deleted file mode 100644
index b88256e12..000000000
--- a/pkg/sentry/fsimpl/proc/sys.go
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// mmapMinAddrData implements vfs.DynamicBytesSource for
-// /proc/sys/vm/mmap_min_addr.
-//
-// +stateify savable
-type mmapMinAddrData struct {
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*mmapMinAddrData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
-	return nil
-}
-
-// +stateify savable
-type overcommitMemory struct{}
-
-var _ vfs.DynamicBytesSource = (*overcommitMemory)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (d *overcommitMemory) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "0\n")
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index c46e05c3a..a5c7aa470 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -19,243 +19,221 @@ import (
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/mm"
-	"gvisor.dev/gvisor/pkg/sentry/usage"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// mapsCommon is embedded by mapsData and smapsData.
-type mapsCommon struct {
-	t *kernel.Task
-}
+// taskInode represents the inode for /proc/PID/ directory.
+//
+// +stateify savable
+type taskInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
 
-// mm gets the kernel task's MemoryManager. No additional reference is taken on
-// mm here. This is safe because MemoryManager.destroy is required to leave the
-// MemoryManager in a state where it's still usable as a DynamicBytesSource.
-func (md *mapsCommon) mm() *mm.MemoryManager {
-	var tmm *mm.MemoryManager
-	md.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			tmm = mm
-		}
-	})
-	return tmm
+	locks vfs.FileLocks
+
+	task *kernel.Task
 }
 
-// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
-//
-// +stateify savable
-type mapsData struct {
-	mapsCommon
+var _ kernfs.Inode = (*taskInode)(nil)
+
+func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
+	// TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited.
+	contents := map[string]*kernfs.Dentry{
+		"auxv":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}),
+		"cmdline":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+		"comm":      fs.newComm(task, fs.NextIno(), 0444),
+		"environ":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
+		"exe":       fs.newExeSymlink(task, fs.NextIno()),
+		"fd":        fs.newFDDirInode(task),
+		"fdinfo":    fs.newFDInfoDirInode(task),
+		"gid_map":   fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
+		"io":        fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
+		"maps":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}),
+		"mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
+		"mounts":    fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}),
+		"net":       fs.newTaskNetDir(task),
+		"ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{
+			"net":  fs.newNamespaceSymlink(task, fs.NextIno(), "net"),
+			"pid":  fs.newNamespaceSymlink(task, fs.NextIno(), "pid"),
+			"user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"),
+		}),
+		"oom_score":     fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}),
+		"status":        fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
+	}
+	if isThreadGroup {
+		contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers)
+	}
+	if len(cgroupControllers) > 0 {
+		contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
+	}
+
+	taskInode := &taskInode{task: task}
+	// Note: credentials are overridden by taskOwnedInode.
+	taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+
+	inode := &taskOwnedInode{Inode: taskInode, owner: task}
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+
+	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := taskInode.OrderedChildren.Populate(dentry, contents)
+	taskInode.IncLinks(links)
+
+	return dentry
 }
 
-var _ vfs.DynamicBytesSource = (*mapsData)(nil)
+// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// as the task is still running. When it's dead, another tasks with the same
+// PID could replace it.
+func (i *taskInode) Valid(ctx context.Context) bool {
+	return i.task.ExitState() != kernel.TaskExitDead
+}
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (md *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	if mm := md.mm(); mm != nil {
-		mm.ReadMapsDataInto(ctx, buf)
+// Open implements kernfs.Inode.
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if err != nil {
+		return nil, err
 	}
-	return nil
+	return fd.VFSFileDescription(), nil
 }
 
-// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
-//
-// +stateify savable
-type smapsData struct {
-	mapsCommon
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
 }
 
-var _ vfs.DynamicBytesSource = (*smapsData)(nil)
+// taskOwnedInode implements kernfs.Inode and overrides inode owner with task
+// effective user and group.
+type taskOwnedInode struct {
+	kernfs.Inode
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (sd *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	if mm := sd.mm(); mm != nil {
-		mm.ReadSmapsDataInto(ctx, buf)
-	}
-	return nil
+	// owner is the task that owns this inode.
+	owner *kernel.Task
 }
 
-// +stateify savable
-type taskStatData struct {
-	t *kernel.Task
+var _ kernfs.Inode = (*taskOwnedInode)(nil)
 
-	// If tgstats is true, accumulate fault stats (not implemented) and CPU
-	// time across all tasks in t's thread group.
-	tgstats bool
+func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
 
-	// pidns is the PID namespace associated with the proc filesystem that
-	// includes the file using this statData.
-	pidns *kernel.PIDNamespace
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
 }
 
-var _ vfs.DynamicBytesSource = (*taskStatData)(nil)
+func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
+	dir := &kernfs.StaticDirectory{}
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.t))
-	fmt.Fprintf(buf, "(%s) ", s.t.Name())
-	fmt.Fprintf(buf, "%c ", s.t.StateStatus()[0])
-	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
-		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
-	}
-	fmt.Fprintf(buf, "%d ", ppid)
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.t.ThreadGroup().ProcessGroup()))
-	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.t.ThreadGroup().Session()))
-	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
-	fmt.Fprintf(buf, "0 " /* flags */)
-	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
-	var cputime usage.CPUStats
-	if s.tgstats {
-		cputime = s.t.ThreadGroup().CPUStats()
-	} else {
-		cputime = s.t.CPUStats()
-	}
-	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	cputime = s.t.ThreadGroup().JoinedChildCPUStats()
-	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
-	fmt.Fprintf(buf, "%d %d ", s.t.Priority(), s.t.Niceness())
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Count())
-
-	// itrealvalue. Since kernel 2.6.17, this field is no longer
-	// maintained, and is hard coded as 0.
-	fmt.Fprintf(buf, "0 ")
-
-	// Start time is relative to boot time, expressed in clock ticks.
-	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.t.StartTime().Sub(s.t.Kernel().Timekeeper().BootTime())))
-
-	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-		}
-	})
-	fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
-
-	// rsslim.
-	fmt.Fprintf(buf, "%d ", s.t.ThreadGroup().Limits().Get(limits.Rss).Cur)
-
-	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
-	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
-	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
-	terminationSignal := linux.Signal(0)
-	if s.t == s.t.ThreadGroup().Leader() {
-		terminationSignal = s.t.ThreadGroup().TerminationSignal()
-	}
-	fmt.Fprintf(buf, "%d ", terminationSignal)
-	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
-	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
-	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
-	fmt.Fprintf(buf, "0\n" /* exit_code */)
+	// Note: credentials are overridden by taskOwnedInode.
+	dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
 
-	return nil
-}
+	inode := &taskOwnedInode{Inode: dir, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(inode)
 
-// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
-//
-// +stateify savable
-type statmData struct {
-	t *kernel.Task
-}
+	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := dir.OrderedChildren.Populate(d, children)
+	dir.IncLinks(links)
 
-var _ vfs.DynamicBytesSource = (*statmData)(nil)
+	return d
+}
 
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	var vss, rss uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
+// Stat implements kernfs.Inode.
+func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.Inode.Stat(ctx, fs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	if opts.Mask&(linux.STATX_UID|linux.STATX_GID) != 0 {
+		uid, gid := i.getOwner(linux.FileMode(stat.Mode))
+		if opts.Mask&linux.STATX_UID != 0 {
+			stat.UID = uint32(uid)
 		}
-	})
-
-	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
-	return nil
+		if opts.Mask&linux.STATX_GID != 0 {
+			stat.GID = uint32(gid)
+		}
+	}
+	return stat, nil
 }
 
-// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
-//
-// +stateify savable
-type statusData struct {
-	t     *kernel.Task
-	pidns *kernel.PIDNamespace
+// CheckPermissions implements kernfs.Inode.
+func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := i.Mode()
+	uid, gid := i.getOwner(mode)
+	return vfs.GenericCheckPermissions(creds, ats, mode, uid, gid)
 }
 
-var _ vfs.DynamicBytesSource = (*statusData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	fmt.Fprintf(buf, "Name:\t%s\n", s.t.Name())
-	fmt.Fprintf(buf, "State:\t%s\n", s.t.StateStatus())
-	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.t.ThreadGroup()))
-	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.t))
-	ppid := kernel.ThreadID(0)
-	if parent := s.t.Parent(); parent != nil {
-		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+func (i *taskOwnedInode) getOwner(mode linux.FileMode) (auth.KUID, auth.KGID) {
+	// By default, set the task owner as the file owner.
+	creds := i.owner.Credentials()
+	uid := creds.EffectiveKUID
+	gid := creds.EffectiveKGID
+
+	// Linux doesn't apply dumpability adjustments to world readable/executable
+	// directories so that applications can stat /proc/PID to determine the
+	// effective UID of a process. See fs/proc/base.c:task_dump_owner.
+	if mode.FileType() == linux.ModeDirectory && mode.Permissions() == 0555 {
+		return uid, gid
 	}
-	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
-	tpid := kernel.ThreadID(0)
-	if tracer := s.t.Tracer(); tracer != nil {
-		tpid = s.pidns.IDOfTask(tracer)
+
+	// If the task is not dumpable, then root (in the namespace preferred)
+	// owns the file.
+	m := getMM(i.owner)
+	if m == nil {
+		return auth.RootKUID, auth.RootKGID
 	}
-	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
-	var fds int
-	var vss, rss, data uint64
-	s.t.WithMuLocked(func(t *kernel.Task) {
-		if fdTable := t.FDTable(); fdTable != nil {
-			fds = fdTable.Size()
+	if m.Dumpability() != mm.UserDumpable {
+		uid = auth.RootKUID
+		if kuid := creds.UserNamespace.MapToKUID(auth.RootUID); kuid.Ok() {
+			uid = kuid
 		}
-		if mm := t.MemoryManager(); mm != nil {
-			vss = mm.VirtualMemorySize()
-			rss = mm.ResidentSetSize()
-			data = mm.VirtualDataSize()
+		gid = auth.RootKGID
+		if kgid := creds.UserNamespace.MapToKGID(auth.RootGID); kgid.Ok() {
+			gid = kgid
 		}
-	})
-	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
-	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
-	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
-	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
-	fmt.Fprintf(buf, "Threads:\t%d\n", s.t.ThreadGroup().Count())
-	creds := s.t.Credentials()
-	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
-	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
-	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
-	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
-	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.t.SeccompMode())
-	return nil
-}
-
-// ioUsage is the /proc/<pid>/io and /proc/<pid>/task/<tid>/io data provider.
-type ioUsage interface {
-	// IOUsage returns the io usage data.
-	IOUsage() *usage.IO
+	}
+	return uid, gid
 }
 
-// +stateify savable
-type ioData struct {
-	ioUsage
+func newIO(t *kernel.Task, isThreadGroup bool) *ioData {
+	if isThreadGroup {
+		return &ioData{ioUsage: t.ThreadGroup()}
+	}
+	return &ioData{ioUsage: t}
 }
 
-var _ vfs.DynamicBytesSource = (*ioData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	io := usage.IO{}
-	io.Accumulate(i.IOUsage())
-
-	fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
-	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
-	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
-	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
-	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
-	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
-	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
-	return nil
+// newCgroupData creates inode that shows cgroup information.
+// From man 7 cgroups: "For each cgroup hierarchy of which the process is a
+// member, there is one entry containing three colon-separated fields:
+//   hierarchy-ID:controller-list:cgroup-path"
+func newCgroupData(controllers map[string]string) dynamicInode {
+	var buf bytes.Buffer
+
+	// The hierarchy ids must be positive integers (for cgroup v1), but the
+	// exact number does not matter, so long as they are unique. We can
+	// just use a counter, but since linux sorts this file in descending
+	// order, we must count down to preserve this behavior.
+	i := len(controllers)
+	for name, dir := range controllers {
+		fmt.Fprintf(&buf, "%d:%s:%s\n", i, name, dir)
+		i--
+	}
+	return newStaticFile(buf.String())
 }
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
new file mode 100644
index 000000000..f0d3f7f5e
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -0,0 +1,307 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+func getTaskFD(t *kernel.Task, fd int32) (*vfs.FileDescription, kernel.FDFlags) {
+	var (
+		file  *vfs.FileDescription
+		flags kernel.FDFlags
+	)
+	t.WithMuLocked(func(t *kernel.Task) {
+		if fdt := t.FDTable(); fdt != nil {
+			file, flags = fdt.GetVFS2(fd)
+		}
+	})
+	return file, flags
+}
+
+func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
+	file, _ := getTaskFD(t, fd)
+	if file == nil {
+		return false
+	}
+	file.DecRef(ctx)
+	return true
+}
+
+type fdDir struct {
+	locks vfs.FileLocks
+
+	fs   *filesystem
+	task *kernel.Task
+
+	// When produceSymlinks is set, dirents produces for the FDs are reported
+	// as symlink. Otherwise, they are reported as regular files.
+	produceSymlink bool
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	var fds []int32
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			fds = fdTable.GetFDs(ctx)
+		}
+	})
+
+	typ := uint8(linux.DT_REG)
+	if i.produceSymlink {
+		typ = linux.DT_LNK
+	}
+
+	// Find the appropriate starting point.
+	idx := sort.Search(len(fds), func(i int) bool { return fds[i] >= int32(relOffset) })
+	if idx >= len(fds) {
+		return offset, nil
+	}
+	for _, fd := range fds[idx:] {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(fd), 10),
+			Type:    typ,
+			Ino:     i.fs.NextIno(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return offset, nil
+}
+
+// fdDirInode represents the inode for /proc/[pid]/fd directory.
+//
+// +stateify savable
+type fdDirInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+	fdDir
+}
+
+var _ kernfs.Inode = (*fdDirInode)(nil)
+
+func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
+	inode := &fdDirInode{
+		fdDir: fdDir{
+			fs:             fs,
+			task:           task,
+			produceSymlink: true,
+		},
+	}
+	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	return dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	fdInt, err := strconv.ParseInt(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+	fd := int32(fdInt)
+	if !taskFDExists(ctx, i.task, fd) {
+		return nil, syserror.ENOENT
+	}
+	taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno())
+	return taskDentry.VFSDentry(), nil
+}
+
+// Open implements kernfs.Inode.
+func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// CheckPermissions implements kernfs.Inode.
+//
+// This is to match Linux, which uses a special permission handler to guarantee
+// that a process can still access /proc/self/fd after it has executed
+// setuid. See fs/proc/fd.c:proc_fd_permission.
+func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	err := i.InodeAttrs.CheckPermissions(ctx, creds, ats)
+	if err == nil {
+		// Access granted, no extra check needed.
+		return nil
+	}
+	if t := kernel.TaskFromContext(ctx); t != nil {
+		// Allow access if the task trying to access it is in the thread group
+		// corresponding to this directory.
+		if i.task.ThreadGroup() == t.ThreadGroup() {
+			// Access granted (overridden).
+			return nil
+		}
+	}
+	return err
+}
+
+// fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
+//
+// +stateify savable
+type fdSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+	fd   int32
+}
+
+var _ kernfs.Inode = (*fdSymlink)(nil)
+
+func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+	inode := &fdSymlink{
+		task: task,
+		fd:   fd,
+	}
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+	file, _ := getTaskFD(s.task, s.fd)
+	if file == nil {
+		return "", syserror.ENOENT
+	}
+	defer file.DecRef(ctx)
+	root := vfs.RootFromContext(ctx)
+	defer root.DecRef(ctx)
+	return s.task.Kernel().VFS().PathnameWithDeleted(ctx, root, file.VirtualDentry())
+}
+
+func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	file, _ := getTaskFD(s.task, s.fd)
+	if file == nil {
+		return vfs.VirtualDentry{}, "", syserror.ENOENT
+	}
+	defer file.DecRef(ctx)
+	vd := file.VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
+// fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
+//
+// +stateify savable
+type fdInfoDirInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+	fdDir
+}
+
+var _ kernfs.Inode = (*fdInfoDirInode)(nil)
+
+func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
+	inode := &fdInfoDirInode{
+		fdDir: fdDir{
+			fs:   fs,
+			task: task,
+		},
+	}
+	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+
+	return dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	fdInt, err := strconv.ParseInt(name, 10, 32)
+	if err != nil {
+		return nil, syserror.ENOENT
+	}
+	fd := int32(fdInt)
+	if !taskFDExists(ctx, i.task, fd) {
+		return nil, syserror.ENOENT
+	}
+	data := &fdInfoData{
+		task: i.task,
+		fd:   fd,
+	}
+	dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data)
+	return dentry.VFSDentry(), nil
+}
+
+// Open implements kernfs.Inode.
+func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
+//
+// +stateify savable
+type fdInfoData struct {
+	kernfs.DynamicBytesFile
+	refs.AtomicRefCount
+
+	task *kernel.Task
+	fd   int32
+}
+
+var _ dynamicInode = (*fdInfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	file, descriptorFlags := getTaskFD(d.task, d.fd)
+	if file == nil {
+		return syserror.ENOENT
+	}
+	defer file.DecRef(ctx)
+	// TODO(b/121266871): Include pos, locks, and other data. For now we only
+	// have flags.
+	// See https://www.kernel.org/doc/Documentation/filesystems/proc.txt
+	flags := uint(file.StatusFlags()) | descriptorFlags.ToLinuxFileFlags()
+	fmt.Fprintf(buf, "flags:\t0%o\n", flags)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
new file mode 100644
index 000000000..830b78949
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -0,0 +1,902 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// "There is an (arbitrary) limit on the number of lines in the file. As at
+// Linux 3.18, the limit is five lines." - user_namespaces(7)
+const maxIDMapLines = 5
+
+// mm gets the kernel task's MemoryManager. No additional reference is taken on
+// mm here. This is safe because MemoryManager.destroy is required to leave the
+// MemoryManager in a state where it's still usable as a DynamicBytesSource.
+func getMM(task *kernel.Task) *mm.MemoryManager {
+	var tmm *mm.MemoryManager
+	task.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			tmm = mm
+		}
+	})
+	return tmm
+}
+
+// getMMIncRef returns t's MemoryManager. If getMMIncRef succeeds, the
+// MemoryManager's users count is incremented, and must be decremented by the
+// caller when it is no longer in use.
+func getMMIncRef(task *kernel.Task) (*mm.MemoryManager, error) {
+	if task.ExitState() == kernel.TaskExitDead {
+		return nil, syserror.ESRCH
+	}
+	var m *mm.MemoryManager
+	task.WithMuLocked(func(t *kernel.Task) {
+		m = t.MemoryManager()
+	})
+	if m == nil || !m.IncUsers() {
+		return nil, io.EOF
+	}
+	return m, nil
+}
+
+func checkTaskState(t *kernel.Task) error {
+	switch t.ExitState() {
+	case kernel.TaskExitZombie:
+		return syserror.EACCES
+	case kernel.TaskExitDead:
+		return syserror.ESRCH
+	}
+	return nil
+}
+
+type bufferWriter struct {
+	buf *bytes.Buffer
+}
+
+// WriteFromBlocks writes up to srcs.NumBytes() bytes from srcs and returns
+// the number of bytes written. It may return a partial write without an
+// error (i.e. (n, nil) where 0 < n < srcs.NumBytes()). It should not
+// return a full write with an error (i.e. srcs.NumBytes(), err) where err
+// != nil).
+func (w *bufferWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	written := srcs.NumBytes()
+	for !srcs.IsEmpty() {
+		w.buf.Write(srcs.Head().ToSlice())
+		srcs = srcs.Tail()
+	}
+	return written, nil
+}
+
+// auxvData implements vfs.DynamicBytesSource for /proc/[pid]/auxv.
+//
+// +stateify savable
+type auxvData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*auxvData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *auxvData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	m, err := getMMIncRef(d.task)
+	if err != nil {
+		return err
+	}
+	defer m.DecUsers(ctx)
+
+	auxv := m.Auxv()
+	// Space for buffer with AT_NULL (0) terminator at the end.
+	buf.Grow((len(auxv) + 1) * 16)
+	for _, e := range auxv {
+		var tmp [16]byte
+		usermem.ByteOrder.PutUint64(tmp[:8], e.Key)
+		usermem.ByteOrder.PutUint64(tmp[8:], uint64(e.Value))
+		buf.Write(tmp[:])
+	}
+	var atNull [16]byte
+	buf.Write(atNull[:])
+
+	return nil
+}
+
+// execArgType enumerates the types of exec arguments that are exposed through
+// proc.
+type execArgType int
+
+const (
+	cmdlineDataArg execArgType = iota
+	environDataArg
+)
+
+// cmdlineData implements vfs.DynamicBytesSource for /proc/[pid]/cmdline.
+//
+// +stateify savable
+type cmdlineData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+
+	// arg is the type of exec argument this file contains.
+	arg execArgType
+}
+
+var _ dynamicInode = (*cmdlineData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *cmdlineData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	m, err := getMMIncRef(d.task)
+	if err != nil {
+		return err
+	}
+	defer m.DecUsers(ctx)
+
+	// Figure out the bounds of the exec arg we are trying to read.
+	var ar usermem.AddrRange
+	switch d.arg {
+	case cmdlineDataArg:
+		ar = usermem.AddrRange{
+			Start: m.ArgvStart(),
+			End:   m.ArgvEnd(),
+		}
+	case environDataArg:
+		ar = usermem.AddrRange{
+			Start: m.EnvvStart(),
+			End:   m.EnvvEnd(),
+		}
+	default:
+		panic(fmt.Sprintf("unknown exec arg type %v", d.arg))
+	}
+	if ar.Start == 0 || ar.End == 0 {
+		// Don't attempt to read before the start/end are set up.
+		return io.EOF
+	}
+
+	// N.B. Technically this should be usermem.IOOpts.IgnorePermissions = true
+	// until Linux 4.9 (272ddc8b3735 "proc: don't use FOLL_FORCE for reading
+	// cmdline and environment").
+	writer := &bufferWriter{buf: buf}
+	if n, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(ar), writer, usermem.IOOpts{}); n == 0 || err != nil {
+		// Nothing to copy or something went wrong.
+		return err
+	}
+
+	// On Linux, if the NULL byte at the end of the argument vector has been
+	// overwritten, it continues reading the environment vector as part of
+	// the argument vector.
+	if d.arg == cmdlineDataArg && buf.Bytes()[buf.Len()-1] != 0 {
+		if end := bytes.IndexByte(buf.Bytes(), 0); end != -1 {
+			// If we found a NULL character somewhere else in argv, truncate the
+			// return up to the NULL terminator (including it).
+			buf.Truncate(end)
+			return nil
+		}
+
+		// There is no NULL terminator in the string, return into envp.
+		arEnvv := usermem.AddrRange{
+			Start: m.EnvvStart(),
+			End:   m.EnvvEnd(),
+		}
+
+		// Upstream limits the returned amount to one page of slop.
+		// https://elixir.bootlin.com/linux/v4.20/source/fs/proc/base.c#L208
+		// we'll return one page total between argv and envp because of the
+		// above page restrictions.
+		if buf.Len() >= usermem.PageSize {
+			// Returned at least one page already, nothing else to add.
+			return nil
+		}
+		remaining := usermem.PageSize - buf.Len()
+		if int(arEnvv.Length()) > remaining {
+			end, ok := arEnvv.Start.AddLength(uint64(remaining))
+			if !ok {
+				return syserror.EFAULT
+			}
+			arEnvv.End = end
+		}
+		if _, err := m.CopyInTo(ctx, usermem.AddrRangeSeqOf(arEnvv), writer, usermem.IOOpts{}); err != nil {
+			return err
+		}
+
+		// Linux will return envp up to and including the first NULL character,
+		// so find it.
+		envStart := int(ar.Length())
+		if nullIdx := bytes.IndexByte(buf.Bytes()[envStart:], 0); nullIdx != -1 {
+			buf.Truncate(envStart + nullIdx)
+		}
+	}
+
+	return nil
+}
+
+// +stateify savable
+type commInode struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
+	inode := &commInode{task: task}
+	inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	// This file can always be read or written by members of the same thread
+	// group. See fs/proc/base.c:proc_tid_comm_permission.
+	//
+	// N.B. This check is currently a no-op as we don't yet support writing and
+	// this file is world-readable anyways.
+	t := kernel.TaskFromContext(ctx)
+	if t != nil && t.ThreadGroup() == i.task.ThreadGroup() && !ats.MayExec() {
+		return nil
+	}
+
+	return i.DynamicBytesFile.CheckPermissions(ctx, creds, ats)
+}
+
+// commData implements vfs.DynamicBytesSource for /proc/[pid]/comm.
+//
+// +stateify savable
+type commData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*commData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *commData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString(d.task.Name())
+	buf.WriteString("\n")
+	return nil
+}
+
+// idMapData implements vfs.WritableDynamicBytesSource for
+// /proc/[pid]/{gid_map|uid_map}.
+//
+// +stateify savable
+type idMapData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+	gids bool
+}
+
+var _ dynamicInode = (*idMapData)(nil)
+
+// Generate implements vfs.WritableDynamicBytesSource.Generate.
+func (d *idMapData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var entries []auth.IDMapEntry
+	if d.gids {
+		entries = d.task.UserNamespace().GIDMap()
+	} else {
+		entries = d.task.UserNamespace().UIDMap()
+	}
+	for _, e := range entries {
+		fmt.Fprintf(buf, "%10d %10d %10d\n", e.FirstID, e.FirstParentID, e.Length)
+	}
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (d *idMapData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	// "In addition, the number of bytes written to the file must be less than
+	// the system page size, and the write must be performed at the start of
+	// the file ..." - user_namespaces(7)
+	srclen := src.NumBytes()
+	if srclen >= usermem.PageSize || offset != 0 {
+		return 0, syserror.EINVAL
+	}
+	b := make([]byte, srclen)
+	if _, err := src.CopyIn(ctx, b); err != nil {
+		return 0, err
+	}
+
+	// Truncate from the first NULL byte.
+	var nul int64
+	nul = int64(bytes.IndexByte(b, 0))
+	if nul == -1 {
+		nul = srclen
+	}
+	b = b[:nul]
+	// Remove the last \n.
+	if nul >= 1 && b[nul-1] == '\n' {
+		b = b[:nul-1]
+	}
+	lines := bytes.SplitN(b, []byte("\n"), maxIDMapLines+1)
+	if len(lines) > maxIDMapLines {
+		return 0, syserror.EINVAL
+	}
+
+	entries := make([]auth.IDMapEntry, len(lines))
+	for i, l := range lines {
+		var e auth.IDMapEntry
+		_, err := fmt.Sscan(string(l), &e.FirstID, &e.FirstParentID, &e.Length)
+		if err != nil {
+			return 0, syserror.EINVAL
+		}
+		entries[i] = e
+	}
+	var err error
+	if d.gids {
+		err = d.task.UserNamespace().SetGIDMap(ctx, entries)
+	} else {
+		err = d.task.UserNamespace().SetUIDMap(ctx, entries)
+	}
+	if err != nil {
+		return 0, err
+	}
+
+	// On success, Linux's kernel/user_namespace.c:map_write() always returns
+	// count, even if fewer bytes were used.
+	return int64(srclen), nil
+}
+
+// mapsData implements vfs.DynamicBytesSource for /proc/[pid]/maps.
+//
+// +stateify savable
+type mapsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := getMM(d.task); mm != nil {
+		mm.ReadMapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// smapsData implements vfs.DynamicBytesSource for /proc/[pid]/smaps.
+//
+// +stateify savable
+type smapsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*smapsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *smapsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if mm := getMM(d.task); mm != nil {
+		mm.ReadSmapsDataInto(ctx, buf)
+	}
+	return nil
+}
+
+// +stateify savable
+type taskStatData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+
+	// If tgstats is true, accumulate fault stats (not implemented) and CPU
+	// time across all tasks in t's thread group.
+	tgstats bool
+
+	// pidns is the PID namespace associated with the proc filesystem that
+	// includes the file using this statData.
+	pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*taskStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *taskStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfTask(s.task))
+	fmt.Fprintf(buf, "(%s) ", s.task.Name())
+	fmt.Fprintf(buf, "%c ", s.task.StateStatus()[0])
+	ppid := kernel.ThreadID(0)
+	if parent := s.task.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "%d ", ppid)
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfProcessGroup(s.task.ThreadGroup().ProcessGroup()))
+	fmt.Fprintf(buf, "%d ", s.pidns.IDOfSession(s.task.ThreadGroup().Session()))
+	fmt.Fprintf(buf, "0 0 " /* tty_nr tpgid */)
+	fmt.Fprintf(buf, "0 " /* flags */)
+	fmt.Fprintf(buf, "0 0 0 0 " /* minflt cminflt majflt cmajflt */)
+	var cputime usage.CPUStats
+	if s.tgstats {
+		cputime = s.task.ThreadGroup().CPUStats()
+	} else {
+		cputime = s.task.CPUStats()
+	}
+	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	cputime = s.task.ThreadGroup().JoinedChildCPUStats()
+	fmt.Fprintf(buf, "%d %d ", linux.ClockTFromDuration(cputime.UserTime), linux.ClockTFromDuration(cputime.SysTime))
+	fmt.Fprintf(buf, "%d %d ", s.task.Priority(), s.task.Niceness())
+	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Count())
+
+	// itrealvalue. Since kernel 2.6.17, this field is no longer
+	// maintained, and is hard coded as 0.
+	fmt.Fprintf(buf, "0 ")
+
+	// Start time is relative to boot time, expressed in clock ticks.
+	fmt.Fprintf(buf, "%d ", linux.ClockTFromDuration(s.task.StartTime().Sub(s.task.Kernel().Timekeeper().BootTime())))
+
+	var vss, rss uint64
+	s.task.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+	fmt.Fprintf(buf, "%d %d ", vss, rss/usermem.PageSize)
+
+	// rsslim.
+	fmt.Fprintf(buf, "%d ", s.task.ThreadGroup().Limits().Get(limits.Rss).Cur)
+
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* startcode endcode startstack kstkesp kstkeip */)
+	fmt.Fprintf(buf, "0 0 0 0 0 " /* signal blocked sigignore sigcatch wchan */)
+	fmt.Fprintf(buf, "0 0 " /* nswap cnswap */)
+	terminationSignal := linux.Signal(0)
+	if s.task == s.task.ThreadGroup().Leader() {
+		terminationSignal = s.task.ThreadGroup().TerminationSignal()
+	}
+	fmt.Fprintf(buf, "%d ", terminationSignal)
+	fmt.Fprintf(buf, "0 0 0 " /* processor rt_priority policy */)
+	fmt.Fprintf(buf, "0 0 0 " /* delayacct_blkio_ticks guest_time cguest_time */)
+	fmt.Fprintf(buf, "0 0 0 0 0 0 0 " /* start_data end_data start_brk arg_start arg_end env_start env_end */)
+	fmt.Fprintf(buf, "0\n" /* exit_code */)
+
+	return nil
+}
+
+// statmData implements vfs.DynamicBytesSource for /proc/[pid]/statm.
+//
+// +stateify savable
+type statmData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*statmData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statmData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var vss, rss uint64
+	s.task.WithMuLocked(func(t *kernel.Task) {
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+		}
+	})
+
+	fmt.Fprintf(buf, "%d %d 0 0 0 0 0\n", vss/usermem.PageSize, rss/usermem.PageSize)
+	return nil
+}
+
+// statusData implements vfs.DynamicBytesSource for /proc/[pid]/status.
+//
+// +stateify savable
+type statusData struct {
+	kernfs.DynamicBytesFile
+
+	task  *kernel.Task
+	pidns *kernel.PIDNamespace
+}
+
+var _ dynamicInode = (*statusData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "Name:\t%s\n", s.task.Name())
+	fmt.Fprintf(buf, "State:\t%s\n", s.task.StateStatus())
+	fmt.Fprintf(buf, "Tgid:\t%d\n", s.pidns.IDOfThreadGroup(s.task.ThreadGroup()))
+	fmt.Fprintf(buf, "Pid:\t%d\n", s.pidns.IDOfTask(s.task))
+	ppid := kernel.ThreadID(0)
+	if parent := s.task.Parent(); parent != nil {
+		ppid = s.pidns.IDOfThreadGroup(parent.ThreadGroup())
+	}
+	fmt.Fprintf(buf, "PPid:\t%d\n", ppid)
+	tpid := kernel.ThreadID(0)
+	if tracer := s.task.Tracer(); tracer != nil {
+		tpid = s.pidns.IDOfTask(tracer)
+	}
+	fmt.Fprintf(buf, "TracerPid:\t%d\n", tpid)
+	var fds int
+	var vss, rss, data uint64
+	s.task.WithMuLocked(func(t *kernel.Task) {
+		if fdTable := t.FDTable(); fdTable != nil {
+			fds = fdTable.Size()
+		}
+		if mm := t.MemoryManager(); mm != nil {
+			vss = mm.VirtualMemorySize()
+			rss = mm.ResidentSetSize()
+			data = mm.VirtualDataSize()
+		}
+	})
+	fmt.Fprintf(buf, "FDSize:\t%d\n", fds)
+	fmt.Fprintf(buf, "VmSize:\t%d kB\n", vss>>10)
+	fmt.Fprintf(buf, "VmRSS:\t%d kB\n", rss>>10)
+	fmt.Fprintf(buf, "VmData:\t%d kB\n", data>>10)
+	fmt.Fprintf(buf, "Threads:\t%d\n", s.task.ThreadGroup().Count())
+	creds := s.task.Credentials()
+	fmt.Fprintf(buf, "CapInh:\t%016x\n", creds.InheritableCaps)
+	fmt.Fprintf(buf, "CapPrm:\t%016x\n", creds.PermittedCaps)
+	fmt.Fprintf(buf, "CapEff:\t%016x\n", creds.EffectiveCaps)
+	fmt.Fprintf(buf, "CapBnd:\t%016x\n", creds.BoundingCaps)
+	fmt.Fprintf(buf, "Seccomp:\t%d\n", s.task.SeccompMode())
+	// We unconditionally report a single NUMA node. See
+	// pkg/sentry/syscalls/linux/sys_mempolicy.go.
+	fmt.Fprintf(buf, "Mems_allowed:\t1\n")
+	fmt.Fprintf(buf, "Mems_allowed_list:\t0\n")
+	return nil
+}
+
+// ioUsage is the /proc/[pid]/io and /proc/[pid]/task/[tid]/io data provider.
+type ioUsage interface {
+	// IOUsage returns the io usage data.
+	IOUsage() *usage.IO
+}
+
+// +stateify savable
+type ioData struct {
+	kernfs.DynamicBytesFile
+
+	ioUsage
+}
+
+var _ dynamicInode = (*ioData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *ioData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	io := usage.IO{}
+	io.Accumulate(i.IOUsage())
+
+	fmt.Fprintf(buf, "char: %d\n", io.CharsRead)
+	fmt.Fprintf(buf, "wchar: %d\n", io.CharsWritten)
+	fmt.Fprintf(buf, "syscr: %d\n", io.ReadSyscalls)
+	fmt.Fprintf(buf, "syscw: %d\n", io.WriteSyscalls)
+	fmt.Fprintf(buf, "read_bytes: %d\n", io.BytesRead)
+	fmt.Fprintf(buf, "write_bytes: %d\n", io.BytesWritten)
+	fmt.Fprintf(buf, "cancelled_write_bytes: %d\n", io.BytesWriteCancelled)
+	return nil
+}
+
+// oomScoreAdj is a stub of the /proc/<pid>/oom_score_adj file.
+//
+// +stateify savable
+type oomScoreAdj struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ vfs.WritableDynamicBytesSource = (*oomScoreAdj)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (o *oomScoreAdj) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if o.task.ExitState() == kernel.TaskExitDead {
+		return syserror.ESRCH
+	}
+	fmt.Fprintf(buf, "%d\n", o.task.OOMScoreAdj())
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+
+	if o.task.ExitState() == kernel.TaskExitDead {
+		return 0, syserror.ESRCH
+	}
+	if err := o.task.SetOOMScoreAdj(v); err != nil {
+		return 0, err
+	}
+
+	return n, nil
+}
+
+// exeSymlink is an symlink for the /proc/[pid]/exe file.
+//
+// +stateify savable
+type exeSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+}
+
+var _ kernfs.Inode = (*exeSymlink)(nil)
+
+func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+	inode := &exeSymlink{task: task}
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+// Readlink implements kernfs.Inode.
+func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return "", syserror.EACCES
+	}
+
+	// Pull out the executable for /proc/[pid]/exe.
+	exec, err := s.executable()
+	if err != nil {
+		return "", err
+	}
+	defer exec.DecRef(ctx)
+
+	return exec.PathnameWithDeleted(ctx), nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return vfs.VirtualDentry{}, "", syserror.EACCES
+	}
+
+	exec, err := s.executable()
+	if err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	defer exec.DecRef(ctx)
+
+	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
+func (s *exeSymlink) executable() (file fsbridge.File, err error) {
+	if err := checkTaskState(s.task); err != nil {
+		return nil, err
+	}
+
+	s.task.WithMuLocked(func(t *kernel.Task) {
+		mm := t.MemoryManager()
+		if mm == nil {
+			err = syserror.EACCES
+			return
+		}
+
+		// The MemoryManager may be destroyed, in which case
+		// MemoryManager.destroy will simply set the executable to nil
+		// (with locks held).
+		file = mm.Executable()
+		if file == nil {
+			err = syserror.ESRCH
+		}
+	})
+	return
+}
+
+// mountInfoData is used to implement /proc/[pid]/mountinfo.
+//
+// +stateify savable
+type mountInfoData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mountInfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
+	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
+	}
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
+	}
+	defer rootDir.DecRef(ctx)
+	i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
+	return nil
+}
+
+// mountsData is used to implement /proc/[pid]/mounts.
+//
+// +stateify savable
+type mountsData struct {
+	kernfs.DynamicBytesFile
+
+	task *kernel.Task
+}
+
+var _ dynamicInode = (*mountsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
+	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
+	}
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
+	}
+	defer rootDir.DecRef(ctx)
+	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
+	return nil
+}
+
+type namespaceSymlink struct {
+	kernfs.StaticSymlink
+
+	task *kernel.Task
+}
+
+func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+	// Namespace symlinks should contain the namespace name and the inode number
+	// for the namespace instance, so for example user:[123456]. We currently fake
+	// the inode number by sticking the symlink inode in its place.
+	target := fmt.Sprintf("%s:[%d]", ns, ino)
+
+	inode := &namespaceSymlink{task: task}
+	// Note: credentials are overridden by taskOwnedInode.
+	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
+
+	taskInode := &taskOwnedInode{Inode: inode, owner: task}
+	d := &kernfs.Dentry{}
+	d.Init(taskInode)
+	return d
+}
+
+// Readlink implements Inode.
+func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return "", err
+	}
+	return s.StaticSymlink.Readlink(ctx)
+}
+
+// Getlink implements Inode.Getlink.
+func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	if err := checkTaskState(s.task); err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+
+	// Create a synthetic inode to represent the namespace.
+	dentry := &kernfs.Dentry{}
+	dentry.Init(&namespaceInode{})
+	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
+	vd.IncRef()
+	dentry.DecRef(ctx)
+	return vd, "", nil
+}
+
+// namespaceInode is a synthetic inode created to represent a namespace in
+// /proc/[pid]/ns/*.
+type namespaceInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
+}
+
+var _ kernfs.Inode = (*namespaceInode)(nil)
+
+// Init initializes a namespace inode.
+func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
+	}
+	i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+}
+
+// Open implements Inode.Open.
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &namespaceFD{inode: i}
+	i.IncRef()
+	fd.LockFD.Init(&i.locks)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// namespace FD is a synthetic file that represents a namespace in
+// /proc/[pid]/ns/*.
+type namespaceFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	vfsfd vfs.FileDescription
+	inode *namespaceInode
+}
+
+var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
+
+// Stat implements FileDescriptionImpl.
+func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.Stat(ctx, vfs, opts)
+}
+
+// SetStat implements FileDescriptionImpl.
+func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	return fd.inode.SetStat(ctx, vfs, creds, opts)
+}
+
+// Release implements FileDescriptionImpl.
+func (fd *namespaceFD) Release(ctx context.Context) {
+	fd.inode.DecRef(ctx)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *namespaceFD) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *namespaceFD) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
new file mode 100644
index 000000000..a4c884bf9
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -0,0 +1,810 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"reflect"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/header"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
+	k := task.Kernel()
+	pidns := task.PIDNamespace()
+	root := auth.NewRootCredentials(pidns.UserNamespace())
+
+	var contents map[string]*kernfs.Dentry
+	if stack := task.NetworkNamespace().Stack(); stack != nil {
+		const (
+			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
+			netlink   = "sk       Eth Pid    Groups   Rmem     Wmem     Dump     Locks     Drops     Inode\n"
+			packet    = "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n"
+			protocols = "protocol  size sockets  memory press maxhdr  slab module     cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"
+			ptype     = "Type Device      Function\n"
+			upd6      = "  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n"
+		)
+		psched := fmt.Sprintf("%08x %08x %08x %08x\n", uint64(time.Microsecond/time.Nanosecond), 64, 1000000, uint64(time.Second/time.Nanosecond))
+
+		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
+		// network namespace.
+		contents = map[string]*kernfs.Dentry{
+			"dev":  fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}),
+			"snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}),
+
+			// The following files are simple stubs until they are implemented in
+			// netstack, if the file contains a header the stub is just the header
+			// otherwise it is an empty file.
+			"arp":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)),
+			"netlink":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)),
+			"netstat":   fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}),
+			"packet":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)),
+			"protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)),
+
+			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
+			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
+			"psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)),
+			"ptype":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)),
+			"route":  fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}),
+			"tcp":    fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}),
+			"udp":    fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}),
+			"unix":   fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}),
+		}
+
+		if stack.SupportsIPv6() {
+			contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack})
+			contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(""))
+			contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k})
+			contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6))
+		}
+	}
+
+	return fs.newTaskOwnedDir(task, fs.NextIno(), 0555, contents)
+}
+
+// ifinet6 implements vfs.DynamicBytesSource for /proc/net/if_inet6.
+//
+// +stateify savable
+type ifinet6 struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*ifinet6)(nil)
+
+func (n *ifinet6) contents() []string {
+	var lines []string
+	nics := n.stack.Interfaces()
+	for id, naddrs := range n.stack.InterfaceAddrs() {
+		nic, ok := nics[id]
+		if !ok {
+			// NIC was added after NICNames was called. We'll just ignore it.
+			continue
+		}
+
+		for _, a := range naddrs {
+			// IPv6 only.
+			if a.Family != linux.AF_INET6 {
+				continue
+			}
+
+			// Fields:
+			// IPv6 address displayed in 32 hexadecimal chars without colons
+			// Netlink device number (interface index) in hexadecimal (use nic id)
+			// Prefix length in hexadecimal
+			// Scope value (use 0)
+			// Interface flags
+			// Device name
+			lines = append(lines, fmt.Sprintf("%032x %02x %02x %02x %02x %8s\n", a.Addr, id, a.PrefixLen, 0, a.Flags, nic.Name))
+		}
+	}
+	return lines
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *ifinet6) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	for _, l := range n.contents() {
+		buf.WriteString(l)
+	}
+	return nil
+}
+
+// netDevData implements vfs.DynamicBytesSource for /proc/net/dev.
+//
+// +stateify savable
+type netDevData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netDevData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netDevData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	interfaces := n.stack.Interfaces()
+	buf.WriteString("Inter-|   Receive                                                |  Transmit\n")
+	buf.WriteString(" face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed\n")
+
+	for _, i := range interfaces {
+		// Implements the same format as
+		// net/core/net-procfs.c:dev_seq_printf_stats.
+		var stats inet.StatDev
+		if err := n.stack.Statistics(&stats, i.Name); err != nil {
+			log.Warningf("Failed to retrieve interface statistics for %v: %v", i.Name, err)
+			continue
+		}
+		fmt.Fprintf(
+			buf,
+			"%6s: %7d %7d %4d %4d %4d %5d %10d %9d %8d %7d %4d %4d %4d %5d %7d %10d\n",
+			i.Name,
+			// Received
+			stats[0], // bytes
+			stats[1], // packets
+			stats[2], // errors
+			stats[3], // dropped
+			stats[4], // fifo
+			stats[5], // frame
+			stats[6], // compressed
+			stats[7], // multicast
+			// Transmitted
+			stats[8],  // bytes
+			stats[9],  // packets
+			stats[10], // errors
+			stats[11], // dropped
+			stats[12], // fifo
+			stats[13], // frame
+			stats[14], // compressed
+			stats[15], // multicast
+		)
+	}
+
+	return nil
+}
+
+// netUnixData implements vfs.DynamicBytesSource for /proc/net/unix.
+//
+// +stateify savable
+type netUnixData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netUnixData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("Num       RefCount Protocol Flags    Type St Inode Path\n")
+	for _, se := range n.kernel.ListSockets() {
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
+			continue
+		}
+		if family, _, _ := s.Impl().(socket.SocketVFS2).Type(); family != linux.AF_UNIX {
+			s.DecRef(ctx)
+			// Not a unix socket.
+			continue
+		}
+		sops := s.Impl().(*unix.SocketVFS2)
+
+		addr, err := sops.Endpoint().GetLocalAddress()
+		if err != nil {
+			log.Warningf("Failed to retrieve socket name from %+v: %v", s, err)
+			addr.Addr = "<unknown>"
+		}
+
+		sockFlags := 0
+		if ce, ok := sops.Endpoint().(transport.ConnectingEndpoint); ok {
+			if ce.Listening() {
+				// For unix domain sockets, linux reports a single flag
+				// value if the socket is listening, of __SO_ACCEPTCON.
+				sockFlags = linux.SO_ACCEPTCON
+			}
+		}
+
+		// Get inode number.
+		var ino uint64
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_INO})
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve ino for socket file: %v", statErr)
+		} else {
+			ino = stat.Ino
+		}
+
+		// In the socket entry below, the value for the 'Num' field requires
+		// some consideration. Linux prints the address to the struct
+		// unix_sock representing a socket in the kernel, but may redact the
+		// value for unprivileged users depending on the kptr_restrict
+		// sysctl.
+		//
+		// One use for this field is to allow a privileged user to
+		// introspect into the kernel memory to determine information about
+		// a socket not available through procfs, such as the socket's peer.
+		//
+		// In gvisor, returning a pointer to our internal structures would
+		// be pointless, as it wouldn't match the memory layout for struct
+		// unix_sock, making introspection difficult. We could populate a
+		// struct unix_sock with the appropriate data, but even that
+		// requires consideration for which kernel version to emulate, as
+		// the definition of this struct changes over time.
+		//
+		// For now, we always redact this pointer.
+		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
+			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
+			s.Refs()-1,                    // RefCount, don't count our own ref.
+			0,                             // Protocol, always 0 for UDS.
+			sockFlags,                     // Flags.
+			sops.Endpoint().Type(),        // Type.
+			sops.State(),                  // State.
+			ino,                           // Inode.
+		)
+
+		// Path
+		if len(addr.Addr) != 0 {
+			if addr.Addr[0] == 0 {
+				// Abstract path.
+				fmt.Fprintf(buf, " @%s", string(addr.Addr[1:]))
+			} else {
+				fmt.Fprintf(buf, " %s", string(addr.Addr))
+			}
+		}
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef(ctx)
+	}
+	return nil
+}
+
+func networkToHost16(n uint16) uint16 {
+	// n is in network byte order, so is big-endian. The most-significant byte
+	// should be stored in the lower address.
+	//
+	// We manually inline binary.BigEndian.Uint16() because Go does not support
+	// non-primitive consts, so binary.BigEndian is a (mutable) var, so calls to
+	// binary.BigEndian.Uint16() require a read of binary.BigEndian and an
+	// interface method call, defeating inlining.
+	buf := [2]byte{byte(n >> 8 & 0xff), byte(n & 0xff)}
+	return usermem.ByteOrder.Uint16(buf[:])
+}
+
+func writeInetAddr(w io.Writer, family int, i linux.SockAddr) {
+	switch family {
+	case linux.AF_INET:
+		var a linux.SockAddrInet
+		if i != nil {
+			a = *i.(*linux.SockAddrInet)
+		}
+
+		// linux.SockAddrInet.Port is stored in the network byte order and is
+		// printed like a number in host byte order. Note that all numbers in host
+		// byte order are printed with the most-significant byte first when
+		// formatted with %X. See get_tcp4_sock() and udp4_format_sock() in Linux.
+		port := networkToHost16(a.Port)
+
+		// linux.SockAddrInet.Addr is stored as a byte slice in big-endian order
+		// (i.e. most-significant byte in index 0). Linux represents this as a
+		// __be32 which is a typedef for an unsigned int, and is printed with
+		// %X. This means that for a little-endian machine, Linux prints the
+		// least-significant byte of the address first. To emulate this, we first
+		// invert the byte order for the address using usermem.ByteOrder.Uint32,
+		// which makes it have the equivalent encoding to a __be32 on a little
+		// endian machine. Note that this operation is a no-op on a big endian
+		// machine. Then similar to Linux, we format it with %X, which will print
+		// the most-significant byte of the __be32 address first, which is now
+		// actually the least-significant byte of the original address in
+		// linux.SockAddrInet.Addr on little endian machines, due to the conversion.
+		addr := usermem.ByteOrder.Uint32(a.Addr[:])
+
+		fmt.Fprintf(w, "%08X:%04X ", addr, port)
+	case linux.AF_INET6:
+		var a linux.SockAddrInet6
+		if i != nil {
+			a = *i.(*linux.SockAddrInet6)
+		}
+
+		port := networkToHost16(a.Port)
+		addr0 := usermem.ByteOrder.Uint32(a.Addr[0:4])
+		addr1 := usermem.ByteOrder.Uint32(a.Addr[4:8])
+		addr2 := usermem.ByteOrder.Uint32(a.Addr[8:12])
+		addr3 := usermem.ByteOrder.Uint32(a.Addr[12:16])
+		fmt.Fprintf(w, "%08X%08X%08X%08X:%04X ", addr0, addr1, addr2, addr3, port)
+	}
+}
+
+func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel, family int) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	for _, se := range k.ListSockets() {
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
+			continue
+		}
+		sops, ok := s.Impl().(socket.SocketVFS2)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
+		}
+		if fa, stype, _ := sops.Type(); !(family == fa && stype == linux.SOCK_STREAM) {
+			s.DecRef(ctx)
+			// Not tcp4 sockets.
+			continue
+		}
+
+		// Linux's documentation for the fields below can be found at
+		// https://www.kernel.org/doc/Documentation/networking/proc_net_tcp.txt.
+		// For Linux's implementation, see net/ipv4/tcp_ipv4.c:get_tcp4_sock().
+		// Note that the header doesn't contain labels for all the fields.
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%4d: ", se.ID)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddr
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = local
+			}
+		}
+		writeInetAddr(buf, family, localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddr
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = remote
+			}
+		}
+		writeInetAddr(buf, family, remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when; timer active state and number of jiffies
+		// until timer expires. Unimplemented.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt; number of unrecovered RTO timeouts.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
+		// Field: uid.
+		if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+			log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout; number of unanswered 0-window probes.
+		// Unimplemented.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%8d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%8d ", stat.Ino)
+		}
+
+		// Field: refcount. Don't count the ref we obtain while deferencing
+		// the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: retransmit timeout. Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: predicted tick of soft clock (delayed ACK control data).
+		// Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: (ack.quick<<1)|ack.pingpong, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: sending congestion window, Unimplemented.
+		fmt.Fprintf(buf, "%d ", 0)
+
+		// Field: Slow start size threshold, -1 if threshold >= 0xFFFF.
+		// Unimplemented, report as large threshold.
+		fmt.Fprintf(buf, "%d", -1)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef(ctx)
+	}
+
+	return nil
+}
+
+// netTCPData implements vfs.DynamicBytesSource for /proc/net/tcp.
+//
+// +stateify savable
+type netTCPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCPData)(nil)
+
+func (d *netTCPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode                                                     \n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET)
+}
+
+// netTCP6Data implements vfs.DynamicBytesSource for /proc/net/tcp6.
+//
+// +stateify savable
+type netTCP6Data struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netTCP6Data)(nil)
+
+func (d *netTCP6Data) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("  sl  local_address                         remote_address                        st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode\n")
+	return commonGenerateTCP(ctx, buf, d.kernel, linux.AF_INET6)
+}
+
+// netUDPData implements vfs.DynamicBytesSource for /proc/net/udp.
+//
+// +stateify savable
+type netUDPData struct {
+	kernfs.DynamicBytesFile
+
+	kernel *kernel.Kernel
+}
+
+var _ dynamicInode = (*netUDPData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// t may be nil here if our caller is not part of a task goroutine. This can
+	// happen for example if we're here for "sentryctl cat". When t is nil,
+	// degrade gracefully and retrieve what we can.
+	t := kernel.TaskFromContext(ctx)
+
+	buf.WriteString("  sl  local_address rem_address   st tx_queue rx_queue tr tm->when retrnsmt   uid  timeout inode ref pointer drops             \n")
+
+	for _, se := range d.kernel.ListSockets() {
+		s := se.SockVFS2
+		if !s.TryIncRef() {
+			log.Debugf("Couldn't get reference on %v in socket table, racing with destruction?", s)
+			continue
+		}
+		sops, ok := s.Impl().(socket.SocketVFS2)
+		if !ok {
+			panic(fmt.Sprintf("Found non-socket file in socket table: %+v", s))
+		}
+		if family, stype, _ := sops.Type(); family != linux.AF_INET || stype != linux.SOCK_DGRAM {
+			s.DecRef(ctx)
+			// Not udp4 socket.
+			continue
+		}
+
+		// For Linux's implementation, see net/ipv4/udp.c:udp4_format_sock().
+
+		// Field: sl; entry number.
+		fmt.Fprintf(buf, "%5d: ", se.ID)
+
+		// Field: local_adddress.
+		var localAddr linux.SockAddrInet
+		if t != nil {
+			if local, _, err := sops.GetSockName(t); err == nil {
+				localAddr = *local.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &localAddr)
+
+		// Field: rem_address.
+		var remoteAddr linux.SockAddrInet
+		if t != nil {
+			if remote, _, err := sops.GetPeerName(t); err == nil {
+				remoteAddr = *remote.(*linux.SockAddrInet)
+			}
+		}
+		writeInetAddr(buf, linux.AF_INET, &remoteAddr)
+
+		// Field: state; socket state.
+		fmt.Fprintf(buf, "%02X ", sops.State())
+
+		// Field: tx_queue, rx_queue; number of packets in the transmit and
+		// receive queue. Unimplemented.
+		fmt.Fprintf(buf, "%08X:%08X ", 0, 0)
+
+		// Field: tr, tm->when. Always 0 for UDP.
+		fmt.Fprintf(buf, "%02X:%08X ", 0, 0)
+
+		// Field: retrnsmt. Always 0 for UDP.
+		fmt.Fprintf(buf, "%08X ", 0)
+
+		stat, statErr := s.Stat(ctx, vfs.StatOptions{Mask: linux.STATX_UID | linux.STATX_INO})
+
+		// Field: uid.
+		if statErr != nil || stat.Mask&linux.STATX_UID == 0 {
+			log.Warningf("Failed to retrieve uid for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%5d ", 0)
+		} else {
+			creds := auth.CredentialsFromContext(ctx)
+			fmt.Fprintf(buf, "%5d ", uint32(auth.KUID(stat.UID).In(creds.UserNamespace).OrOverflow()))
+		}
+
+		// Field: timeout. Always 0 for UDP.
+		fmt.Fprintf(buf, "%8d ", 0)
+
+		// Field: inode.
+		if statErr != nil || stat.Mask&linux.STATX_INO == 0 {
+			log.Warningf("Failed to retrieve inode for socket file: %v", statErr)
+			fmt.Fprintf(buf, "%8d ", 0)
+		} else {
+			fmt.Fprintf(buf, "%8d ", stat.Ino)
+		}
+
+		// Field: ref; reference count on the socket inode. Don't count the ref
+		// we obtain while deferencing the weakref to this socket.
+		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+
+		// Field: Socket struct address. Redacted due to the same reason as
+		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
+		fmt.Fprintf(buf, "%#016p ", (*socket.Socket)(nil))
+
+		// Field: drops; number of dropped packets. Unimplemented.
+		fmt.Fprintf(buf, "%d", 0)
+
+		fmt.Fprintf(buf, "\n")
+
+		s.DecRef(ctx)
+	}
+	return nil
+}
+
+// netSnmpData implements vfs.DynamicBytesSource for /proc/net/snmp.
+//
+// +stateify savable
+type netSnmpData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netSnmpData)(nil)
+
+type snmpLine struct {
+	prefix string
+	header string
+}
+
+var snmp = []snmpLine{
+	{
+		prefix: "Ip",
+		header: "Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates",
+	},
+	{
+		prefix: "Icmp",
+		header: "InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps",
+	},
+	{
+		prefix: "IcmpMsg",
+	},
+	{
+		prefix: "Tcp",
+		header: "RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors",
+	},
+	{
+		prefix: "Udp",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+	{
+		prefix: "UdpLite",
+		header: "InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors IgnoredMulti",
+	},
+}
+
+func toSlice(a interface{}) []uint64 {
+	v := reflect.Indirect(reflect.ValueOf(a))
+	return v.Slice(0, v.Len()).Interface().([]uint64)
+}
+
+func sprintSlice(s []uint64) string {
+	if len(s) == 0 {
+		return ""
+	}
+	r := fmt.Sprint(s)
+	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
+}
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	types := []interface{}{
+		&inet.StatSNMPIP{},
+		&inet.StatSNMPICMP{},
+		nil, // TODO(gvisor.dev/issue/628): Support IcmpMsg stats.
+		&inet.StatSNMPTCP{},
+		&inet.StatSNMPUDP{},
+		&inet.StatSNMPUDPLite{},
+	}
+	for i, stat := range types {
+		line := snmp[i]
+		if stat == nil {
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			fmt.Fprintf(buf, "%s:\n", line.prefix)
+			continue
+		}
+		if err := d.stack.Statistics(stat, line.prefix); err != nil {
+			if err == syserror.EOPNOTSUPP {
+				log.Infof("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			} else {
+				log.Warningf("Failed to retrieve %s of /proc/net/snmp: %v", line.prefix, err)
+			}
+		}
+
+		fmt.Fprintf(buf, "%s: %s\n", line.prefix, line.header)
+
+		if line.prefix == "Tcp" {
+			tcp := stat.(*inet.StatSNMPTCP)
+			// "Tcp" needs special processing because MaxConn is signed. RFC 2012.
+			fmt.Fprintf(buf, "%s: %s %d %s\n", line.prefix, sprintSlice(tcp[:3]), int64(tcp[3]), sprintSlice(tcp[4:]))
+		} else {
+			fmt.Fprintf(buf, "%s: %s\n", line.prefix, sprintSlice(toSlice(stat)))
+		}
+	}
+	return nil
+}
+
+// netRouteData implements vfs.DynamicBytesSource for /proc/net/route.
+//
+// +stateify savable
+type netRouteData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netRouteData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
+
+	interfaces := d.stack.Interfaces()
+	for _, rt := range d.stack.RouteTable() {
+		// /proc/net/route only includes ipv4 routes.
+		if rt.Family != linux.AF_INET {
+			continue
+		}
+
+		// /proc/net/route does not include broadcast or multicast routes.
+		if rt.Type == linux.RTN_BROADCAST || rt.Type == linux.RTN_MULTICAST {
+			continue
+		}
+
+		iface, ok := interfaces[rt.OutputInterface]
+		if !ok || iface.Name == "lo" {
+			continue
+		}
+
+		var (
+			gw     uint32
+			prefix uint32
+			flags  = linux.RTF_UP
+		)
+		if len(rt.GatewayAddr) == header.IPv4AddressSize {
+			flags |= linux.RTF_GATEWAY
+			gw = usermem.ByteOrder.Uint32(rt.GatewayAddr)
+		}
+		if len(rt.DstAddr) == header.IPv4AddressSize {
+			prefix = usermem.ByteOrder.Uint32(rt.DstAddr)
+		}
+		l := fmt.Sprintf(
+			"%s\t%08X\t%08X\t%04X\t%d\t%d\t%d\t%08X\t%d\t%d\t%d",
+			iface.Name,
+			prefix,
+			gw,
+			flags,
+			0, // RefCnt.
+			0, // Use.
+			0, // Metric.
+			(uint32(1)<<rt.DstLen)-1,
+			0, // MTU.
+			0, // Window.
+			0, // RTT.
+		)
+		fmt.Fprintf(buf, "%-127s\n", l)
+	}
+	return nil
+}
+
+// netStatData implements vfs.DynamicBytesSource for /proc/net/netstat.
+//
+// +stateify savable
+type netStatData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack
+}
+
+var _ dynamicInode = (*netStatData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+// See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
+func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
+		"EmbryonicRsts PruneCalled RcvPruned OfoPruned OutOfWindowIcmps " +
+		"LockDroppedIcmps ArpFilter TW TWRecycled TWKilled PAWSPassive " +
+		"PAWSActive PAWSEstab DelayedACKs DelayedACKLocked DelayedACKLost " +
+		"ListenOverflows ListenDrops TCPPrequeued TCPDirectCopyFromBacklog " +
+		"TCPDirectCopyFromPrequeue TCPPrequeueDropped TCPHPHits TCPHPHitsToUser " +
+		"TCPPureAcks TCPHPAcks TCPRenoRecovery TCPSackRecovery TCPSACKReneging " +
+		"TCPFACKReorder TCPSACKReorder TCPRenoReorder TCPTSReorder TCPFullUndo " +
+		"TCPPartialUndo TCPDSACKUndo TCPLossUndo TCPLostRetransmit " +
+		"TCPRenoFailures TCPSackFailures TCPLossFailures TCPFastRetrans " +
+		"TCPForwardRetrans TCPSlowStartRetrans TCPTimeouts TCPLossProbes " +
+		"TCPLossProbeRecovery TCPRenoRecoveryFail TCPSackRecoveryFail " +
+		"TCPSchedulerFailed TCPRcvCollapsed TCPDSACKOldSent TCPDSACKOfoSent " +
+		"TCPDSACKRecv TCPDSACKOfoRecv TCPAbortOnData TCPAbortOnClose " +
+		"TCPAbortOnMemory TCPAbortOnTimeout TCPAbortOnLinger TCPAbortFailed " +
+		"TCPMemoryPressures TCPSACKDiscard TCPDSACKIgnoredOld " +
+		"TCPDSACKIgnoredNoUndo TCPSpuriousRTOs TCPMD5NotFound TCPMD5Unexpected " +
+		"TCPMD5Failure TCPSackShifted TCPSackMerged TCPSackShiftFallback " +
+		"TCPBacklogDrop TCPMinTTLDrop TCPDeferAcceptDrop IPReversePathFilter " +
+		"TCPTimeWaitOverflow TCPReqQFullDoCookies TCPReqQFullDrop TCPRetransFail " +
+		"TCPRcvCoalesce TCPOFOQueue TCPOFODrop TCPOFOMerge TCPChallengeACK " +
+		"TCPSYNChallenge TCPFastOpenActive TCPFastOpenActiveFail " +
+		"TCPFastOpenPassive TCPFastOpenPassiveFail TCPFastOpenListenOverflow " +
+		"TCPFastOpenCookieReqd TCPSpuriousRtxHostQueues BusyPollRxPackets " +
+		"TCPAutoCorking TCPFromZeroWindowAdv TCPToZeroWindowAdv " +
+		"TCPWantZeroWindowAdv TCPSynRetrans TCPOrigDataSent TCPHystartTrainDetect " +
+		"TCPHystartTrainCwnd TCPHystartDelayDetect TCPHystartDelayCwnd " +
+		"TCPACKSkippedSynRecv TCPACKSkippedPAWS TCPACKSkippedSeq " +
+		"TCPACKSkippedFinWait2 TCPACKSkippedTimeWait TCPACKSkippedChallenge " +
+		"TCPWinProbe TCPKeepAlive TCPMTUPFail TCPMTUPSuccess\n")
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
new file mode 100644
index 000000000..6d2b90a8b
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -0,0 +1,256 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"sort"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const (
+	selfName       = "self"
+	threadSelfName = "thread-self"
+)
+
+// tasksInode represents the inode for /proc/ directory.
+//
+// +stateify savable
+type tasksInode struct {
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeAttrs
+	kernfs.OrderedChildren
+	kernfs.AlwaysValid
+
+	locks vfs.FileLocks
+
+	fs    *filesystem
+	pidns *kernel.PIDNamespace
+
+	// '/proc/self' and '/proc/thread-self' have custom directory offsets in
+	// Linux. So handle them outside of OrderedChildren.
+	selfSymlink       *vfs.Dentry
+	threadSelfSymlink *vfs.Dentry
+
+	// cgroupControllers is a map of controller name to directory in the
+	// cgroup hierarchy. These controllers are immutable and will be listed
+	// in /proc/pid/cgroup if not nil.
+	cgroupControllers map[string]string
+}
+
+var _ kernfs.Inode = (*tasksInode)(nil)
+
+func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
+	root := auth.NewRootCredentials(pidns.UserNamespace())
+	contents := map[string]*kernfs.Dentry{
+		"cpuinfo":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
+		"filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
+		"loadavg":     fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
+		"sys":         fs.newSysDir(root, k),
+		"meminfo":     fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
+		"mounts":      kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
+		"net":         kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
+		"stat":        fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
+		"uptime":      fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
+		"version":     fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
+	}
+
+	inode := &tasksInode{
+		pidns:             pidns,
+		fs:                fs,
+		selfSymlink:       fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
+		threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
+		cgroupControllers: cgroupControllers,
+	}
+	inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+
+	dentry := &kernfs.Dentry{}
+	dentry.Init(inode)
+
+	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	links := inode.OrderedChildren.Populate(dentry, contents)
+	inode.IncLinks(links)
+
+	return inode, dentry
+}
+
+// Lookup implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+	// Try to lookup a corresponding task.
+	tid, err := strconv.ParseUint(name, 10, 64)
+	if err != nil {
+		// If it failed to parse, check if it's one of the special handled files.
+		switch name {
+		case selfName:
+			return i.selfSymlink, nil
+		case threadSelfName:
+			return i.threadSelfSymlink, nil
+		}
+		return nil, syserror.ENOENT
+	}
+
+	task := i.pidns.TaskWithID(kernel.ThreadID(tid))
+	if task == nil {
+		return nil, syserror.ENOENT
+	}
+
+	taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
+	return taskDentry.VFSDentry(), nil
+}
+
+// IterDirents implements kernfs.inodeDynamicLookup.
+func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
+	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
+	const FIRST_PROCESS_ENTRY = 256
+
+	// Use maxTaskID to shortcut searches that will result in 0 entries.
+	const maxTaskID = kernel.TasksLimit + 1
+	if offset >= maxTaskID {
+		return offset, nil
+	}
+
+	// According to Linux (fs/proc/base.c:proc_pid_readdir()), process directories
+	// start at offset FIRST_PROCESS_ENTRY with '/proc/self', followed by
+	// '/proc/thread-self' and then '/proc/[pid]'.
+	if offset < FIRST_PROCESS_ENTRY {
+		offset = FIRST_PROCESS_ENTRY
+	}
+
+	if offset == FIRST_PROCESS_ENTRY {
+		dirent := vfs.Dirent{
+			Name:    selfName,
+			Type:    linux.DT_LNK,
+			Ino:     i.fs.NextIno(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	if offset == FIRST_PROCESS_ENTRY+1 {
+		dirent := vfs.Dirent{
+			Name:    threadSelfName,
+			Type:    linux.DT_LNK,
+			Ino:     i.fs.NextIno(),
+			NextOff: offset + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+
+	// Collect all tasks that TGIDs are greater than the offset specified. Per
+	// Linux we only include in directory listings if it's the leader. But for
+	// whatever crazy reason, you can still walk to the given node.
+	var tids []int
+	startTid := offset - FIRST_PROCESS_ENTRY - 2
+	for _, tg := range i.pidns.ThreadGroups() {
+		tid := i.pidns.IDOfThreadGroup(tg)
+		if int64(tid) < startTid {
+			continue
+		}
+		if leader := tg.Leader(); leader != nil {
+			tids = append(tids, int(tid))
+		}
+	}
+
+	if len(tids) == 0 {
+		return offset, nil
+	}
+
+	sort.Ints(tids)
+	for _, tid := range tids {
+		dirent := vfs.Dirent{
+			Name:    strconv.FormatUint(uint64(tid), 10),
+			Type:    linux.DT_DIR,
+			Ino:     i.fs.NextIno(),
+			NextOff: FIRST_PROCESS_ENTRY + 2 + int64(tid) + 1,
+		}
+		if err := cb.Handle(dirent); err != nil {
+			return offset, err
+		}
+		offset++
+	}
+	return maxTaskID, nil
+}
+
+// Open implements kernfs.Inode.
+func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	if opts.Mask&linux.STATX_NLINK != 0 {
+		// Add dynamic children to link count.
+		for _, tg := range i.pidns.ThreadGroups() {
+			if leader := tg.Leader(); leader != nil {
+				stat.Nlink++
+			}
+		}
+	}
+
+	return stat, nil
+}
+
+// staticFileSetStat implements a special static file that allows inode
+// attributes to be set. This is to support /proc files that are readonly, but
+// allow attributes to be set.
+type staticFileSetStat struct {
+	dynamicBytesFileSetAttr
+	vfs.StaticData
+}
+
+var _ dynamicInode = (*staticFileSetStat)(nil)
+
+func newStaticFileSetStat(data string) *staticFileSetStat {
+	return &staticFileSetStat{StaticData: vfs.StaticData{Data: data}}
+}
+
+func cpuInfoData(k *kernel.Kernel) string {
+	features := k.FeatureSet()
+	if features == nil {
+		// Kernel is always initialized with a FeatureSet.
+		panic("cpuinfo read with nil FeatureSet")
+	}
+	var buf bytes.Buffer
+	for i, max := uint(0), k.ApplicationCores(); i < max; i++ {
+		features.WriteCPUInfoTo(i, &buf)
+	}
+	return buf.String()
+}
+
+func shmData(v uint64) dynamicInode {
+	return newStaticFile(strconv.FormatUint(v, 10))
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
new file mode 100644
index 000000000..7d8983aa5
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -0,0 +1,384 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+	"strconv"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type selfSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*selfSymlink)(nil)
+
+func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+	inode := &selfSymlink{pidns: pidns}
+	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// Who is reading this link?
+		return "", syserror.EINVAL
+	}
+	tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+	if tgid == 0 {
+		return "", syserror.ENOENT
+	}
+	return strconv.FormatUint(uint64(tgid), 10), nil
+}
+
+func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx)
+	return vfs.VirtualDentry{}, target, err
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+type threadSelfSymlink struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	pidns *kernel.PIDNamespace
+}
+
+var _ kernfs.Inode = (*threadSelfSymlink)(nil)
+
+func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
+	inode := &threadSelfSymlink{pidns: pidns}
+	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+
+	d := &kernfs.Dentry{}
+	d.Init(inode)
+	return d
+}
+
+func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// Who is reading this link?
+		return "", syserror.EINVAL
+	}
+	tgid := s.pidns.IDOfThreadGroup(t.ThreadGroup())
+	tid := s.pidns.IDOfTask(t)
+	if tid == 0 || tgid == 0 {
+		return "", syserror.ENOENT
+	}
+	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
+}
+
+func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx)
+	return vfs.VirtualDentry{}, target, err
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// dynamicBytesFileSetAttr implements a special file that allows inode
+// attributes to be set. This is to support /proc files that are readonly, but
+// allow attributes to be set.
+type dynamicBytesFileSetAttr struct {
+	kernfs.DynamicBytesFile
+}
+
+// SetStat implements Inode.SetStat.
+func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
+}
+
+// cpuStats contains the breakdown of CPU time for /proc/stat.
+type cpuStats struct {
+	// user is time spent in userspace tasks with non-positive niceness.
+	user uint64
+
+	// nice is time spent in userspace tasks with positive niceness.
+	nice uint64
+
+	// system is time spent in non-interrupt kernel context.
+	system uint64
+
+	// idle is time spent idle.
+	idle uint64
+
+	// ioWait is time spent waiting for IO.
+	ioWait uint64
+
+	// irq is time spent in interrupt context.
+	irq uint64
+
+	// softirq is time spent in software interrupt context.
+	softirq uint64
+
+	// steal is involuntary wait time.
+	steal uint64
+
+	// guest is time spent in guests with non-positive niceness.
+	guest uint64
+
+	// guestNice is time spent in guests with positive niceness.
+	guestNice uint64
+}
+
+// String implements fmt.Stringer.
+func (c cpuStats) String() string {
+	return fmt.Sprintf("%d %d %d %d %d %d %d %d %d %d", c.user, c.nice, c.system, c.idle, c.ioWait, c.irq, c.softirq, c.steal, c.guest, c.guestNice)
+}
+
+// statData implements vfs.DynamicBytesSource for /proc/stat.
+//
+// +stateify savable
+type statData struct {
+	dynamicBytesFileSetAttr
+}
+
+var _ dynamicInode = (*statData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*statData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// TODO(b/37226836): We currently export only zero CPU stats. We could
+	// at least provide some aggregate stats.
+	var cpu cpuStats
+	fmt.Fprintf(buf, "cpu  %s\n", cpu)
+
+	k := kernel.KernelFromContext(ctx)
+	for c, max := uint(0), k.ApplicationCores(); c < max; c++ {
+		fmt.Fprintf(buf, "cpu%d %s\n", c, cpu)
+	}
+
+	// The total number of interrupts is dependent on the CPUs and PCI
+	// devices on the system. See arch_probe_nr_irqs.
+	//
+	// Since we don't report real interrupt stats, just choose an arbitrary
+	// value from a representative VM.
+	const numInterrupts = 256
+
+	// The Kernel doesn't handle real interrupts, so report all zeroes.
+	// TODO(b/37226836): We could count page faults as #PF.
+	fmt.Fprintf(buf, "intr 0") // total
+	for i := 0; i < numInterrupts; i++ {
+		fmt.Fprintf(buf, " 0")
+	}
+	fmt.Fprintf(buf, "\n")
+
+	// Total number of context switches.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "ctxt 0\n")
+
+	// CLOCK_REALTIME timestamp from boot, in seconds.
+	fmt.Fprintf(buf, "btime %d\n", k.Timekeeper().BootTime().Seconds())
+
+	// Total number of clones.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "processes 0\n")
+
+	// Number of runnable tasks.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "procs_running 0\n")
+
+	// Number of tasks waiting on IO.
+	// TODO(b/37226836): Count this.
+	fmt.Fprintf(buf, "procs_blocked 0\n")
+
+	// Number of each softirq handled.
+	fmt.Fprintf(buf, "softirq 0") // total
+	for i := 0; i < linux.NumSoftIRQ; i++ {
+		fmt.Fprintf(buf, " 0")
+	}
+	fmt.Fprintf(buf, "\n")
+	return nil
+}
+
+// loadavgData backs /proc/loadavg.
+//
+// +stateify savable
+type loadavgData struct {
+	dynamicBytesFileSetAttr
+}
+
+var _ dynamicInode = (*loadavgData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*loadavgData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	// TODO(b/62345059): Include real data in fields.
+	// Column 1-3: CPU and IO utilization of the last 1, 5, and 10 minute periods.
+	// Column 4-5: currently running processes and the total number of processes.
+	// Column 6: the last process ID used.
+	fmt.Fprintf(buf, "%.2f %.2f %.2f %d/%d %d\n", 0.00, 0.00, 0.00, 0, 0, 0)
+	return nil
+}
+
+// meminfoData implements vfs.DynamicBytesSource for /proc/meminfo.
+//
+// +stateify savable
+type meminfoData struct {
+	dynamicBytesFileSetAttr
+}
+
+var _ dynamicInode = (*meminfoData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*meminfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	mf := k.MemoryFile()
+	mf.UpdateUsage()
+	snapshot, totalUsage := usage.MemoryAccounting.Copy()
+	totalSize := usage.TotalMemory(mf.TotalSize(), totalUsage)
+	anon := snapshot.Anonymous + snapshot.Tmpfs
+	file := snapshot.PageCache + snapshot.Mapped
+	// We don't actually have active/inactive LRUs, so just make up numbers.
+	activeFile := (file / 2) &^ (usermem.PageSize - 1)
+	inactiveFile := file - activeFile
+
+	fmt.Fprintf(buf, "MemTotal:       %8d kB\n", totalSize/1024)
+	memFree := totalSize - totalUsage
+	if memFree > totalSize {
+		// Underflow.
+		memFree = 0
+	}
+	// We use MemFree as MemAvailable because we don't swap.
+	// TODO(rahat): When reclaim is implemented the value of MemAvailable
+	// should change.
+	fmt.Fprintf(buf, "MemFree:        %8d kB\n", memFree/1024)
+	fmt.Fprintf(buf, "MemAvailable:   %8d kB\n", memFree/1024)
+	fmt.Fprintf(buf, "Buffers:               0 kB\n") // memory usage by block devices
+	fmt.Fprintf(buf, "Cached:         %8d kB\n", (file+snapshot.Tmpfs)/1024)
+	// Emulate a system with no swap, which disables inactivation of anon pages.
+	fmt.Fprintf(buf, "SwapCache:             0 kB\n")
+	fmt.Fprintf(buf, "Active:         %8d kB\n", (anon+activeFile)/1024)
+	fmt.Fprintf(buf, "Inactive:       %8d kB\n", inactiveFile/1024)
+	fmt.Fprintf(buf, "Active(anon):   %8d kB\n", anon/1024)
+	fmt.Fprintf(buf, "Inactive(anon):        0 kB\n")
+	fmt.Fprintf(buf, "Active(file):   %8d kB\n", activeFile/1024)
+	fmt.Fprintf(buf, "Inactive(file): %8d kB\n", inactiveFile/1024)
+	fmt.Fprintf(buf, "Unevictable:           0 kB\n") // TODO(b/31823263)
+	fmt.Fprintf(buf, "Mlocked:               0 kB\n") // TODO(b/31823263)
+	fmt.Fprintf(buf, "SwapTotal:             0 kB\n")
+	fmt.Fprintf(buf, "SwapFree:              0 kB\n")
+	fmt.Fprintf(buf, "Dirty:                 0 kB\n")
+	fmt.Fprintf(buf, "Writeback:             0 kB\n")
+	fmt.Fprintf(buf, "AnonPages:      %8d kB\n", anon/1024)
+	fmt.Fprintf(buf, "Mapped:         %8d kB\n", file/1024) // doesn't count mapped tmpfs, which we don't know
+	fmt.Fprintf(buf, "Shmem:          %8d kB\n", snapshot.Tmpfs/1024)
+	return nil
+}
+
+// uptimeData implements vfs.DynamicBytesSource for /proc/uptime.
+//
+// +stateify savable
+type uptimeData struct {
+	dynamicBytesFileSetAttr
+}
+
+var _ dynamicInode = (*uptimeData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*uptimeData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	now := time.NowFromContext(ctx)
+
+	// Pretend that we've spent zero time sleeping (second number).
+	fmt.Fprintf(buf, "%.2f 0.00\n", now.Sub(k.Timekeeper().BootTime()).Seconds())
+	return nil
+}
+
+// versionData implements vfs.DynamicBytesSource for /proc/version.
+//
+// +stateify savable
+type versionData struct {
+	dynamicBytesFileSetAttr
+}
+
+var _ dynamicInode = (*versionData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	init := k.GlobalInit()
+	if init == nil {
+		// Attempted to read before the init Task is created. This can
+		// only occur during startup, which should never need to read
+		// this file.
+		panic("Attempted to read version before initial Task is available")
+	}
+
+	// /proc/version takes the form:
+	//
+	// "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
+	// (COMPILER_VERSION) VERSION"
+	//
+	// where:
+	// - SYSNAME, RELEASE, and VERSION are the same as returned by
+	// sys_utsname
+	// - COMPILE_USER is the user that build the kernel
+	// - COMPILE_HOST is the hostname of the machine on which the kernel
+	// was built
+	// - COMPILER_VERSION is the version reported by the building compiler
+	//
+	// Since we don't really want to expose build information to
+	// applications, those fields are omitted.
+	//
+	// FIXME(mpratt): Using Version from the init task SyscallTable
+	// disregards the different version a task may have (e.g., in a uts
+	// namespace).
+	ver := init.Leader().SyscallTable().Version
+	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
+	return nil
+}
+
+// filesystemsData backs /proc/filesystems.
+//
+// +stateify savable
+type filesystemsData struct {
+	kernfs.DynamicBytesFile
+}
+
+var _ dynamicInode = (*filesystemsData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *filesystemsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	k := kernel.KernelFromContext(ctx)
+	k.VFS().GenerateProcFilesystems(buf)
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
new file mode 100644
index 000000000..6768aa880
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -0,0 +1,317 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// newSysDir returns the dentry corresponding to /proc/sys directory.
+func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
+	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+		"kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
+			"shmall":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
+			"shmmax":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
+			"shmmni":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
+		}),
+		"vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+			"mmap_min_addr":     fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
+			"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
+		}),
+		"net": fs.newSysNetDir(root, k),
+	})
+}
+
+// newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
+func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
+	var contents map[string]*kernfs.Dentry
+
+	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
+	// network namespace of the calling process.
+	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
+		contents = map[string]*kernfs.Dentry{
+			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+				"tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
+				"tcp_sack":     fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
+
+				// The following files are simple stubs until they are implemented in
+				// netstack, most of these files are configuration related. We use the
+				// value closest to the actual netstack behavior or any empty file, all
+				// of these files will have mode 0444 (read-only for all users).
+				"ip_local_port_range":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000   65535")),
+				"ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
+				"ipfrag_time":             fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
+				"ip_nonlocal_bind":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"ip_no_pmtu_disc":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+
+				// tcp_allowed_congestion_control tell the user what they are able to
+				// do as an unprivledged process so we leave it empty.
+				"tcp_allowed_congestion_control":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
+				"tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+				"tcp_congestion_control":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+
+				// Many of the following stub files are features netstack doesn't
+				// support. The unsupported features return "0" to indicate they are
+				// disabled.
+				"tcp_base_mss":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")),
+				"tcp_dsack":                 fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_early_retrans":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fack":                  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fastopen":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_fastopen_key":          fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
+				"tcp_invalid_ratelimit":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_intvl":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_probes":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_keepalive_time":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")),
+				"tcp_mtu_probing":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_no_metrics_save":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_probe_interval":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_probe_threshold":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"tcp_retries1":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
+				"tcp_retries2":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")),
+				"tcp_rfc1337":               fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_synack_retries":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
+				"tcp_syn_retries":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
+				"tcp_timestamps":            fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+			}),
+			"core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
+				"default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
+				"message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
+				"message_cost":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
+				"optmem_max":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
+				"rmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+				"rmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+				"somaxconn":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")),
+				"wmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+				"wmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+			}),
+		}
+	}
+
+	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
+}
+
+// mmapMinAddrData implements vfs.DynamicBytesSource for
+// /proc/sys/vm/mmap_min_addr.
+//
+// +stateify savable
+type mmapMinAddrData struct {
+	kernfs.DynamicBytesFile
+
+	k *kernel.Kernel
+}
+
+var _ dynamicInode = (*mmapMinAddrData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (d *mmapMinAddrData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "%d\n", d.k.Platform.MinUserAddress())
+	return nil
+}
+
+// hostnameData implements vfs.DynamicBytesSource for /proc/sys/kernel/hostname.
+//
+// +stateify savable
+type hostnameData struct {
+	kernfs.DynamicBytesFile
+}
+
+var _ dynamicInode = (*hostnameData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (*hostnameData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	utsns := kernel.UTSNamespaceFromContext(ctx)
+	buf.WriteString(utsns.HostName())
+	buf.WriteString("\n")
+	return nil
+}
+
+// tcpSackData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/tcp_sack.
+//
+// +stateify savable
+type tcpSackData struct {
+	kernfs.DynamicBytesFile
+
+	stack   inet.Stack `state:"wait"`
+	enabled *bool
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if d.enabled == nil {
+		sack, err := d.stack.TCPSACKEnabled()
+		if err != nil {
+			return err
+		}
+		d.enabled = &sack
+	}
+
+	val := "0\n"
+	if *d.enabled {
+		// Technically, this is not quite compatible with Linux. Linux stores these
+		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
+		// Tough luck.
+		val = "1\n"
+	}
+	buf.WriteString(val)
+	return nil
+}
+
+func (d *tcpSackData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		// No need to handle partial writes thus far.
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit the amount of memory allocated.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return n, err
+	}
+	if d.enabled == nil {
+		d.enabled = new(bool)
+	}
+	*d.enabled = v != 0
+	return n, d.stack.SetTCPSACKEnabled(*d.enabled)
+}
+
+// tcpRecoveryData implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/tcp_recovery.
+//
+// +stateify savable
+type tcpRecoveryData struct {
+	kernfs.DynamicBytesFile
+
+	stack inet.Stack `state:"wait"`
+}
+
+var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
+
+// Generate implements vfs.DynamicBytesSource.
+func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	recovery, err := d.stack.TCPRecovery()
+	if err != nil {
+		return err
+	}
+
+	buf.WriteString(fmt.Sprintf("%d\n", recovery))
+	return nil
+}
+
+func (d *tcpRecoveryData) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		// No need to handle partial writes thus far.
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit the amount of memory allocated.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+	if err := d.stack.SetTCPRecovery(inet.TCPLossRecovery(v)); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
+
+// ipForwarding implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/ip_forwarding.
+//
+// +stateify savable
+type ipForwarding struct {
+	kernfs.DynamicBytesFile
+
+	stack   inet.Stack `state:"wait"`
+	enabled *bool
+}
+
+var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if ipf.enabled == nil {
+		enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber)
+		ipf.enabled = &enabled
+	}
+
+	val := "0\n"
+	if *ipf.enabled {
+		// Technically, this is not quite compatible with Linux. Linux stores these
+		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
+		// Tough luck.
+		val = "1\n"
+	}
+	buf.WriteString(val)
+
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		// No need to handle partial writes thus far.
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+	if ipf.enabled == nil {
+		ipf.enabled = new(bool)
+	}
+	*ipf.enabled = v != 0
+	if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/net_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index 20a77a8ca..1abf56da2 100644
--- a/pkg/sentry/fsimpl/proc/net_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -20,8 +20,10 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func newIPv6TestStack() *inet.TestStack {
@@ -31,7 +33,7 @@ func newIPv6TestStack() *inet.TestStack {
 }
 
 func TestIfinet6NoAddresses(t *testing.T) {
-	n := &ifinet6{s: newIPv6TestStack()}
+	n := &ifinet6{stack: newIPv6TestStack()}
 	var buf bytes.Buffer
 	n.Generate(contexttest.Context(t), &buf)
 	if buf.Len() > 0 {
@@ -62,7 +64,7 @@ func TestIfinet6(t *testing.T) {
 		"101112131415161718191a1b1c1d1e1f 02 80 00 00     eth1\n": {},
 	}
 
-	n := &ifinet6{s: s}
+	n := &ifinet6{stack: s}
 	contents := n.contents()
 	if len(contents) != len(want) {
 		t.Errorf("Got len(n.contents()) = %d, want = %d", len(contents), len(want))
@@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) {
 		t.Errorf("Got n.contents() = %v, want = %v", got, want)
 	}
 }
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forwarding
+func TestConfigureIPForwarding(t *testing.T) {
+	ctx := context.Background()
+	s := inet.NewTestStack()
+
+	var cases = []struct {
+		comment string
+		initial bool
+		str     string
+		final   bool
+	}{
+		{
+			comment: `Forwarding is disabled; write 1 and enable forwarding`,
+			initial: false,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is disabled; write 0 and disable forwarding`,
+			initial: false,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is enabled; write 1 and enable forwarding`,
+			initial: true,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 0 and disable forwarding`,
+			initial: true,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+			initial: false,
+			str:     "2404",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+			initial: true,
+			str:     "2404",
+			final:   true,
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.comment, func(t *testing.T) {
+			s.IPForwarding = c.initial
+
+			file := &ipForwarding{stack: s, enabled: &c.initial}
+
+			// Write the values.
+			src := usermem.BytesIOSequence([]byte(c.str))
+			if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil {
+				t.Errorf("file.Write(ctx, nil, %v, 0) = (%d, %v); wanted (%d, nil)", c.str, n, err, len(c.str))
+			}
+
+			// Read the values from the stack and check them.
+			if s.IPForwarding != c.final {
+				t.Errorf("s.IPForwarding = %v; wanted %v", s.IPForwarding, c.final)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
new file mode 100644
index 000000000..3c9297dee
--- /dev/null
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -0,0 +1,505 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package proc
+
+import (
+	"fmt"
+	"math"
+	"path"
+	"strconv"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+var (
+	// Next offset 256 by convention. Adds 1 for the next offset.
+	selfLink       = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 0 + 1}
+	threadSelfLink = vfs.Dirent{Type: linux.DT_LNK, NextOff: 256 + 1 + 1}
+
+	// /proc/[pid] next offset starts at 256+2 (files above), then adds the
+	// PID, and adds 1 for the next offset.
+	proc1 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 1 + 1}
+	proc2 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 2 + 1}
+	proc3 = vfs.Dirent{Type: linux.DT_DIR, NextOff: 258 + 3 + 1}
+)
+
+var (
+	tasksStaticFiles = map[string]testutil.DirentType{
+		"cpuinfo":     linux.DT_REG,
+		"filesystems": linux.DT_REG,
+		"loadavg":     linux.DT_REG,
+		"meminfo":     linux.DT_REG,
+		"mounts":      linux.DT_LNK,
+		"net":         linux.DT_LNK,
+		"self":        linux.DT_LNK,
+		"stat":        linux.DT_REG,
+		"sys":         linux.DT_DIR,
+		"thread-self": linux.DT_LNK,
+		"uptime":      linux.DT_REG,
+		"version":     linux.DT_REG,
+	}
+	tasksStaticFilesNextOffs = map[string]int64{
+		"self":        selfLink.NextOff,
+		"thread-self": threadSelfLink.NextOff,
+	}
+	taskStaticFiles = map[string]testutil.DirentType{
+		"auxv":          linux.DT_REG,
+		"cgroup":        linux.DT_REG,
+		"cmdline":       linux.DT_REG,
+		"comm":          linux.DT_REG,
+		"environ":       linux.DT_REG,
+		"exe":           linux.DT_LNK,
+		"fd":            linux.DT_DIR,
+		"fdinfo":        linux.DT_DIR,
+		"gid_map":       linux.DT_REG,
+		"io":            linux.DT_REG,
+		"maps":          linux.DT_REG,
+		"mountinfo":     linux.DT_REG,
+		"mounts":        linux.DT_REG,
+		"net":           linux.DT_DIR,
+		"ns":            linux.DT_DIR,
+		"oom_score":     linux.DT_REG,
+		"oom_score_adj": linux.DT_REG,
+		"smaps":         linux.DT_REG,
+		"stat":          linux.DT_REG,
+		"statm":         linux.DT_REG,
+		"status":        linux.DT_REG,
+		"task":          linux.DT_DIR,
+		"uid_map":       linux.DT_REG,
+	}
+)
+
+func setup(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Error creating kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+	pop := &vfs.PathOperation{
+		Root:  mntns.Root(),
+		Start: mntns.Root(),
+		Path:  fspath.Parse("/proc"),
+	}
+	if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil {
+		t.Fatalf("MkDir(/proc): %v", err)
+	}
+
+	pop = &vfs.PathOperation{
+		Root:  mntns.Root(),
+		Start: mntns.Root(),
+		Path:  fspath.Parse("/proc"),
+	}
+	mntOpts := &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: &InternalData{
+				Cgroups: map[string]string{
+					"cpuset": "/foo/cpuset",
+					"memory": "/foo/memory",
+				},
+			},
+		},
+	}
+	if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+		t.Fatalf("MountAt(/proc): %v", err)
+	}
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+func TestTasksEmpty(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	collector := s.ListDirents(s.PathOpAtRoot("/proc"))
+	s.AssertAllDirentTypes(collector, tasksStaticFiles)
+	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
+}
+
+func TestTasks(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	expectedDirents := make(map[string]testutil.DirentType)
+	for n, d := range tasksStaticFiles {
+		expectedDirents[n] = d
+	}
+
+	k := kernel.KernelFromContext(s.Ctx)
+	var tasks []*kernel.Task
+	for i := 0; i < 5; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
+		if err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+		tasks = append(tasks, task)
+		expectedDirents[fmt.Sprintf("%d", i+1)] = linux.DT_DIR
+	}
+
+	collector := s.ListDirents(s.PathOpAtRoot("/proc"))
+	s.AssertAllDirentTypes(collector, expectedDirents)
+	s.AssertDirentOffsets(collector, tasksStaticFilesNextOffs)
+
+	lastPid := 0
+	dirents := collector.OrderedDirents()
+	doneSkippingNonTaskDirs := false
+	for _, d := range dirents {
+		pid, err := strconv.Atoi(d.Name)
+		if err != nil {
+			if !doneSkippingNonTaskDirs {
+				// We haven't gotten to the task dirs yet.
+				continue
+			}
+			t.Fatalf("Invalid process directory %q", d.Name)
+		}
+		doneSkippingNonTaskDirs = true
+		if lastPid > pid {
+			t.Errorf("pids not in order: %v", dirents)
+		}
+		found := false
+		for _, t := range tasks {
+			if k.TaskSet().Root.IDOfTask(t) == kernel.ThreadID(pid) {
+				found = true
+			}
+		}
+		if !found {
+			t.Errorf("Additional task ID %d listed: %v", pid, tasks)
+		}
+		// Next offset starts at 256+2 ('self' and 'thread-self'), then adds the
+		// PID, and adds 1 for the next offset.
+		if want := int64(256 + 2 + pid + 1); d.NextOff != want {
+			t.Errorf("Wrong dirent offset want: %d got: %d: %+v", want, d.NextOff, d)
+		}
+	}
+	if !doneSkippingNonTaskDirs {
+		t.Fatalf("Never found any process directories.")
+	}
+
+	// Test lookup.
+	for _, path := range []string{"/proc/1", "/proc/2"} {
+		fd, err := s.VFS.OpenAt(
+			s.Ctx,
+			s.Creds,
+			s.PathOpAtRoot(path),
+			&vfs.OpenOptions{},
+		)
+		if err != nil {
+			t.Fatalf("vfsfs.OpenAt(%q) failed: %v", path, err)
+		}
+		defer fd.DecRef(s.Ctx)
+		buf := make([]byte, 1)
+		bufIOSeq := usermem.BytesIOSequence(buf)
+		if _, err := fd.Read(s.Ctx, bufIOSeq, vfs.ReadOptions{}); err != syserror.EISDIR {
+			t.Errorf("wrong error reading directory: %v", err)
+		}
+	}
+
+	if _, err := s.VFS.OpenAt(
+		s.Ctx,
+		s.Creds,
+		s.PathOpAtRoot("/proc/9999"),
+		&vfs.OpenOptions{},
+	); err != syserror.ENOENT {
+		t.Fatalf("wrong error from vfsfs.OpenAt(/proc/9999): %v", err)
+	}
+}
+
+func TestTasksOffset(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	for i := 0; i < 3; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		if _, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root); err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+	}
+
+	for _, tc := range []struct {
+		name   string
+		offset int64
+		wants  map[string]vfs.Dirent
+	}{
+		{
+			name:   "small offset",
+			offset: 100,
+			wants: map[string]vfs.Dirent{
+				"self":        selfLink,
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "offset at start",
+			offset: 256,
+			wants: map[string]vfs.Dirent{
+				"self":        selfLink,
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "skip /proc/self",
+			offset: 257,
+			wants: map[string]vfs.Dirent{
+				"thread-self": threadSelfLink,
+				"1":           proc1,
+				"2":           proc2,
+				"3":           proc3,
+			},
+		},
+		{
+			name:   "skip symlinks",
+			offset: 258,
+			wants: map[string]vfs.Dirent{
+				"1": proc1,
+				"2": proc2,
+				"3": proc3,
+			},
+		},
+		{
+			name:   "skip first process",
+			offset: 260,
+			wants: map[string]vfs.Dirent{
+				"2": proc2,
+				"3": proc3,
+			},
+		},
+		{
+			name:   "last process",
+			offset: 261,
+			wants: map[string]vfs.Dirent{
+				"3": proc3,
+			},
+		},
+		{
+			name:   "after last",
+			offset: 262,
+			wants:  nil,
+		},
+		{
+			name:   "TaskLimit+1",
+			offset: kernel.TasksLimit + 1,
+			wants:  nil,
+		},
+		{
+			name:   "max",
+			offset: math.MaxInt64,
+			wants:  nil,
+		},
+	} {
+		t.Run(tc.name, func(t *testing.T) {
+			s := s.WithSubtest(t)
+			fd, err := s.VFS.OpenAt(
+				s.Ctx,
+				s.Creds,
+				s.PathOpAtRoot("/proc"),
+				&vfs.OpenOptions{},
+			)
+			if err != nil {
+				t.Fatalf("vfsfs.OpenAt(/) failed: %v", err)
+			}
+			defer fd.DecRef(s.Ctx)
+			if _, err := fd.Seek(s.Ctx, tc.offset, linux.SEEK_SET); err != nil {
+				t.Fatalf("Seek(%d, SEEK_SET): %v", tc.offset, err)
+			}
+
+			var collector testutil.DirentCollector
+			if err := fd.IterDirents(s.Ctx, &collector); err != nil {
+				t.Fatalf("IterDirent(): %v", err)
+			}
+
+			expectedTypes := make(map[string]testutil.DirentType)
+			expectedOffsets := make(map[string]int64)
+			for name, want := range tc.wants {
+				expectedTypes[name] = want.Type
+				if want.NextOff != 0 {
+					expectedOffsets[name] = want.NextOff
+				}
+			}
+
+			collector.SkipDotsChecks(true) // We seek()ed past the dots.
+			s.AssertAllDirentTypes(&collector, expectedTypes)
+			s.AssertDirentOffsets(&collector, expectedOffsets)
+		})
+	}
+}
+
+func TestTask(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	_, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Fatalf("CreateTask(): %v", err)
+	}
+
+	collector := s.ListDirents(s.PathOpAtRoot("/proc/1"))
+	s.AssertAllDirentTypes(collector, taskStaticFiles)
+}
+
+func TestProcSelf(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	task, err := testutil.CreateTask(s.Ctx, "name", tc, s.MntNs, s.Root, s.Root)
+	if err != nil {
+		t.Fatalf("CreateTask(): %v", err)
+	}
+
+	collector := s.WithTemporaryContext(task).ListDirents(&vfs.PathOperation{
+		Root:               s.Root,
+		Start:              s.Root,
+		Path:               fspath.Parse("/proc/self/"),
+		FollowFinalSymlink: true,
+	})
+	s.AssertAllDirentTypes(collector, taskStaticFiles)
+}
+
+func iterateDir(ctx context.Context, t *testing.T, s *testutil.System, fd *vfs.FileDescription) {
+	t.Logf("Iterating: %s", fd.MappedName(ctx))
+
+	var collector testutil.DirentCollector
+	if err := fd.IterDirents(ctx, &collector); err != nil {
+		t.Fatalf("IterDirents(): %v", err)
+	}
+	if err := collector.Contains(".", linux.DT_DIR); err != nil {
+		t.Error(err.Error())
+	}
+	if err := collector.Contains("..", linux.DT_DIR); err != nil {
+		t.Error(err.Error())
+	}
+
+	for _, d := range collector.Dirents() {
+		if d.Name == "." || d.Name == ".." {
+			continue
+		}
+		absPath := path.Join(fd.MappedName(ctx), d.Name)
+		if d.Type == linux.DT_LNK {
+			link, err := s.VFS.ReadlinkAt(
+				ctx,
+				auth.CredentialsFromContext(ctx),
+				&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)},
+			)
+			if err != nil {
+				t.Errorf("vfsfs.ReadlinkAt(%v) failed: %v", absPath, err)
+			} else {
+				t.Logf("Skipping symlink: %s => %s", absPath, link)
+			}
+			continue
+		}
+
+		t.Logf("Opening: %s", absPath)
+		child, err := s.VFS.OpenAt(
+			ctx,
+			auth.CredentialsFromContext(ctx),
+			&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse(absPath)},
+			&vfs.OpenOptions{},
+		)
+		if err != nil {
+			t.Errorf("vfsfs.OpenAt(%v) failed: %v", absPath, err)
+			continue
+		}
+		defer child.DecRef(ctx)
+		stat, err := child.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			t.Errorf("Stat(%v) failed: %v", absPath, err)
+		}
+		if got := linux.FileMode(stat.Mode).DirentType(); got != d.Type {
+			t.Errorf("wrong file mode, stat: %v, dirent: %v", got, d.Type)
+		}
+		if d.Type == linux.DT_DIR {
+			// Found another dir, let's do it again!
+			iterateDir(ctx, t, s, child)
+		}
+	}
+}
+
+// TestTree iterates all directories and stats every file.
+func TestTree(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+
+	pop := &vfs.PathOperation{
+		Root:  s.Root,
+		Start: s.Root,
+		Path:  fspath.Parse("test-file"),
+	}
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_CREAT,
+		Mode:  0777,
+	}
+	file, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, opts)
+	if err != nil {
+		t.Fatalf("failed to create test file: %v", err)
+	}
+	defer file.DecRef(s.Ctx)
+
+	var tasks []*kernel.Task
+	for i := 0; i < 5; i++ {
+		tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+		task, err := testutil.CreateTask(s.Ctx, fmt.Sprintf("name-%d", i), tc, s.MntNs, s.Root, s.Root)
+		if err != nil {
+			t.Fatalf("CreateTask(): %v", err)
+		}
+		// Add file to populate /proc/[pid]/fd and fdinfo directories.
+		task.FDTable().NewFDVFS2(task, 0, file, kernel.FDFlags{})
+		tasks = append(tasks, task)
+	}
+
+	ctx := tasks[0]
+	fd, err := s.VFS.OpenAt(
+		ctx,
+		auth.CredentialsFromContext(s.Ctx),
+		&vfs.PathOperation{Root: s.Root, Start: s.Root, Path: fspath.Parse("/proc")},
+		&vfs.OpenOptions{},
+	)
+	if err != nil {
+		t.Fatalf("vfsfs.OpenAt(/proc) failed: %v", err)
+	}
+	iterateDir(ctx, t, s, fd)
+	fd.DecRef(ctx)
+}
diff --git a/pkg/sentry/fsimpl/proc/version.go b/pkg/sentry/fsimpl/proc/version.go
deleted file mode 100644
index e1643d4e0..000000000
--- a/pkg/sentry/fsimpl/proc/version.go
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package proc
-
-import (
-	"bytes"
-	"fmt"
-
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-// versionData implements vfs.DynamicBytesSource for /proc/version.
-//
-// +stateify savable
-type versionData struct {
-	// k is the owning Kernel.
-	k *kernel.Kernel
-}
-
-var _ vfs.DynamicBytesSource = (*versionData)(nil)
-
-// Generate implements vfs.DynamicBytesSource.Generate.
-func (v *versionData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	init := v.k.GlobalInit()
-	if init == nil {
-		// Attempted to read before the init Task is created. This can
-		// only occur during startup, which should never need to read
-		// this file.
-		panic("Attempted to read version before initial Task is available")
-	}
-
-	// /proc/version takes the form:
-	//
-	// "SYSNAME version RELEASE (COMPILE_USER@COMPILE_HOST)
-	// (COMPILER_VERSION) VERSION"
-	//
-	// where:
-	// - SYSNAME, RELEASE, and VERSION are the same as returned by
-	// sys_utsname
-	// - COMPILE_USER is the user that build the kernel
-	// - COMPILE_HOST is the hostname of the machine on which the kernel
-	// was built
-	// - COMPILER_VERSION is the version reported by the building compiler
-	//
-	// Since we don't really want to expose build information to
-	// applications, those fields are omitted.
-	//
-	// FIXME(mpratt): Using Version from the init task SyscallTable
-	// disregards the different version a task may have (e.g., in a uts
-	// namespace).
-	ver := init.Leader().SyscallTable().Version
-	fmt.Fprintf(buf, "%s version %s %s\n", ver.Sysname, ver.Release, ver.Version)
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD
new file mode 100644
index 000000000..067c1657f
--- /dev/null
+++ b/pkg/sentry/fsimpl/signalfd/BUILD
@@ -0,0 +1,20 @@
+load("//tools:defs.bzl", "go_library")
+
+package(licenses = ["notice"])
+
+go_library(
+    name = "signalfd",
+    srcs = ["signalfd.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/binary",
+        "//pkg/context",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
new file mode 100644
index 000000000..6297e1df4
--- /dev/null
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -0,0 +1,136 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package signalfd
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// SignalFileDescription implements FileDescriptionImpl for signal fds.
+type SignalFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// target is the original signal target task.
+	//
+	// The semantics here are a bit broken. Linux will always use current
+	// for all reads, regardless of where the signalfd originated. We can't
+	// do exactly that because we need to plumb the context through
+	// EventRegister in order to support proper blocking behavior. This
+	// will undoubtedly become very complicated quickly.
+	target *kernel.Task
+
+	// mu protects mask.
+	mu sync.Mutex
+
+	// mask is the signal mask. Protected by mu.
+	mask linux.SignalSet
+}
+
+var _ vfs.FileDescriptionImpl = (*SignalFileDescription)(nil)
+
+// New creates a new signal fd.
+func New(vfsObj *vfs.VirtualFilesystem, target *kernel.Task, mask linux.SignalSet, flags uint32) (*vfs.FileDescription, error) {
+	vd := vfsObj.NewAnonVirtualDentry("[signalfd]")
+	defer vd.DecRef(target)
+	sfd := &SignalFileDescription{
+		target: target,
+		mask:   mask,
+	}
+	if err := sfd.vfsfd.Init(sfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &sfd.vfsfd, nil
+}
+
+// Mask returns the signal mask.
+func (sfd *SignalFileDescription) Mask() linux.SignalSet {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	return sfd.mask
+}
+
+// SetMask sets the signal mask.
+func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	sfd.mask = mask
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	// Attempt to dequeue relevant signals.
+	info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
+	if err != nil {
+		// There must be no signal available.
+		return 0, syserror.ErrWouldBlock
+	}
+
+	// Copy out the signal info using the specified format.
+	var buf [128]byte
+	binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+		Signo:   uint32(info.Signo),
+		Errno:   info.Errno,
+		Code:    info.Code,
+		PID:     uint32(info.Pid()),
+		UID:     uint32(info.Uid()),
+		Status:  info.Status(),
+		Overrun: uint32(info.Overrun()),
+		Addr:    info.Addr(),
+	})
+	n, err := dst.CopyOut(ctx, buf[:])
+	return int64(n), err
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (sfd *SignalFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	if mask&waiter.EventIn != 0 && sfd.target.PendingSignals()&sfd.mask != 0 {
+		return waiter.EventIn // Pending signals.
+	}
+	return 0
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (sfd *SignalFileDescription) EventRegister(entry *waiter.Entry, _ waiter.EventMask) {
+	sfd.mu.Lock()
+	defer sfd.mu.Unlock()
+	// Register for the signal set; ignore the passed events.
+	sfd.target.SignalRegister(entry, waiter.EventMask(sfd.mask))
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
+	// Unregister the original entry.
+	sfd.target.SignalUnregister(entry)
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/BUILD b/pkg/sentry/fsimpl/sockfs/BUILD
new file mode 100644
index 000000000..9453277b8
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/BUILD
@@ -0,0 +1,18 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "sockfs",
+    srcs = ["sockfs.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
new file mode 100644
index 000000000..c61818ff6
--- /dev/null
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -0,0 +1,109 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sockfs provides a filesystem implementation for anonymous sockets.
+package sockfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// filesystemType implements vfs.FilesystemType.
+type filesystemType struct{}
+
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	panic("sockfs.filesystemType.GetFilesystem should never be called")
+}
+
+// Name implements FilesystemType.Name.
+//
+// Note that registering sockfs is unnecessary, except for the fact that it
+// will not show up under /proc/filesystems as a result. This is a very minor
+// discrepancy from Linux.
+func (filesystemType) Name() string {
+	return "sockfs"
+}
+
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// NewFilesystem sets up and returns a new sockfs filesystem.
+//
+// Note that there should only ever be one instance of sockfs.Filesystem,
+// backing a global socket mount.
+func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, err
+	}
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.Filesystem.VFSFilesystem().Init(vfsObj, filesystemType{}, fs)
+	return fs.Filesystem.VFSFilesystem(), nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release(ctx)
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	inode := vd.Dentry().Impl().(*kernfs.Dentry).Inode().(*inode)
+	b.PrependComponent(fmt.Sprintf("socket:[%d]", inode.InodeAttrs.Ino()))
+	return vfs.PrependPathSyntheticError{}
+}
+
+// inode implements kernfs.Inode.
+type inode struct {
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return nil, syserror.ENXIO
+}
+
+// NewDentry constructs and returns a sockfs dentry.
+//
+// Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
+func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry {
+	fs := mnt.Filesystem().Impl().(*filesystem)
+
+	// File mode matches net/socket.c:sock_alloc.
+	filemode := linux.FileMode(linux.S_IFSOCK | 0600)
+	i := &inode{}
+	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
+
+	d := &kernfs.Dentry{}
+	d.Init(i)
+	return d.VFSDentry()
+}
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
new file mode 100644
index 000000000..1b548ccd4
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -0,0 +1,34 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+
+licenses(["notice"])
+
+go_library(
+    name = "sys",
+    srcs = [
+        "sys.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/sentry/fsimpl/kernfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "sys_test",
+    srcs = ["sys_test.go"],
+    deps = [
+        ":sys",
+        "//pkg/abi/linux",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
new file mode 100644
index 000000000..0401726b6
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -0,0 +1,159 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package sys implements sysfs.
+package sys
+
+import (
+	"bytes"
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "sysfs"
+const defaultSysDirMode = linux.FileMode(0755)
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	kernfs.Filesystem
+
+	devMinor uint32
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		devMinor: devMinor,
+	}
+	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
+
+	root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+		"block": fs.newDir(creds, defaultSysDirMode, nil),
+		"bus":   fs.newDir(creds, defaultSysDirMode, nil),
+		"class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+			"power_supply": fs.newDir(creds, defaultSysDirMode, nil),
+		}),
+		"dev": fs.newDir(creds, defaultSysDirMode, nil),
+		"devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+			"system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+				"cpu": cpuDir(ctx, fs, creds),
+			}),
+		}),
+		"firmware": fs.newDir(creds, defaultSysDirMode, nil),
+		"fs":       fs.newDir(creds, defaultSysDirMode, nil),
+		"kernel":   fs.newDir(creds, defaultSysDirMode, nil),
+		"module":   fs.newDir(creds, defaultSysDirMode, nil),
+		"power":    fs.newDir(creds, defaultSysDirMode, nil),
+	})
+	return fs.VFSFilesystem(), root.VFSDentry(), nil
+}
+
+func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+	k := kernel.KernelFromContext(ctx)
+	maxCPUCores := k.ApplicationCores()
+	children := map[string]*kernfs.Dentry{
+		"online":   fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+		"possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+		"present":  fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+	}
+	for i := uint(0); i < maxCPUCores; i++ {
+		children[fmt.Sprintf("cpu%d", i)] = fs.newDir(creds, linux.FileMode(0555), nil)
+	}
+	return fs.newDir(creds, defaultSysDirMode, children)
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.Filesystem.Release(ctx)
+}
+
+// dir implements kernfs.Inode.
+type dir struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNotSymlink
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.OrderedChildren
+
+	locks vfs.FileLocks
+
+	dentry kernfs.Dentry
+}
+
+func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+	d := &dir{}
+	d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	d.dentry.Init(d)
+
+	d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
+
+	return &d.dentry
+}
+
+// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
+	return syserror.EPERM
+}
+
+// Open implements kernfs.Inode.Open.
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+	if err != nil {
+		return nil, err
+	}
+	return fd.VFSFileDescription(), nil
+}
+
+// cpuFile implements kernfs.Inode.
+type cpuFile struct {
+	kernfs.DynamicBytesFile
+	maxCores uint
+}
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	fmt.Fprintf(buf, "0-%d\n", c.maxCores-1)
+	return nil
+}
+
+func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
+	c := &cpuFile{maxCores: maxCores}
+	c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
+	d := &kernfs.Dentry{}
+	d.Init(c)
+	return d
+}
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
new file mode 100644
index 000000000..9fd38b295
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -0,0 +1,89 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys_test
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func newTestSystem(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Failed to create test kernel: %v", err)
+	}
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+	k.VFS().MustRegisterFilesystemType(sys.Name, sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create new mount namespace: %v", err)
+	}
+	return testutil.NewSystem(ctx, t, k.VFS(), mns)
+}
+
+func TestReadCPUFile(t *testing.T) {
+	s := newTestSystem(t)
+	defer s.Destroy()
+	k := kernel.KernelFromContext(s.Ctx)
+	maxCPUCores := k.ApplicationCores()
+
+	expected := fmt.Sprintf("0-%d\n", maxCPUCores-1)
+
+	for _, fname := range []string{"online", "possible", "present"} {
+		pop := s.PathOpAtRoot(fmt.Sprintf("devices/system/cpu/%s", fname))
+		fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{})
+		if err != nil {
+			t.Fatalf("OpenAt(pop:%+v) = %+v failed: %v", pop, fd, err)
+		}
+		defer fd.DecRef(s.Ctx)
+		content, err := s.ReadToEnd(fd)
+		if err != nil {
+			t.Fatalf("Read failed: %v", err)
+		}
+		if diff := cmp.Diff(expected, content); diff != "" {
+			t.Fatalf("Read returned unexpected data:\n--- want\n+++ got\n%v", diff)
+		}
+	}
+}
+
+func TestSysRootContainsExpectedEntries(t *testing.T) {
+	s := newTestSystem(t)
+	defer s.Destroy()
+	pop := s.PathOpAtRoot("/")
+	s.AssertAllDirentTypes(s.ListDirents(pop), map[string]testutil.DirentType{
+		"block":    linux.DT_DIR,
+		"bus":      linux.DT_DIR,
+		"class":    linux.DT_DIR,
+		"dev":      linux.DT_DIR,
+		"devices":  linux.DT_DIR,
+		"firmware": linux.DT_DIR,
+		"fs":       linux.DT_DIR,
+		"kernel":   linux.DT_DIR,
+		"module":   linux.DT_DIR,
+		"power":    linux.DT_DIR,
+	})
+}
diff --git a/pkg/sentry/fsimpl/testutil/BUILD b/pkg/sentry/fsimpl/testutil/BUILD
new file mode 100644
index 000000000..400a97996
--- /dev/null
+++ b/pkg/sentry/fsimpl/testutil/BUILD
@@ -0,0 +1,37 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "testutil",
+    testonly = 1,
+    srcs = [
+        "kernel.go",
+        "testutil.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/cpuid",
+        "//pkg/fspath",
+        "//pkg/memutil",
+        "//pkg/sentry/fsbridge",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/sched",
+        "//pkg/sentry/limits",
+        "//pkg/sentry/loader",
+        "//pkg/sentry/mm",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/platform/kvm",
+        "//pkg/sentry/platform/ptrace",
+        "//pkg/sentry/time",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/usermem",
+        "@com_github_google_go_cmp//cmp:go_default_library",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
new file mode 100644
index 000000000..1813269e0
--- /dev/null
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -0,0 +1,180 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package testutil
+
+import (
+	"flag"
+	"fmt"
+	"os"
+	"runtime"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/cpuid"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/memutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
+	"gvisor.dev/gvisor/pkg/sentry/limits"
+	"gvisor.dev/gvisor/pkg/sentry/loader"
+	"gvisor.dev/gvisor/pkg/sentry/mm"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/platform"
+	"gvisor.dev/gvisor/pkg/sentry/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+
+	// Platforms are plugable.
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/kvm"
+	_ "gvisor.dev/gvisor/pkg/sentry/platform/ptrace"
+)
+
+var (
+	platformFlag = flag.String("platform", "ptrace", "specify which platform to use")
+)
+
+// Boot initializes a new bare bones kernel for test.
+func Boot() (*kernel.Kernel, error) {
+	platformCtr, err := platform.Lookup(*platformFlag)
+	if err != nil {
+		return nil, fmt.Errorf("platform not found: %v", err)
+	}
+	deviceFile, err := platformCtr.OpenDevice()
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+	plat, err := platformCtr.New(deviceFile)
+	if err != nil {
+		return nil, fmt.Errorf("creating platform: %v", err)
+	}
+
+	kernel.VFS2Enabled = true
+	k := &kernel.Kernel{
+		Platform: plat,
+	}
+
+	mf, err := createMemoryFile()
+	if err != nil {
+		return nil, err
+	}
+	k.SetMemoryFile(mf)
+
+	// Pass k as the platform since it is savable, unlike the actual platform.
+	vdso, err := loader.PrepareVDSO(k)
+	if err != nil {
+		return nil, fmt.Errorf("creating vdso: %v", err)
+	}
+
+	// Create timekeeper.
+	tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
+	if err != nil {
+		return nil, fmt.Errorf("creating timekeeper: %v", err)
+	}
+	tk.SetClocks(time.NewCalibratedClocks())
+
+	creds := auth.NewRootCredentials(auth.NewRootUserNamespace())
+
+	// Initiate the Kernel object, which is required by the Context passed
+	// to createVFS in order to mount (among other things) procfs.
+	if err = k.Init(kernel.InitKernelArgs{
+		ApplicationCores:            uint(runtime.GOMAXPROCS(-1)),
+		FeatureSet:                  cpuid.HostFeatureSet(),
+		Timekeeper:                  tk,
+		RootUserNamespace:           creds.UserNamespace,
+		Vdso:                        vdso,
+		RootUTSNamespace:            kernel.NewUTSNamespace("hostname", "domain", creds.UserNamespace),
+		RootIPCNamespace:            kernel.NewIPCNamespace(creds.UserNamespace),
+		RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		PIDNamespace:                kernel.NewRootPIDNamespace(creds.UserNamespace),
+	}); err != nil {
+		return nil, fmt.Errorf("initializing kernel: %v", err)
+	}
+
+	k.VFS().MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+		AllowUserList:  true,
+	})
+
+	ls, err := limits.NewLinuxLimitSet()
+	if err != nil {
+		return nil, err
+	}
+	tg := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, ls)
+	k.TestOnly_SetGlobalInit(tg)
+
+	return k, nil
+}
+
+// CreateTask creates a new bare bones task for tests.
+func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns *vfs.MountNamespace, root, cwd vfs.VirtualDentry) (*kernel.Task, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		return nil, fmt.Errorf("cannot find kernel from context")
+	}
+
+	exe, err := newFakeExecutable(ctx, k.VFS(), auth.CredentialsFromContext(ctx), root)
+	if err != nil {
+		return nil, err
+	}
+	m := mm.NewMemoryManager(k, k, k.SleepForAddressSpaceActivation)
+	m.SetExecutable(ctx, fsbridge.NewVFSFile(exe))
+
+	config := &kernel.TaskConfig{
+		Kernel:                  k,
+		ThreadGroup:             tc,
+		TaskContext:             &kernel.TaskContext{Name: name, MemoryManager: m},
+		Credentials:             auth.CredentialsFromContext(ctx),
+		NetworkNamespace:        k.RootNetworkNamespace(),
+		AllowedCPUMask:          sched.NewFullCPUSet(k.ApplicationCores()),
+		UTSNamespace:            kernel.UTSNamespaceFromContext(ctx),
+		IPCNamespace:            kernel.IPCNamespaceFromContext(ctx),
+		AbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
+		MountNamespaceVFS2:      mntns,
+		FSContext:               kernel.NewFSContextVFS2(root, cwd, 0022),
+		FDTable:                 k.NewFDTable(),
+	}
+	return k.TaskSet().NewTask(config)
+}
+
+func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) {
+	const name = "executable"
+	pop := &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(name),
+	}
+	opts := &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_CREAT,
+		Mode:  0777,
+	}
+	return vfsObj.OpenAt(ctx, creds, pop, opts)
+}
+
+func createMemoryFile() (*pgalloc.MemoryFile, error) {
+	const memfileName = "test-memory"
+	memfd, err := memutil.CreateMemFD(memfileName, 0)
+	if err != nil {
+		return nil, fmt.Errorf("error creating memfd: %v", err)
+	}
+	memfile := os.NewFile(uintptr(memfd), memfileName)
+	mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
+	if err != nil {
+		memfile.Close()
+		return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
+	}
+	return mf, nil
+}
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
new file mode 100644
index 000000000..568132121
--- /dev/null
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -0,0 +1,284 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package testutil provides common test utilities for kernfs-based
+// filesystems.
+package testutil
+
+import (
+	"fmt"
+	"io"
+	"strings"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// System represents the context for a single test.
+//
+// Test systems must be explicitly destroyed with System.Destroy.
+type System struct {
+	t     *testing.T
+	Ctx   context.Context
+	Creds *auth.Credentials
+	VFS   *vfs.VirtualFilesystem
+	Root  vfs.VirtualDentry
+	MntNs *vfs.MountNamespace
+}
+
+// NewSystem constructs a System.
+//
+// Precondition: Caller must hold a reference on MntNs, whose ownership
+// is transferred to the new System.
+func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
+	s := &System{
+		t:     t,
+		Ctx:   ctx,
+		Creds: auth.CredentialsFromContext(ctx),
+		VFS:   v,
+		MntNs: mns,
+		Root:  mns.Root(),
+	}
+	return s
+}
+
+// WithSubtest creates a temporary test system with a new test harness,
+// referencing all other resources from the original system. This is useful when
+// a system is reused for multiple subtests, and the T needs to change for each
+// case. Note that this is safe when test cases run in parallel, as all
+// resources referenced by the system are immutable, or handle interior
+// mutations in a thread-safe manner.
+//
+// The returned system must not outlive the original and should not be destroyed
+// via System.Destroy.
+func (s *System) WithSubtest(t *testing.T) *System {
+	return &System{
+		t:     t,
+		Ctx:   s.Ctx,
+		Creds: s.Creds,
+		VFS:   s.VFS,
+		MntNs: s.MntNs,
+		Root:  s.Root,
+	}
+}
+
+// WithTemporaryContext constructs a temporary test system with a new context
+// ctx. The temporary system borrows all resources and references from the
+// original system. The returned temporary system must not outlive the original
+// system, and should not be destroyed via System.Destroy.
+func (s *System) WithTemporaryContext(ctx context.Context) *System {
+	return &System{
+		t:     s.t,
+		Ctx:   ctx,
+		Creds: s.Creds,
+		VFS:   s.VFS,
+		MntNs: s.MntNs,
+		Root:  s.Root,
+	}
+}
+
+// Destroy release resources associated with a test system.
+func (s *System) Destroy() {
+	s.Root.DecRef(s.Ctx)
+	s.MntNs.DecRef(s.Ctx) // Reference on MntNs passed to NewSystem.
+}
+
+// ReadToEnd reads the contents of fd until EOF to a string.
+func (s *System) ReadToEnd(fd *vfs.FileDescription) (string, error) {
+	buf := make([]byte, usermem.PageSize)
+	bufIOSeq := usermem.BytesIOSequence(buf)
+	opts := vfs.ReadOptions{}
+
+	var content strings.Builder
+	for {
+		n, err := fd.Read(s.Ctx, bufIOSeq, opts)
+		if n == 0 || err != nil {
+			if err == io.EOF {
+				err = nil
+			}
+			return content.String(), err
+		}
+		content.Write(buf[:n])
+	}
+}
+
+// PathOpAtRoot constructs a PathOperation with the given path from
+// the root of the filesystem.
+func (s *System) PathOpAtRoot(path string) *vfs.PathOperation {
+	return &vfs.PathOperation{
+		Root:  s.Root,
+		Start: s.Root,
+		Path:  fspath.Parse(path),
+	}
+}
+
+// GetDentryOrDie attempts to resolve a dentry referred to by the
+// provided path operation. If unsuccessful, the test fails.
+func (s *System) GetDentryOrDie(pop *vfs.PathOperation) vfs.VirtualDentry {
+	vd, err := s.VFS.GetDentryAt(s.Ctx, s.Creds, pop, &vfs.GetDentryOptions{})
+	if err != nil {
+		s.t.Fatalf("GetDentryAt(pop:%+v) failed: %v", pop, err)
+	}
+	return vd
+}
+
+// DirentType is an alias for values for linux_dirent64.d_type.
+type DirentType = uint8
+
+// ListDirents lists the Dirents for a directory at pop.
+func (s *System) ListDirents(pop *vfs.PathOperation) *DirentCollector {
+	fd, err := s.VFS.OpenAt(s.Ctx, s.Creds, pop, &vfs.OpenOptions{Flags: linux.O_RDONLY})
+	if err != nil {
+		s.t.Fatalf("OpenAt for PathOperation %+v failed: %v", pop, err)
+	}
+	defer fd.DecRef(s.Ctx)
+
+	collector := &DirentCollector{}
+	if err := fd.IterDirents(s.Ctx, collector); err != nil {
+		s.t.Fatalf("IterDirent failed: %v", err)
+	}
+	return collector
+}
+
+// AssertAllDirentTypes verifies that the set of dirents in collector contains
+// exactly the specified set of expected entries. AssertAllDirentTypes respects
+// collector.skipDots, and implicitly checks for "." and ".." accordingly.
+func (s *System) AssertAllDirentTypes(collector *DirentCollector, expected map[string]DirentType) {
+	if expected == nil {
+		expected = make(map[string]DirentType)
+	}
+	// Also implicitly check for "." and "..", if enabled.
+	if !collector.skipDots {
+		expected["."] = linux.DT_DIR
+		expected[".."] = linux.DT_DIR
+	}
+
+	dentryTypes := make(map[string]DirentType)
+	collector.mu.Lock()
+	for _, dirent := range collector.dirents {
+		dentryTypes[dirent.Name] = dirent.Type
+	}
+	collector.mu.Unlock()
+	if diff := cmp.Diff(expected, dentryTypes); diff != "" {
+		s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+// AssertDirentOffsets verifies that collector contains at least the entries
+// specified in expected, with the given NextOff field. Entries specified in
+// expected but missing from collector result in failure. Extra entries in
+// collector are ignored. AssertDirentOffsets respects collector.skipDots, and
+// implicitly checks for "." and ".." accordingly.
+func (s *System) AssertDirentOffsets(collector *DirentCollector, expected map[string]int64) {
+	// Also implicitly check for "." and "..", if enabled.
+	if !collector.skipDots {
+		expected["."] = 1
+		expected[".."] = 2
+	}
+
+	dentryNextOffs := make(map[string]int64)
+	collector.mu.Lock()
+	for _, dirent := range collector.dirents {
+		// Ignore extra entries in dentries that are not in expected.
+		if _, ok := expected[dirent.Name]; ok {
+			dentryNextOffs[dirent.Name] = dirent.NextOff
+		}
+	}
+	collector.mu.Unlock()
+	if diff := cmp.Diff(expected, dentryNextOffs); diff != "" {
+		s.t.Fatalf("IterDirent had unexpected results:\n--- want\n+++ got\n%v", diff)
+	}
+}
+
+// DirentCollector provides an implementation for vfs.IterDirentsCallback for
+// testing. It simply iterates to the end of a given directory FD and collects
+// all dirents emitted by the callback.
+type DirentCollector struct {
+	mu      sync.Mutex
+	order   []*vfs.Dirent
+	dirents map[string]*vfs.Dirent
+	// When the collector is used in various Assert* functions, should "." and
+	// ".." be implicitly checked?
+	skipDots bool
+}
+
+// SkipDotsChecks enables or disables the implicit checks on "." and ".." when
+// the collector is used in various Assert* functions. Note that "." and ".."
+// are still collected if passed to d.Handle, so the caller should only disable
+// the checks when they aren't expected.
+func (d *DirentCollector) SkipDotsChecks(value bool) {
+	d.skipDots = value
+}
+
+// Handle implements vfs.IterDirentsCallback.Handle.
+func (d *DirentCollector) Handle(dirent vfs.Dirent) error {
+	d.mu.Lock()
+	if d.dirents == nil {
+		d.dirents = make(map[string]*vfs.Dirent)
+	}
+	d.order = append(d.order, &dirent)
+	d.dirents[dirent.Name] = &dirent
+	d.mu.Unlock()
+	return nil
+}
+
+// Count returns the number of dirents currently in the collector.
+func (d *DirentCollector) Count() int {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	return len(d.dirents)
+}
+
+// Contains checks whether the collector has a dirent with the given name and
+// type.
+func (d *DirentCollector) Contains(name string, typ uint8) error {
+	d.mu.Lock()
+	defer d.mu.Unlock()
+	dirent, ok := d.dirents[name]
+	if !ok {
+		return fmt.Errorf("No dirent named %q found", name)
+	}
+	if dirent.Type != typ {
+		return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
+	}
+	return nil
+}
+
+// Dirents returns all dirents discovered by this collector.
+func (d *DirentCollector) Dirents() map[string]*vfs.Dirent {
+	d.mu.Lock()
+	dirents := make(map[string]*vfs.Dirent)
+	for n, d := range d.dirents {
+		dirents[n] = d
+	}
+	d.mu.Unlock()
+	return dirents
+}
+
+// OrderedDirents returns an ordered list of dirents as discovered by this
+// collector.
+func (d *DirentCollector) OrderedDirents() []*vfs.Dirent {
+	d.mu.Lock()
+	dirents := make([]*vfs.Dirent, len(d.order))
+	copy(dirents, d.order)
+	d.mu.Unlock()
+	return dirents
+}
diff --git a/pkg/sentry/fsimpl/timerfd/BUILD b/pkg/sentry/fsimpl/timerfd/BUILD
new file mode 100644
index 000000000..fbb02a271
--- /dev/null
+++ b/pkg/sentry/fsimpl/timerfd/BUILD
@@ -0,0 +1,17 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "timerfd",
+    srcs = ["timerfd.go"],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/context",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+        "//pkg/waiter",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
new file mode 100644
index 000000000..86beaa0a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -0,0 +1,144 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package timerfd implements timer fds.
+package timerfd
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/context"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// implements ktime.TimerListener.
+type TimerFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	events waiter.Queue
+	timer  *ktime.Timer
+
+	// val is the number of timer expirations since the last successful
+	// call to PRead, or SetTime. val must be accessed using atomic memory
+	// operations.
+	val uint64
+}
+
+var _ vfs.FileDescriptionImpl = (*TimerFileDescription)(nil)
+var _ ktime.TimerListener = (*TimerFileDescription)(nil)
+
+// New returns a new timer fd.
+func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock, flags uint32) (*vfs.FileDescription, error) {
+	vd := vfsObj.NewAnonVirtualDentry("[timerfd]")
+	defer vd.DecRef(ctx)
+	tfd := &TimerFileDescription{}
+	tfd.timer = ktime.NewTimer(clock, tfd)
+	if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{
+		UseDentryMetadata: true,
+		DenyPRead:         true,
+		DenyPWrite:        true,
+	}); err != nil {
+		return nil, err
+	}
+	return &tfd.vfsfd, nil
+}
+
+// Read implements FileDescriptionImpl.Read.
+func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	const sizeofUint64 = 8
+	if dst.NumBytes() < sizeofUint64 {
+		return 0, syserror.EINVAL
+	}
+	if val := atomic.SwapUint64(&tfd.val, 0); val != 0 {
+		var buf [sizeofUint64]byte
+		usermem.ByteOrder.PutUint64(buf[:], val)
+		if _, err := dst.CopyOut(ctx, buf[:]); err != nil {
+			// Linux does not undo consuming the number of
+			// expirations even if writing to userspace fails.
+			return 0, err
+		}
+		return sizeofUint64, nil
+	}
+	return 0, syserror.ErrWouldBlock
+}
+
+// Clock returns the timer fd's Clock.
+func (tfd *TimerFileDescription) Clock() ktime.Clock {
+	return tfd.timer.Clock()
+}
+
+// GetTime returns the associated Timer's setting and the time at which it was
+// observed.
+func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) {
+	return tfd.timer.Get()
+}
+
+// SetTime atomically changes the associated Timer's setting, resets the number
+// of expirations to 0, and returns the previous setting and the time at which
+// it was observed.
+func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) {
+	return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) })
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	var ready waiter.EventMask
+	if atomic.LoadUint64(&tfd.val) != 0 {
+		ready |= waiter.EventIn
+	}
+	return ready
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	tfd.events.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) {
+	tfd.events.EventUnregister(e)
+}
+
+// PauseTimer pauses the associated Timer.
+func (tfd *TimerFileDescription) PauseTimer() {
+	tfd.timer.Pause()
+}
+
+// ResumeTimer resumes the associated Timer.
+func (tfd *TimerFileDescription) ResumeTimer() {
+	tfd.timer.Resume()
+}
+
+// Release implements FileDescriptionImpl.Release()
+func (tfd *TimerFileDescription) Release(context.Context) {
+	tfd.timer.Destroy()
+}
+
+// Notify implements ktime.TimerListener.Notify.
+func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) {
+	atomic.AddUint64(&tfd.val, exp)
+	tfd.events.Notify(waiter.EventIn)
+	return ktime.Setting{}, false
+}
+
+// Destroy implements ktime.TimerListener.Destroy.
+func (tfd *TimerFileDescription) Destroy() {}
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
new file mode 100644
index 000000000..5cd428d64
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -0,0 +1,125 @@
+load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+licenses(["notice"])
+
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "tmpfs",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dentry",
+        "Linker": "*dentry",
+    },
+)
+
+go_template_instance(
+    name = "fstree",
+    out = "fstree.go",
+    package = "tmpfs",
+    prefix = "generic",
+    template = "//pkg/sentry/vfs/genericfstree:generic_fstree",
+    types = {
+        "Dentry": "dentry",
+    },
+)
+
+go_template_instance(
+    name = "inode_refs",
+    out = "inode_refs.go",
+    package = "tmpfs",
+    prefix = "inode",
+    template = "//pkg/refs_vfs2:refs_template",
+    types = {
+        "T": "inode",
+    },
+)
+
+go_library(
+    name = "tmpfs",
+    srcs = [
+        "dentry_list.go",
+        "device_file.go",
+        "directory.go",
+        "filesystem.go",
+        "fstree.go",
+        "inode_refs.go",
+        "named_pipe.go",
+        "regular_file.go",
+        "socket_file.go",
+        "symlink.go",
+        "tmpfs.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/amutex",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/safemem",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/kernel/time",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/uniqueid",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/vfs",
+        "//pkg/sentry/vfs/memxattr",
+        "//pkg/sync",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
+
+go_test(
+    name = "benchmark_test",
+    size = "small",
+    srcs = ["benchmark_test.go"],
+    deps = [
+        ":tmpfs",
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/refs",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "tmpfs_test",
+    size = "small",
+    srcs = [
+        "pipe_test.go",
+        "regular_file_test.go",
+        "stat_test.go",
+        "tmpfs_test.go",
+    ],
+    library = ":tmpfs",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/contexttest",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index a94b17db6..d263147c2 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -21,11 +21,13 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -81,7 +83,7 @@ func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent
 	}
 
 	err = fn(root, d)
-	d.DecRef()
+	d.DecRef(ctx)
 	return err
 }
 
@@ -103,17 +105,17 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create mount namespace: %v", err)
 			}
-			defer mntns.DecRef()
+			defer mntns.DecRef(ctx)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
 
 			// Create nested directories with given depth.
 			root := mntns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			d := root
 			d.IncRef()
-			defer d.DecRef()
+			defer d.DecRef(ctx)
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
@@ -123,7 +125,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 				if err != nil {
 					b.Fatalf("failed to walk to directory %q: %v", name, err)
 				}
-				d.DecRef()
+				d.DecRef(ctx)
 				d = next
 				filePathBuilder.WriteString(name)
 				filePathBuilder.WriteByte('/')
@@ -134,7 +136,7 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
-			file.DecRef()
+			file.DecRef(ctx)
 			filePathBuilder.WriteString(filename)
 			filePath := filePathBuilder.String()
 
@@ -160,39 +162,46 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 					b.Fatalf("stat(%q) failed: %v", filePath, err)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
 
-func BenchmarkVFS2MemfsStat(b *testing.B) {
+func BenchmarkVFS2TmpfsStat(b *testing.B) {
 	for _, depth := range depths {
 		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
 			ctx := contexttest.Context(b)
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(ctx); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
+			defer mntns.DecRef(ctx)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
 
 			// Create nested directories with given depth.
 			root := mntns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			vd := root
 			vd.IncRef()
-			defer vd.DecRef()
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -203,7 +212,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 				if err != nil {
 					b.Fatalf("failed to walk to directory %q: %v", name, err)
 				}
-				vd.DecRef()
+				vd.DecRef(ctx)
 				vd = nextVD
 				filePathBuilder.WriteString(name)
 				filePathBuilder.WriteByte('/')
@@ -213,16 +222,18 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
 				Mode:  0644,
 			})
+			vd.DecRef(ctx)
+			vd = vfs.VirtualDentry{}
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
-			defer fd.DecRef()
+			defer fd.DecRef(ctx)
 			filePathBuilder.WriteString(filename)
 			filePath := filePathBuilder.String()
 
@@ -232,7 +243,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -243,6 +254,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
@@ -265,14 +278,14 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create mount namespace: %v", err)
 			}
-			defer mntns.DecRef()
+			defer mntns.DecRef(ctx)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
 
 			// Create and mount the submount.
 			root := mntns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
 				b.Fatalf("failed to create mount point: %v", err)
 			}
@@ -280,7 +293,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to walk to mount point: %v", err)
 			}
-			defer mountPoint.DecRef()
+			defer mountPoint.DecRef(ctx)
 			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
 			if err != nil {
 				b.Fatalf("failed to create tmpfs submount: %v", err)
@@ -296,7 +309,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to walk to mount root: %v", err)
 			}
-			defer d.DecRef()
+			defer d.DecRef(ctx)
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
@@ -306,7 +319,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 				if err != nil {
 					b.Fatalf("failed to walk to directory %q: %v", name, err)
 				}
-				d.DecRef()
+				d.DecRef(ctx)
 				d = next
 				filePathBuilder.WriteString(name)
 				filePathBuilder.WriteByte('/')
@@ -317,7 +330,7 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
-			file.DecRef()
+			file.DecRef(ctx)
 			filePathBuilder.WriteString(filename)
 			filePath := filePathBuilder.String()
 
@@ -343,34 +356,42 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 					b.Fatalf("stat(%q) failed: %v", filePath, err)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
 
-func BenchmarkVFS2MemfsMountStat(b *testing.B) {
+func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 	for _, depth := range depths {
 		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
 			ctx := contexttest.Context(b)
 			creds := auth.CredentialsFromContext(ctx)
 
 			// Create VFS.
-			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			vfsObj := vfs.VirtualFilesystem{}
+			if err := vfsObj.Init(ctx); err != nil {
+				b.Fatalf("VFS init: %v", err)
+			}
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
+			defer mntns.DecRef(ctx)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
 
 			// Create the mount point.
 			root := mntns.Root()
-			defer root.DecRef()
+			defer root.DecRef(ctx)
 			pop := vfs.PathOperation{
-				Root:     root,
-				Start:    root,
-				Pathname: mountPointName,
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(mountPointName),
 			}
 			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 				Mode: 0755,
@@ -382,9 +403,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to walk to mount point: %v", err)
 			}
-			defer mountPoint.DecRef()
+			defer mountPoint.DecRef(ctx)
 			// Create and mount the submount.
-			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil {
+			if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
@@ -395,13 +416,12 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to walk to mount root: %v", err)
 			}
-			defer vd.DecRef()
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -412,33 +432,27 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				if err != nil {
 					b.Fatalf("failed to walk to directory %q: %v", name, err)
 				}
-				vd.DecRef()
+				vd.DecRef(ctx)
 				vd = nextVD
 				filePathBuilder.WriteString(name)
 				filePathBuilder.WriteByte('/')
 			}
 
-			// Verify that we didn't create any directories under the mount
-			// point (i.e. they were all created on the submount).
-			firstDirName := fmt.Sprintf("%d", depth)
-			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
-				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
-			}
-
 			// Create the file that will be stat'd.
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
 				Mode:  0644,
 			})
+			vd.DecRef(ctx)
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
-			fd.DecRef()
+			fd.DecRef(ctx)
 			filePathBuilder.WriteString(filename)
 			filePath := filePathBuilder.String()
 
@@ -448,7 +462,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -459,6 +473,14 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
+
+func init() {
+	// Turn off reference leak checking for a fair comparison between vfs1 and
+	// vfs2.
+	refs.SetLeakMode(refs.NoLeakChecking)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
new file mode 100644
index 000000000..ac54d420d
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -0,0 +1,49 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type deviceFile struct {
+	inode inode
+	kind  vfs.DeviceKind
+	major uint32
+	minor uint32
+}
+
+func (fs *filesystem) newDeviceFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, kind vfs.DeviceKind, major, minor uint32) *inode {
+	file := &deviceFile{
+		kind:  kind,
+		major: major,
+		minor: minor,
+	}
+	switch kind {
+	case vfs.BlockDevice:
+		mode |= linux.S_IFBLK
+	case vfs.CharDevice:
+		mode |= linux.S_IFCHR
+	default:
+		panic(fmt.Sprintf("invalid DeviceKind: %v", kind))
+	}
+	file.inode.init(file, fs, kuid, kgid, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
+}
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 0bd82e480..78b4fc5be 100644
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -12,55 +12,95 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 type directory struct {
-	inode inode
+	// Since directories can't be hard-linked, each directory can only be
+	// associated with a single dentry, which we can store in the directory
+	// struct.
+	dentry dentry
+	inode  inode
+
+	// childMap maps the names of the directory's children to their dentries.
+	// childMap is protected by filesystem.mu.
+	childMap map[string]*dentry
 
-	// childList is a list containing (1) child Dentries and (2) fake Dentries
+	// numChildren is len(childMap), but accessed using atomic memory
+	// operations to avoid locking in inode.statTo().
+	numChildren int64
+
+	// childList is a list containing (1) child dentries and (2) fake dentries
 	// (with inode == nil) that represent the iteration position of
 	// directoryFDs. childList is used to support directoryFD.IterDirents()
-	// efficiently. childList is protected by filesystem.mu.
+	// efficiently. childList is protected by iterMu.
+	iterMu    sync.Mutex
 	childList dentryList
 }
 
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
+func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *directory {
 	dir := &directory{}
-	dir.inode.init(dir, fs, creds, mode)
+	dir.inode.init(dir, fs, kuid, kgid, linux.S_IFDIR|mode)
 	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
-	return &dir.inode
+	dir.dentry.inode = &dir.inode
+	dir.dentry.vfsd.Init(&dir.dentry)
+	return dir
+}
+
+// Preconditions: filesystem.mu must be locked for writing. dir must not
+// already contain a child with the given name.
+func (dir *directory) insertChildLocked(child *dentry, name string) {
+	child.parent = &dir.dentry
+	child.name = name
+	if dir.childMap == nil {
+		dir.childMap = make(map[string]*dentry)
+	}
+	dir.childMap[name] = child
+	atomic.AddInt64(&dir.numChildren, 1)
+	dir.iterMu.Lock()
+	dir.childList.PushBack(child)
+	dir.iterMu.Unlock()
+}
+
+// Preconditions: filesystem.mu must be locked for writing.
+func (dir *directory) removeChildLocked(child *dentry) {
+	delete(dir.childMap, child.name)
+	atomic.AddInt64(&dir.numChildren, -1)
+	dir.iterMu.Lock()
+	dir.childList.Remove(child)
+	dir.iterMu.Unlock()
 }
 
-func (i *inode) isDir() bool {
-	_, ok := i.impl.(*directory)
-	return ok
+func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
+	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
 }
 
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 
-	// Protected by filesystem.mu.
+	// Protected by directory.iterMu.
 	iter *dentry
 	off  int64
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
+func (fd *directoryFD) Release(ctx context.Context) {
 	if fd.iter != nil {
-		fs := fd.filesystem()
 		dir := fd.inode().impl.(*directory)
-		fs.mu.Lock()
+		dir.iterMu.Lock()
 		dir.childList.Remove(fd.iter)
-		fs.mu.Unlock()
+		dir.iterMu.Unlock()
 		fd.iter = nil
 	}
 }
@@ -68,36 +108,43 @@ func (fd *directoryFD) Release() {
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents.
 func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
 	fs := fd.filesystem()
-	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+	dir := fd.inode().impl.(*directory)
+
+	defer fd.dentry().InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
 
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	// fs.mu is required to read d.parent and dentry.name.
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	dir.iterMu.Lock()
+	defer dir.iterMu.Unlock()
+
+	fd.inode().touchAtime(fd.vfsfd.Mount())
 
 	if fd.off == 0 {
-		if !cb.Handle(vfs.Dirent{
+		if err := cb.Handle(vfs.Dirent{
 			Name:    ".",
 			Type:    linux.DT_DIR,
-			Ino:     vfsd.Impl().(*dentry).inode.ino,
+			Ino:     dir.inode.ino,
 			NextOff: 1,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
+
 	if fd.off == 1 {
-		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
-		if !cb.Handle(vfs.Dirent{
+		parentInode := genericParentOrSelf(&dir.dentry).inode
+		if err := cb.Handle(vfs.Dirent{
 			Name:    "..",
 			Type:    parentInode.direntType(),
 			Ino:     parentInode.ino,
 			NextOff: 2,
-		}) {
-			return nil
+		}); err != nil {
+			return err
 		}
 		fd.off++
 	}
 
-	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
 	var child *dentry
 	if fd.iter == nil {
 		// Start iteration at the beginning of dir.
@@ -111,14 +158,14 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	for child != nil {
 		// Skip other directoryFD iterators.
 		if child.inode != nil {
-			if !cb.Handle(vfs.Dirent{
-				Name:    child.vfsd.Name(),
+			if err := cb.Handle(vfs.Dirent{
+				Name:    child.name,
 				Type:    child.inode.direntType(),
 				Ino:     child.inode.ino,
 				NextOff: fd.off + 1,
-			}) {
+			}); err != nil {
 				dir.childList.InsertBefore(child, fd.iter)
-				return nil
+				return err
 			}
 			fd.off++
 		}
@@ -130,9 +177,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fs := fd.filesystem()
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
+	dir := fd.inode().impl.(*directory)
+	dir.iterMu.Lock()
+	defer dir.iterMu.Unlock()
 
 	switch whence {
 	case linux.SEEK_SET:
@@ -160,8 +207,6 @@ func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (in
 		remChildren = offset - 2
 	}
 
-	dir := fd.inode().impl.(*directory)
-
 	// Ensure that fd.iter exists and is not linked into dir.childList.
 	if fd.iter == nil {
 		fd.iter = &dentry{}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
new file mode 100644
index 000000000..cb8b2d944
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -0,0 +1,860 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	dir, ok := d.inode.impl.(*directory)
+	if !ok {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
+	child, ok := dir.childMap[name]
+	if !ok {
+		return nil, syserror.ENOENT
+	}
+	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+		return nil, err
+	}
+	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// Symlink traversal updates access time.
+		child.inode.touchAtime(rp.Mount())
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
+	for !rp.Final() {
+		next, err := stepLocked(ctx, rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	dir, ok := d.inode.impl.(*directory)
+	if !ok {
+		return nil, syserror.ENOTDIR
+	}
+	return dir, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		next, err := stepLocked(ctx, rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	if len(name) > linux.NAME_MAX {
+		return syserror.ENAMETOOLONG
+	}
+	if _, ok := parentDir.childMap[name]; ok {
+		return syserror.EEXIST
+	}
+	if !dir && rp.MustBeDir() {
+		return syserror.ENOENT
+	}
+	// tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only
+	// be dead if it was deleted.
+	if parentDir.dentry.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := create(parentDir, name); err != nil {
+		return err
+	}
+
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
+	parentDir.inode.touchCMtime()
+	return nil
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		return err
+	}
+	return d.inode.checkPermissions(creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.inode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return nil, err
+	}
+	dir.dentry.IncRef()
+	return &dir.dentry.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		d := vd.Dentry().Impl().(*dentry)
+		i := d.inode
+		if i.isDir() {
+			return syserror.EPERM
+		}
+		if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+			return err
+		}
+		if i.nlink == 0 {
+			return syserror.ENOENT
+		}
+		if i.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		i.incLinksLocked()
+		i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */)
+		parentDir.insertChildLocked(fs.newDentry(i), name)
+		return nil
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
+		if parentDir.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		parentDir.inode.incLinksLocked() // from child's ".."
+		childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
+		parentDir.insertChildLocked(&childDir.dentry, name)
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
+		var childInode *inode
+		switch opts.Mode.FileType() {
+		case linux.S_IFREG:
+			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
+		case linux.S_IFIFO:
+			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode)
+		case linux.S_IFBLK:
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor)
+		case linux.S_IFCHR:
+			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor)
+		case linux.S_IFSOCK:
+			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint)
+		default:
+			return syserror.EINVAL
+		}
+		child := fs.newDentry(childInode)
+		parentDir.insertChildLocked(child, name)
+		return nil
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		// Not yet supported.
+		return nil, syserror.EOPNOTSUPP
+	}
+
+	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
+	// don't need fs.mu for writing.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.mu.RUnlock()
+		d, err := resolveLocked(ctx, rp)
+		if err != nil {
+			return nil, err
+		}
+		return d.open(ctx, rp, &opts, false /* afterCreate */)
+	}
+
+	mustCreate := opts.Flags&linux.O_EXCL != 0
+	start := rp.Start().Impl().(*dentry)
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() {
+		// Reject attempts to open mount root directory with O_CREAT.
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.open(ctx, rp, &opts, false /* afterCreate */)
+	}
+afterTrailingSymlink:
+	parentDir, err := walkParentDirLocked(ctx, rp, start)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return nil, syserror.EISDIR
+	}
+	if len(name) > linux.NAME_MAX {
+		return nil, syserror.ENAMETOOLONG
+	}
+	// Determine whether or not we need to create a file.
+	child, ok := parentDir.childMap[name]
+	if !ok {
+		// Already checked for searchability above; now check for writability.
+		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite()
+		// Create and open the child.
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode))
+		parentDir.insertChildLocked(child, name)
+		fd, err := child.open(ctx, rp, &opts, true)
+		if err != nil {
+			return nil, err
+		}
+		parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
+		parentDir.inode.touchCMtime()
+		return fd, nil
+	}
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	// Is the file mounted over?
+	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+		return nil, err
+	}
+	// Do we need to resolve a trailing symlink?
+	if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// Symlink traversal updates access time.
+		child.inode.touchAtime(rp.Mount())
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		start = &parentDir.dentry
+		goto afterTrailingSymlink
+	}
+	if rp.MustBeDir() && !child.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return child.open(ctx, rp, &opts, false)
+}
+
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if !afterCreate {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
+			return nil, err
+		}
+	}
+	switch impl := d.inode.impl.(type) {
+	case *regularFile:
+		var fd regularFileFD
+		fd.LockFD.Init(&d.inode.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
+			return nil, err
+		}
+		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
+			if _, err := impl.truncate(0); err != nil {
+				return nil, err
+			}
+		}
+		return &fd.vfsfd, nil
+	case *directory:
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		var fd directoryFD
+		fd.LockFD.Init(&d.inode.locks)
+		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
+			return nil, err
+		}
+		return &fd.vfsfd, nil
+	case *symlink:
+		// TODO(gvisor.dev/issue/2782): Can't open symlinks without O_PATH.
+		return nil, syserror.ELOOP
+	case *namedPipe:
+		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
+	case *deviceFile:
+		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
+	case *socketFile:
+		return nil, syserror.ENXIO
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
+	}
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		return "", err
+	}
+	symlink, ok := d.inode.impl.(*symlink)
+	if !ok {
+		return "", syserror.EINVAL
+	}
+	symlink.inode.touchAtime(rp.Mount())
+	return symlink.target, nil
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// TODO(b/145974740): Support renameat2 flags.
+		return syserror.EINVAL
+	}
+
+	// Resolve newParent first to verify that it's on this Mount.
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
+	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	renamed, ok := oldParentDir.childMap[oldName]
+	if !ok {
+		return syserror.ENOENT
+	}
+	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
+		return err
+	}
+	// Note that we don't need to call rp.CheckMount(), since if renamed is a
+	// mount point then we want to rename the mount point, not anything in the
+	// mounted filesystem.
+	if renamed.inode.isDir() {
+		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
+			return syserror.EINVAL
+		}
+		if oldParentDir != newParentDir {
+			// Writability is needed to change renamed's "..".
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	replaced, ok := newParentDir.childMap[newName]
+	if ok {
+		replacedDir, ok := replaced.inode.impl.(*directory)
+		if ok {
+			if !renamed.inode.isDir() {
+				return syserror.EISDIR
+			}
+			if len(replacedDir.childMap) != 0 {
+				return syserror.ENOTEMPTY
+			}
+		} else {
+			if rp.MustBeDir() {
+				return syserror.ENOTDIR
+			}
+			if renamed.inode.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	} else {
+		if renamed.inode.isDir() && newParentDir.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+	}
+	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
+	// only be dead if it was deleted.
+	if newParentDir.dentry.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+
+	// Linux places this check before some of those above; we do it here for
+	// simplicity, under the assumption that applications are not intentionally
+	// doing noop renames expecting them to succeed where non-noop renames
+	// would fail.
+	if renamed == replaced {
+		return nil
+	}
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	var replacedVFSD *vfs.Dentry
+	if replaced != nil {
+		replacedVFSD = &replaced.vfsd
+	}
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+		return err
+	}
+	if replaced != nil {
+		newParentDir.removeChildLocked(replaced)
+		if replaced.inode.isDir() {
+			// Remove links for replaced/. and replaced/..
+			replaced.inode.decLinksLocked(ctx)
+			newParentDir.inode.decLinksLocked(ctx)
+		}
+		replaced.inode.decLinksLocked(ctx)
+	}
+	oldParentDir.removeChildLocked(renamed)
+	newParentDir.insertChildLocked(renamed, newName)
+	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+	oldParentDir.inode.touchCMtime()
+	if oldParentDir != newParentDir {
+		if renamed.inode.isDir() {
+			oldParentDir.inode.decLinksLocked(ctx)
+			newParentDir.inode.incLinksLocked()
+		}
+		newParentDir.inode.touchCMtime()
+	}
+	renamed.inode.touchCtime()
+
+	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
+	}
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	child, ok := parentDir.childMap[name]
+	if !ok {
+		return syserror.ENOENT
+	}
+	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+		return err
+	}
+	childDir, ok := child.inode.impl.(*directory)
+	if !ok {
+		return syserror.ENOTDIR
+	}
+	if len(childDir.childMap) != 0 {
+		return syserror.ENOTEMPTY
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+		return err
+	}
+	parentDir.removeChildLocked(child)
+	parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
+	// Remove links for child, child/., and child/..
+	child.inode.decLinksLocked(ctx)
+	child.inode.decLinksLocked(ctx)
+	parentDir.inode.decLinksLocked(ctx)
+	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+	parentDir.inode.touchCMtime()
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	fs.mu.RUnlock()
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+	}
+	return nil
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	var stat linux.Statx
+	d.inode.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	if _, err := resolveLocked(ctx, rp); err != nil {
+		return linux.Statfs{}, err
+	}
+	statfs := linux.Statfs{
+		Type:         linux.TMPFS_MAGIC,
+		BlockSize:    usermem.PageSize,
+		FragmentSize: usermem.PageSize,
+		NameLength:   linux.NAME_MAX,
+		// TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
+		Blocks:     0,
+		BlocksFree: 0,
+	}
+	return statfs, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
+		creds := rp.Credentials()
+		child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target))
+		parentDir.insertChildLocked(child, name)
+		return nil
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	child, ok := parentDir.childMap[name]
+	if !ok {
+		return syserror.ENOENT
+	}
+	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
+		return err
+	}
+	if child.inode.isDir() {
+		return syserror.EISDIR
+	}
+	if rp.MustBeDir() {
+		return syserror.ENOTDIR
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
+		return err
+	}
+
+	// Generate inotify events. Note that this must take place before the link
+	// count of the child is decremented, or else the watches may be dropped
+	// before these events are added.
+	vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name)
+
+	parentDir.removeChildLocked(child)
+	child.inode.decLinksLocked(ctx)
+	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
+	parentDir.inode.touchCMtime()
+	return nil
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
+		return nil, err
+	}
+	switch impl := d.inode.impl.(type) {
+	case *socketFile:
+		return impl.ep, nil
+	default:
+		return nil, syserror.ECONNREFUSED
+	}
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	return d.inode.listxattr(size)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		return "", err
+	}
+	return d.inode.getxattr(rp.Credentials(), &opts)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	fs.mu.RUnlock()
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	d, err := resolveLocked(ctx, rp)
+	if err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
+		fs.mu.RUnlock()
+		return err
+	}
+	fs.mu.RUnlock()
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	mnt := vd.Mount()
+	d := vd.Dentry().Impl().(*dentry)
+	for {
+		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+			return vfs.PrependPathAtVFSRootError{}
+		}
+		if &d.vfsd == mnt.Root() {
+			return nil
+		}
+		if d.parent == nil {
+			if d.name != "" {
+				// This must be an anonymous memfd file.
+				b.PrependComponent("/" + d.name)
+				return vfs.PrependPathSyntheticError{}
+			}
+			return vfs.PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 732ed7c58..739350cf0 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -12,15 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 type namedPipe struct {
@@ -32,28 +30,9 @@ type namedPipe struct {
 // Preconditions:
 //   * fs.mu must be locked.
 //   * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, mode)
+func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
+	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
 	file.inode.nlink = 1 // Only the parent has a link.
 	return &file.inode
 }
-
-// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
-// entirely via struct embedding.
-type namedPipeFD struct {
-	fileDescription
-
-	*pipe.VFSPipeFD
-}
-
-func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	var err error
-	var fd namedPipeFD
-	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
-	fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
-	return &fd.vfsfd, nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index 0674b81a3..ec2701d8b 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -12,33 +12,34 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
 	"bytes"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 const fileName = "mypipe"
 
 func TestSeparateFDs(t *testing.T) {
 	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	// Open the read side. This is done in a concurrently because opening
 	// One end the pipe blocks until the other end is opened.
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	rfdchan := make(chan *vfs.FileDescription)
@@ -54,13 +55,13 @@ func TestSeparateFDs(t *testing.T) {
 	if err != nil {
 		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
 	}
-	defer wfd.DecRef()
+	defer wfd.DecRef(ctx)
 
 	rfd, ok := <-rfdchan
 	if !ok {
 		t.Fatalf("failed to open pipe for reading %q", fileName)
 	}
-	defer rfd.DecRef()
+	defer rfd.DecRef(ctx)
 
 	const msg = "vamos azul"
 	checkEmpty(ctx, t, rfd)
@@ -70,13 +71,13 @@ func TestSeparateFDs(t *testing.T) {
 
 func TestNonblockingRead(t *testing.T) {
 	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	// Open the read side as nonblocking.
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
@@ -84,7 +85,7 @@ func TestNonblockingRead(t *testing.T) {
 	if err != nil {
 		t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
 	}
-	defer rfd.DecRef()
+	defer rfd.DecRef(ctx)
 
 	// Open the write side.
 	openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
@@ -92,7 +93,7 @@ func TestNonblockingRead(t *testing.T) {
 	if err != nil {
 		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
 	}
-	defer wfd.DecRef()
+	defer wfd.DecRef(ctx)
 
 	const msg = "geh blau"
 	checkEmpty(ctx, t, rfd)
@@ -102,13 +103,13 @@ func TestNonblockingRead(t *testing.T) {
 
 func TestNonblockingWriteError(t *testing.T) {
 	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	// Open the write side as nonblocking, which should return ENXIO.
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
@@ -120,13 +121,13 @@ func TestNonblockingWriteError(t *testing.T) {
 
 func TestSingleFD(t *testing.T) {
 	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
+	defer root.DecRef(ctx)
 
 	// Open the pipe as readable and writable.
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
@@ -134,7 +135,7 @@ func TestSingleFD(t *testing.T) {
 	if err != nil {
 		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
 	}
-	defer fd.DecRef()
+	defer fd.DecRef(ctx)
 
 	const msg = "forza blu"
 	checkEmpty(ctx, t, fd)
@@ -150,9 +151,14 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	creds := auth.CredentialsFromContext(ctx)
 
 	// Create VFS.
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		t.Fatalf("VFS init: %v", err)
+	}
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
@@ -160,10 +166,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	// Create the pipe.
 	root := mntns.Root()
 	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Pathname:           fileName,
-		FollowFinalSymlink: true,
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(fileName),
 	}
 	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
 	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
@@ -174,7 +179,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}, &vfs.StatOptions{})
 	if err != nil {
@@ -194,7 +199,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 	readData := make([]byte, 1)
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != syserror.ErrWouldBlock {
 		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
 	}
@@ -207,7 +212,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	writeData := []byte(msg)
 	src := usermem.BytesIOSequence(writeData)
-	bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{})
+	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
 	if err != nil {
 		t.Fatalf("error writing to pipe %q: %v", fileName, err)
 	}
@@ -220,7 +225,7 @@ func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg
 func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	readData := make([]byte, len(msg))
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != nil {
 		t.Fatalf("error reading from pipe %q: %v", fileName, err)
 	}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
new file mode 100644
index 000000000..0710b65db
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -0,0 +1,637 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// regularFile is a regular (=S_IFREG) tmpfs file.
+type regularFile struct {
+	inode inode
+
+	// memFile is a platform.File used to allocate pages to this regularFile.
+	memFile *pgalloc.MemoryFile
+
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// mappings tracks mappings of the file into memmap.MappingSpaces.
+	//
+	// Protected by mapsMu.
+	mappings memmap.MappingSet
+
+	// writableMappingPages tracks how many pages of virtual memory are mapped
+	// as potentially writable from this file. If a page has multiple mappings,
+	// each mapping is counted separately.
+	//
+	// This counter is susceptible to overflow as we can potentially count
+	// mappings from many VMAs. We count pages rather than bytes to slightly
+	// mitigate this.
+	//
+	// Protected by mapsMu.
+	writableMappingPages uint64
+
+	// dataMu protects the fields below.
+	dataMu sync.RWMutex
+
+	// data maps offsets into the file to offsets into memFile that store
+	// the file's data.
+	//
+	// Protected by dataMu.
+	data fsutil.FileRangeSet
+
+	// seals represents file seals on this inode.
+	//
+	// Protected by dataMu.
+	seals uint32
+
+	// size is the size of data.
+	//
+	// Protected by both dataMu and inode.mu; reading it requires holding
+	// either mutex, while writing requires holding both AND using atomics.
+	// Readers that do not require consistency (like Stat) may read the
+	// value atomically without holding either lock.
+	size uint64
+}
+
+func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
+	file := &regularFile{
+		memFile: fs.memFile,
+		seals:   linux.F_SEAL_SEAL,
+	}
+	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
+}
+
+// truncate grows or shrinks the file to the given size. It returns true if the
+// file size was updated.
+func (rf *regularFile) truncate(newSize uint64) (bool, error) {
+	rf.inode.mu.Lock()
+	defer rf.inode.mu.Unlock()
+	return rf.truncateLocked(newSize)
+}
+
+// Preconditions: rf.inode.mu must be held.
+func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
+	oldSize := rf.size
+	if newSize == oldSize {
+		// Nothing to do.
+		return false, nil
+	}
+
+	// Need to hold inode.mu and dataMu while modifying size.
+	rf.dataMu.Lock()
+	if newSize > oldSize {
+		// Can we grow the file?
+		if rf.seals&linux.F_SEAL_GROW != 0 {
+			rf.dataMu.Unlock()
+			return false, syserror.EPERM
+		}
+		// We only need to update the file size.
+		atomic.StoreUint64(&rf.size, newSize)
+		rf.dataMu.Unlock()
+		return true, nil
+	}
+
+	// We are shrinking the file. First check if this is allowed.
+	if rf.seals&linux.F_SEAL_SHRINK != 0 {
+		rf.dataMu.Unlock()
+		return false, syserror.EPERM
+	}
+
+	// Update the file size.
+	atomic.StoreUint64(&rf.size, newSize)
+	rf.dataMu.Unlock()
+
+	// Invalidate past translations of truncated pages.
+	oldpgend := fs.OffsetPageEnd(int64(oldSize))
+	newpgend := fs.OffsetPageEnd(int64(newSize))
+	if newpgend < oldpgend {
+		rf.mapsMu.Lock()
+		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
+			// Compare Linux's mm/shmem.c:shmem_setattr() =>
+			// mm/memory.c:unmap_mapping_range(evencows=1).
+			InvalidatePrivate: true,
+		})
+		rf.mapsMu.Unlock()
+	}
+
+	// We are now guaranteed that there are no translations of truncated pages,
+	// and can remove them.
+	rf.dataMu.Lock()
+	rf.data.Truncate(newSize, rf.memFile)
+	rf.dataMu.Unlock()
+	return true, nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+
+	// Reject writable mapping if F_SEAL_WRITE is set.
+	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
+		return syserror.EPERM
+	}
+
+	rf.mappings.AddMapping(ms, ar, offset, writable)
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages < pagesBefore {
+			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+
+	rf.mappings.RemoveMapping(ms, ar, offset, writable)
+
+	if writable {
+		pagesBefore := rf.writableMappingPages
+
+		// ar is guaranteed to be page aligned per memmap.Mappable.
+		rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize)
+
+		if rf.writableMappingPages > pagesBefore {
+			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
+		}
+	}
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	rf.dataMu.Lock()
+	defer rf.dataMu.Unlock()
+
+	// Constrain translations to f.attr.Size (rounded up) to prevent
+	// translation to pages that may be concurrently truncated.
+	pgend := fs.OffsetPageEnd(int64(rf.size))
+	var beyondEOF bool
+	if required.End > pgend {
+		if required.Start >= pgend {
+			return nil, &memmap.BusError{io.EOF}
+		}
+		beyondEOF = true
+		required.End = pgend
+	}
+	if optional.End > pgend {
+		optional.End = pgend
+	}
+
+	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+		// Newly-allocated pages are zeroed, so we don't need to do anything.
+		return dsts.NumBytes(), nil
+	})
+
+	var ts []memmap.Translation
+	var translatedEnd uint64
+	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
+		segMR := seg.Range().Intersect(optional)
+		ts = append(ts, memmap.Translation{
+			Source: segMR,
+			File:   rf.memFile,
+			Offset: seg.FileRangeOf(segMR).Start,
+			Perms:  usermem.AnyAccess,
+		})
+		translatedEnd = segMR.End
+	}
+
+	// Don't return the error returned by f.data.Fill if it occurred outside of
+	// required.
+	if translatedEnd < required.End && cerr != nil {
+		return ts, &memmap.BusError{cerr}
+	}
+	if beyondEOF {
+		return ts, &memmap.BusError{io.EOF}
+	}
+	return ts, nil
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (*regularFile) InvalidateUnsavable(context.Context) error {
+	return nil
+}
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset. off is accessed using atomic memory operations.
+	// offMu serializes operations that may mutate off.
+	off   int64
+	offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release(context.Context) {
+	// noop
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	f := fd.inode().impl.(*regularFile)
+
+	f.inode.mu.Lock()
+	defer f.inode.mu.Unlock()
+	oldSize := f.size
+	size := offset + length
+	if oldSize >= size {
+		return nil
+	}
+	_, err := f.truncateLocked(size)
+	return err
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+	// all state is in-memory.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putRegularFileReadWriter(rw)
+	fd.inode().touchAtime(fd.vfsfd.Mount())
+	return n, err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+	if offset < 0 {
+		return 0, offset, syserror.EINVAL
+	}
+
+	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
+	// all state is in-memory.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	srclen := src.NumBytes()
+	if srclen == 0 {
+		return 0, offset, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	f.inode.mu.Lock()
+	defer f.inode.mu.Unlock()
+	// If the file is opened with O_APPEND, update offset to file size.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Locking f.inode.mu is sufficient for reading f.size.
+		offset = int64(f.size)
+	}
+	if end := offset + srclen; end < offset {
+		// Overflow.
+		return 0, offset, syserror.EINVAL
+	}
+
+	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+	if err != nil {
+		return 0, offset, err
+	}
+	src = src.TakeFirst64(srclen)
+
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := src.CopyInTo(ctx, rw)
+	f.inode.touchCMtimeLocked()
+	putRegularFileReadWriter(rw)
+	return n, n + offset, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// use offset as specified
+	case linux.SEEK_CUR:
+		offset += fd.off
+	case linux.SEEK_END:
+		offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size))
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	file := fd.inode().impl.(*regularFile)
+	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
+}
+
+// regularFileReadWriter implements safemem.Reader and Safemem.Writer.
+type regularFileReadWriter struct {
+	file *regularFile
+
+	// Offset into the file to read/write at. Note that this may be
+	// different from the FD offset if PRead/PWrite is used.
+	off uint64
+}
+
+var regularFileReadWriterPool = sync.Pool{
+	New: func() interface{} {
+		return &regularFileReadWriter{}
+	},
+}
+
+func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
+	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
+	rw.file = file
+	rw.off = uint64(offset)
+	return rw
+}
+
+func putRegularFileReadWriter(rw *regularFileReadWriter) {
+	rw.file = nil
+	regularFileReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	rw.file.dataMu.RLock()
+	defer rw.file.dataMu.RUnlock()
+	size := rw.file.size
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= size {
+		return 0, io.EOF
+	}
+	end := size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Tmpfs holes are zero-filled.
+			gapmr := gap.Range().Intersect(mr)
+			dst := dsts.TakeFirst64(gapmr.Length())
+			n, err := safemem.ZeroSeq(dst)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+//
+// Preconditions: inode.mu must be held.
+func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	// Hold dataMu so we can modify size.
+	rw.file.dataMu.Lock()
+	defer rw.file.dataMu.Unlock()
+
+	// Compute the range to write (overflow-checked).
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	// Check if seals prevent either file growth or all writes.
+	switch {
+	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+		return 0, syserror.EPERM
+	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+		// When growth is sealed, Linux effectively allows writes which would
+		// normally grow the file to partially succeed up to the current EOF,
+		// rounded down to the page boundary before the EOF.
+		//
+		// This happens because writes (and thus the growth check) for tmpfs
+		// files proceed page-by-page on Linux, and the final write to the page
+		// containing EOF fails, resulting in a partial write up to the start of
+		// that page.
+		//
+		// To emulate this behaviour, artifically truncate the write to the
+		// start of the page containing the current EOF.
+		//
+		// See Linux, mm/filemap.c:generic_perform_write() and
+		// mm/shmem.c:shmem_write_begin().
+		if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart {
+			end = pgstart
+		}
+		if end <= rw.off {
+			// Truncation would result in no data being written.
+			return 0, syserror.EPERM
+		}
+	}
+
+	// Page-aligned mr for when we need to allocate memory. RoundUp can't
+	// overflow since end is an int64.
+	pgstartaddr := usermem.Addr(rw.off).RoundDown()
+	pgendaddr, _ := usermem.Addr(end).RoundUp()
+	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += uint64(n)
+			srcs = srcs.DropFirst64(n)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Allocate memory for the write.
+			gapMR := gap.Range().Intersect(pgMR)
+			fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Write to that memory as usual.
+			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop:
+	// If the write ends beyond the file's previous size, it causes the
+	// file to grow.
+	if rw.off > rw.file.size {
+		rw.file.size = rw.off
+	}
+
+	return done, retErr
+}
+
+// GetSeals returns the current set of seals on a memfd inode.
+func GetSeals(fd *vfs.FileDescription) (uint32, error) {
+	f, ok := fd.Impl().(*regularFileFD)
+	if !ok {
+		return 0, syserror.EINVAL
+	}
+	rf := f.inode().impl.(*regularFile)
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+	return rf.seals, nil
+}
+
+// AddSeals adds new file seals to a memfd inode.
+func AddSeals(fd *vfs.FileDescription, val uint32) error {
+	f, ok := fd.Impl().(*regularFileFD)
+	if !ok {
+		return syserror.EINVAL
+	}
+	rf := f.inode().impl.(*regularFile)
+	rf.mapsMu.Lock()
+	defer rf.mapsMu.Unlock()
+	rf.dataMu.RLock()
+	defer rf.dataMu.RUnlock()
+
+	if rf.seals&linux.F_SEAL_SEAL != 0 {
+		// Seal applied which prevents addition of any new seals.
+		return syserror.EPERM
+	}
+
+	// F_SEAL_WRITE can only be added if there are no active writable maps.
+	if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
+		if rf.writableMappingPages > 0 {
+			return syserror.EBUSY
+		}
+	}
+
+	// Seals can only be added, never removed.
+	rf.seals |= val
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
new file mode 100644
index 000000000..146c7fdfe
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -0,0 +1,349 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Test that we can write some data to a file and read it back.`
+func TestSimpleWriteRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write.
+	data := []byte("foobarbaz")
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+
+	// Seek back to beginning.
+	if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
+		t.Fatalf("fd.Seek failed: %v", err)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want {
+		t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want)
+	}
+
+	// Read.
+	buf := make([]byte, len(data))
+	n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.Read failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Read got short read length %d, want %d", n, len(data))
+	}
+	if got, want := string(buf), string(data); got != want {
+		t.Errorf("Read got %q want %s", got, want)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+}
+
+func TestPWrite(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill file with 1k 'a's.
+	data := bytes.Repeat([]byte{'a'}, 1000)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Write "gVisor is awesome" at various offsets.
+	buf := []byte("gVisor is awesome")
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+	for _, offset := range offsets {
+		name := fmt.Sprintf("PWrite offset=%d", offset)
+		t.Run(name, func(t *testing.T) {
+			n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{})
+			if err != nil {
+				t.Errorf("fd.PWrite got err %v want nil", err)
+			}
+			if n != int64(len(buf)) {
+				t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf))
+			}
+
+			// Update data to reflect expected file contents.
+			if len(data) < offset+len(buf) {
+				data = append(data, make([]byte, (offset+len(buf))-len(data))...)
+			}
+			copy(data[offset:], buf)
+
+			// Read the whole file and compare with data.
+			readBuf := make([]byte, len(data))
+			n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{})
+			if err != nil {
+				t.Fatalf("fd.PRead failed: %v", err)
+			}
+			if n != int64(len(data)) {
+				t.Errorf("fd.PRead got short read length %d, want %d", n, len(data))
+			}
+			if got, want := string(readBuf), string(data); got != want {
+				t.Errorf("PRead got %q want %s", got, want)
+			}
+
+		})
+	}
+}
+
+func TestLocks(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	uid1 := 123
+	uid2 := 456
+	if err := fd.Impl().LockBSD(ctx, uid1, lock.ReadLock, nil); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.ReadLock, nil); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+	if got, want := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil), syserror.ErrWouldBlock; got != want {
+		t.Fatalf("fd.Impl().LockBSD failed: got = %v, want = %v", got, want)
+	}
+	if err := fd.Impl().UnlockBSD(ctx, uid1); err != nil {
+		t.Fatalf("fd.Impl().UnlockBSD failed: err = %v", err)
+	}
+	if err := fd.Impl().LockBSD(ctx, uid2, lock.WriteLock, nil); err != nil {
+		t.Fatalf("fd.Impl().LockBSD failed: err = %v", err)
+	}
+
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.ReadLock, 0, 1, linux.SEEK_SET, nil); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if err := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 1, 2, linux.SEEK_SET, nil); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if err := fd.Impl().LockPOSIX(ctx, uid1, lock.WriteLock, 0, 1, linux.SEEK_SET, nil); err != nil {
+		t.Fatalf("fd.Impl().LockPOSIX failed: err = %v", err)
+	}
+	if got, want := fd.Impl().LockPOSIX(ctx, uid2, lock.ReadLock, 0, 1, linux.SEEK_SET, nil), syserror.ErrWouldBlock; got != want {
+		t.Fatalf("fd.Impl().LockPOSIX failed: got = %v, want = %v", got, want)
+	}
+	if err := fd.Impl().UnlockPOSIX(ctx, uid1, 0, 1, linux.SEEK_SET); err != nil {
+		t.Fatalf("fd.Impl().UnlockPOSIX failed: err = %v", err)
+	}
+}
+
+func TestPRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write 100 sequences of 'gVisor is awesome'.
+	data := bytes.Repeat([]byte("gVisor is awsome"), 100)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Read various sizes from various offsets.
+	sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000}
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+
+	for _, size := range sizes {
+		for _, offset := range offsets {
+			name := fmt.Sprintf("PRead offset=%d size=%d", offset, size)
+			t.Run(name, func(t *testing.T) {
+				var (
+					wantRead []byte
+					wantErr  error
+				)
+				if offset < len(data) {
+					wantRead = data[offset:]
+				} else if size > 0 {
+					wantErr = io.EOF
+				}
+				if offset+size < len(data) {
+					wantRead = wantRead[:size]
+				}
+				buf := make([]byte, size)
+				n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{})
+				if err != wantErr {
+					t.Errorf("fd.PRead got err %v want %v", err, wantErr)
+				}
+				if n != int64(len(wantRead)) {
+					t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead))
+				}
+				if got := string(buf[:n]); got != string(wantRead) {
+					t.Errorf("fd.PRead got %q want %q", got, string(wantRead))
+				}
+			})
+		}
+	}
+}
+
+func TestTruncate(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill the file with some data.
+	data := bytes.Repeat([]byte("gVisor is awsome"), 100)
+	written, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+
+	// Size should be same as written.
+	sizeStatOpts := vfs.StatOptions{Mask: linux.STATX_SIZE}
+	stat, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	if got, want := int64(stat.Size), written; got != want {
+		t.Errorf("fd.Stat got size %d, want %d", got, want)
+	}
+
+	// Truncate down.
+	newSize := uint64(10)
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: newSize,
+		},
+	}); err != nil {
+		t.Errorf("fd.Truncate failed: %v", err)
+	}
+	// Size should be updated.
+	statAfterTruncateDown, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	if got, want := statAfterTruncateDown.Size, newSize; got != want {
+		t.Errorf("fd.Stat got size %d, want %d", got, want)
+	}
+	// We should only read newSize worth of data.
+	buf := make([]byte, 1000)
+	if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF {
+		t.Fatalf("fd.PRead failed: %v", err)
+	} else if uint64(n) != newSize {
+		t.Errorf("fd.PRead got size %d, want %d", n, newSize)
+	}
+	// Mtime and Ctime should be bumped.
+	if got := statAfterTruncateDown.Mtime.ToNsec(); got <= stat.Mtime.ToNsec() {
+		t.Errorf("fd.Stat got Mtime %v, want > %v", got, stat.Mtime)
+	}
+	if got := statAfterTruncateDown.Ctime.ToNsec(); got <= stat.Ctime.ToNsec() {
+		t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime)
+	}
+
+	// Truncate up.
+	newSize = 100
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: newSize,
+		},
+	}); err != nil {
+		t.Errorf("fd.Truncate failed: %v", err)
+	}
+	// Size should be updated.
+	statAfterTruncateUp, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	if got, want := statAfterTruncateUp.Size, newSize; got != want {
+		t.Errorf("fd.Stat got size %d, want %d", got, want)
+	}
+	// We should read newSize worth of data.
+	buf = make([]byte, 1000)
+	if n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0, vfs.ReadOptions{}); err != nil && err != io.EOF {
+		t.Fatalf("fd.PRead failed: %v", err)
+	} else if uint64(n) != newSize {
+		t.Errorf("fd.PRead got size %d, want %d", n, newSize)
+	}
+	// Bytes should be null after 10, since we previously truncated to 10.
+	for i := uint64(10); i < newSize; i++ {
+		if buf[i] != 0 {
+			t.Errorf("fd.PRead got byte %d=%x, want 0", i, buf[i])
+			break
+		}
+	}
+	// Mtime and Ctime should be bumped.
+	if got := statAfterTruncateUp.Mtime.ToNsec(); got <= statAfterTruncateDown.Mtime.ToNsec() {
+		t.Errorf("fd.Stat got Mtime %v, want > %v", got, statAfterTruncateDown.Mtime)
+	}
+	if got := statAfterTruncateUp.Ctime.ToNsec(); got <= statAfterTruncateDown.Ctime.ToNsec() {
+		t.Errorf("fd.Stat got Ctime %v, want > %v", got, stat.Ctime)
+	}
+
+	// Truncate to the current size.
+	newSize = statAfterTruncateUp.Size
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: linux.STATX_SIZE,
+			Size: newSize,
+		},
+	}); err != nil {
+		t.Errorf("fd.Truncate failed: %v", err)
+	}
+	statAfterTruncateNoop, err := fd.Stat(ctx, sizeStatOpts)
+	if err != nil {
+		t.Fatalf("fd.Stat failed: %v", err)
+	}
+	// Mtime and Ctime should not be bumped, since operation is a noop.
+	if got := statAfterTruncateNoop.Mtime.ToNsec(); got != statAfterTruncateUp.Mtime.ToNsec() {
+		t.Errorf("fd.Stat got Mtime %v, want %v", got, statAfterTruncateUp.Mtime)
+	}
+	if got := statAfterTruncateNoop.Ctime.ToNsec(); got != statAfterTruncateUp.Ctime.ToNsec() {
+		t.Errorf("fd.Stat got Ctime %v, want %v", got, statAfterTruncateUp.Ctime)
+	}
+}
diff --git a/pkg/sentry/fsimpl/proc/mounts.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index e81b1e910..3ed650474 100644
--- a/pkg/sentry/fsimpl/proc/mounts.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -12,22 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package proc
+package tmpfs
 
-import "gvisor.dev/gvisor/pkg/sentry/kernel"
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+)
 
-// TODO(b/138862512): Implement mountInfoFile and mountsFile.
-
-// mountInfoFile implements vfs.DynamicBytesSource for /proc/[pid]/mountinfo.
-//
-// +stateify savable
-type mountInfoFile struct {
-	t *kernel.Task
+// socketFile is a socket (=S_IFSOCK) tmpfs file.
+type socketFile struct {
+	inode inode
+	ep    transport.BoundEndpoint
 }
 
-// mountsFile implements vfs.DynamicBytesSource for /proc/[pid]/mounts.
-//
-// +stateify savable
-type mountsFile struct {
-	t *kernel.Task
+func (fs *filesystem) newSocketFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, ep transport.BoundEndpoint) *inode {
+	file := &socketFile{ep: ep}
+	file.inode.init(file, fs, kuid, kgid, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/stat_test.go b/pkg/sentry/fsimpl/tmpfs/stat_test.go
new file mode 100644
index 000000000..f7ee4aab2
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/stat_test.go
@@ -0,0 +1,236 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+func TestStatAfterCreate(t *testing.T) {
+	ctx := contexttest.Context(t)
+	mode := linux.FileMode(0644)
+
+	// Run with different file types.
+	for _, typ := range []string{"file", "dir", "pipe"} {
+		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
+			var (
+				fd      *vfs.FileDescription
+				cleanup func()
+				err     error
+			)
+			switch typ {
+			case "file":
+				fd, cleanup, err = newFileFD(ctx, mode)
+			case "dir":
+				fd, cleanup, err = newDirFD(ctx, mode)
+			case "pipe":
+				fd, cleanup, err = newPipeFD(ctx, mode)
+			default:
+				panic(fmt.Sprintf("unknown typ %q", typ))
+			}
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer cleanup()
+
+			got, err := fd.Stat(ctx, vfs.StatOptions{})
+			if err != nil {
+				t.Fatalf("Stat failed: %v", err)
+			}
+
+			// Atime, Ctime, Mtime should all be current time (non-zero).
+			atime, ctime, mtime := got.Atime.ToNsec(), got.Ctime.ToNsec(), got.Mtime.ToNsec()
+			if atime != ctime || ctime != mtime {
+				t.Errorf("got atime=%d ctime=%d mtime=%d, wanted equal values", atime, ctime, mtime)
+			}
+			if atime == 0 {
+				t.Errorf("got atime=%d, want non-zero", atime)
+			}
+
+			// Btime should be 0, as it is not set by tmpfs.
+			if btime := got.Btime.ToNsec(); btime != 0 {
+				t.Errorf("got btime %d, want 0", got.Btime.ToNsec())
+			}
+
+			// Size should be 0 (except for directories, which make up a size
+			// of 20 per entry, including the "." and ".." entries present in
+			// otherwise-empty directories).
+			wantSize := uint64(0)
+			if typ == "dir" {
+				wantSize = 40
+			}
+			if got.Size != wantSize {
+				t.Errorf("got size %d, want %d", got.Size, wantSize)
+			}
+
+			// Nlink should be 1 for files, 2 for dirs.
+			wantNlink := uint32(1)
+			if typ == "dir" {
+				wantNlink = 2
+			}
+			if got.Nlink != wantNlink {
+				t.Errorf("got nlink %d, want %d", got.Nlink, wantNlink)
+			}
+
+			// UID and GID are set from context creds.
+			creds := auth.CredentialsFromContext(ctx)
+			if got.UID != uint32(creds.EffectiveKUID) {
+				t.Errorf("got uid %d, want %d", got.UID, uint32(creds.EffectiveKUID))
+			}
+			if got.GID != uint32(creds.EffectiveKGID) {
+				t.Errorf("got gid %d, want %d", got.GID, uint32(creds.EffectiveKGID))
+			}
+
+			// Mode.
+			wantMode := uint16(mode)
+			switch typ {
+			case "file":
+				wantMode |= linux.S_IFREG
+			case "dir":
+				wantMode |= linux.S_IFDIR
+			case "pipe":
+				wantMode |= linux.S_IFIFO
+			default:
+				panic(fmt.Sprintf("unknown typ %q", typ))
+			}
+
+			if got.Mode != wantMode {
+				t.Errorf("got mode %x, want %x", got.Mode, wantMode)
+			}
+
+			// Ino.
+			if got.Ino == 0 {
+				t.Errorf("got ino %d, want not 0", got.Ino)
+			}
+		})
+	}
+}
+
+func TestSetStatAtime(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, 0644)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL}
+
+	// Get initial stat.
+	initialStat, err := fd.Stat(ctx, allStatOptions)
+	if err != nil {
+		t.Fatalf("Stat failed: %v", err)
+	}
+
+	// Set atime, but without the mask.
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{
+		Mask:  0,
+		Atime: linux.NsecToStatxTimestamp(100),
+	}}); err != nil {
+		t.Errorf("SetStat atime without mask failed: %v", err)
+	}
+	// Atime should be unchanged.
+	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+		t.Errorf("Stat got error: %v", err)
+	} else if gotStat.Atime != initialStat.Atime {
+		t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime)
+	}
+
+	// Set atime, this time included in the mask.
+	setStat := linux.Statx{
+		Mask:  linux.STATX_ATIME,
+		Atime: linux.NsecToStatxTimestamp(100),
+	}
+	if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
+		t.Errorf("SetStat atime with mask failed: %v", err)
+	}
+	if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+		t.Errorf("Stat got error: %v", err)
+	} else if gotStat.Atime != setStat.Atime {
+		t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime)
+	}
+}
+
+func TestSetStat(t *testing.T) {
+	ctx := contexttest.Context(t)
+	mode := linux.FileMode(0644)
+
+	// Run with different file types.
+	for _, typ := range []string{"file", "dir", "pipe"} {
+		t.Run(fmt.Sprintf("type=%q", typ), func(t *testing.T) {
+			var (
+				fd      *vfs.FileDescription
+				cleanup func()
+				err     error
+			)
+			switch typ {
+			case "file":
+				fd, cleanup, err = newFileFD(ctx, mode)
+			case "dir":
+				fd, cleanup, err = newDirFD(ctx, mode)
+			case "pipe":
+				fd, cleanup, err = newPipeFD(ctx, mode)
+			default:
+				panic(fmt.Sprintf("unknown typ %q", typ))
+			}
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer cleanup()
+
+			allStatOptions := vfs.StatOptions{Mask: linux.STATX_ALL}
+
+			// Get initial stat.
+			initialStat, err := fd.Stat(ctx, allStatOptions)
+			if err != nil {
+				t.Fatalf("Stat failed: %v", err)
+			}
+
+			// Set atime, but without the mask.
+			if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: linux.Statx{
+				Mask:  0,
+				Atime: linux.NsecToStatxTimestamp(100),
+			}}); err != nil {
+				t.Errorf("SetStat atime without mask failed: %v", err)
+			}
+			// Atime should be unchanged.
+			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+				t.Errorf("Stat got error: %v", err)
+			} else if gotStat.Atime != initialStat.Atime {
+				t.Errorf("Stat got atime %d, want %d", gotStat.Atime, initialStat.Atime)
+			}
+
+			// Set atime, this time included in the mask.
+			setStat := linux.Statx{
+				Mask:  linux.STATX_ATIME,
+				Atime: linux.NsecToStatxTimestamp(100),
+			}
+			if err := fd.SetStat(ctx, vfs.SetStatOptions{Stat: setStat}); err != nil {
+				t.Errorf("SetStat atime with mask failed: %v", err)
+			}
+			if gotStat, err := fd.Stat(ctx, allStatOptions); err != nil {
+				t.Errorf("Stat got error: %v", err)
+			} else if gotStat.Atime != setStat.Atime {
+				t.Errorf("Stat got atime %d, want %d", gotStat.Atime, setStat.Atime)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index b2ac2cbeb..b0de5fabe 100644
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-package memfs
+package tmpfs
 
 import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
@@ -23,11 +24,11 @@ type symlink struct {
 	target string // immutable
 }
 
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+func (fs *filesystem) newSymlink(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, target string) *inode {
 	link := &symlink{
 		target: target,
 	}
-	link.inode.init(link, fs, creds, 0777)
+	link.inode.init(link, fs, kuid, kgid, linux.S_IFLNK|mode)
 	link.inode.nlink = 1 // from parent directory
 	return &link.inode
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
new file mode 100644
index 000000000..de2af6d01
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -0,0 +1,775 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tmpfs provides an in-memory filesystem whose contents are
+// application-mutable, consistent with Linux's tmpfs.
+//
+// Lock order:
+//
+// filesystem.mu
+//   inode.mu
+//     regularFileFD.offMu
+//       *** "memmap.Mappable locks" below this point
+//       regularFile.mapsMu
+//         *** "memmap.Mappable locks taken by Translate" below this point
+//         regularFile.dataMu
+//     directory.iterMu
+package tmpfs
+
+import (
+	"fmt"
+	"math"
+	"strconv"
+	"strings"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sentry/vfs/memxattr"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// Name is the default filesystem name.
+const Name = "tmpfs"
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// memFile is used to allocate pages to for regular files.
+	memFile *pgalloc.MemoryFile
+
+	// clock is a realtime clock used to set timestamps in file operations.
+	clock time.Clock
+
+	// devMinor is the filesystem's minor device number. devMinor is immutable.
+	devMinor uint32
+
+	// mu serializes changes to the Dentry tree.
+	mu sync.RWMutex
+
+	nextInoMinusOne uint64 // accessed using atomic memory operations
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// FilesystemOpts is used to pass configuration data to tmpfs.
+type FilesystemOpts struct {
+	// RootFileType is the FileType of the filesystem root. Valid values
+	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
+	RootFileType uint16
+
+	// RootSymlinkTarget is the target of the root symlink. Only valid if
+	// RootFileType == S_IFLNK.
+	RootSymlinkTarget string
+
+	// FilesystemType allows setting a different FilesystemType for this
+	// tmpfs filesystem. This allows tmpfs to "impersonate" other
+	// filesystems, like ramdiskfs and cgroupfs.
+	FilesystemType vfs.FilesystemType
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
+	if memFileProvider == nil {
+		panic("MemoryFileProviderFromContext returned nil")
+	}
+
+	rootFileType := uint16(linux.S_IFDIR)
+	newFSType := vfs.FilesystemType(&fstype)
+	tmpfsOpts, ok := opts.InternalData.(FilesystemOpts)
+	if ok {
+		if tmpfsOpts.RootFileType != 0 {
+			rootFileType = tmpfsOpts.RootFileType
+		}
+		if tmpfsOpts.FilesystemType != nil {
+			newFSType = tmpfsOpts.FilesystemType
+		}
+	}
+
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	rootMode := linux.FileMode(0777)
+	if rootFileType == linux.S_IFDIR {
+		rootMode = 01777
+	}
+	modeStr, ok := mopts["mode"]
+	if ok {
+		delete(mopts, "mode")
+		mode, err := strconv.ParseUint(modeStr, 8, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
+			return nil, nil, syserror.EINVAL
+		}
+		rootMode = linux.FileMode(mode & 07777)
+	}
+	rootKUID := creds.EffectiveKUID
+	uidStr, ok := mopts["uid"]
+	if ok {
+		delete(mopts, "uid")
+		uid, err := strconv.ParseUint(uidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
+		if !kuid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKUID = kuid
+	}
+	rootKGID := creds.EffectiveKGID
+	gidStr, ok := mopts["gid"]
+	if ok {
+		delete(mopts, "gid")
+		gid, err := strconv.ParseUint(gidStr, 10, 32)
+		if err != nil {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
+			return nil, nil, syserror.EINVAL
+		}
+		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
+		if !kgid.Ok() {
+			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
+			return nil, nil, syserror.EINVAL
+		}
+		rootKGID = kgid
+	}
+	if len(mopts) != 0 {
+		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
+	if err != nil {
+		return nil, nil, err
+	}
+	clock := time.RealtimeClockFromContext(ctx)
+	fs := filesystem{
+		memFile:  memFileProvider.MemoryFile(),
+		clock:    clock,
+		devMinor: devMinor,
+	}
+	fs.vfsfs.Init(vfsObj, newFSType, &fs)
+
+	var root *dentry
+	switch rootFileType {
+	case linux.S_IFREG:
+		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode))
+	case linux.S_IFLNK:
+		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget))
+	case linux.S_IFDIR:
+		root = &fs.newDirectory(rootKUID, rootKGID, rootMode).dentry
+	default:
+		fs.vfsfs.DecRef(ctx)
+		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
+	}
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// NewFilesystem returns a new tmpfs filesystem.
+func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) {
+	return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{})
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// parent is this dentry's parent directory. Each referenced dentry holds a
+	// reference on parent.dentry. If this dentry is a filesystem root, parent
+	// is nil. parent is protected by filesystem.mu.
+	parent *dentry
+
+	// name is the name of this dentry in its parent. If this dentry is a
+	// filesystem root, name is the empty string. name is protected by
+	// filesystem.mu.
+	name string
+
+	// dentryEntry (ugh) links dentries into their parent directory.childList.
+	dentryEntry
+
+	// inode is the inode represented by this dentry. Multiple Dentries may
+	// share a single non-directory inode (with hard links). inode is
+	// immutable.
+	//
+	// tmpfs doesn't count references on dentries; because the dentry tree is
+	// the sole source of truth, it is by definition always consistent with the
+	// state of the filesystem. However, it does count references on inodes,
+	// because inode resources are released when all references are dropped.
+	// dentry therefore forwards reference counting directly to inode.
+	inode *inode
+}
+
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+	d := &dentry{
+		inode: inode,
+	}
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+	d.inode.decRef(ctx)
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+	if d.inode.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+	// that d was deleted.
+	deleted := d.vfsd.IsDead()
+
+	d.inode.fs.mu.RLock()
+	// The ordering below is important, Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
+	}
+	d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
+	d.inode.fs.mu.RUnlock()
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	return &d.inode.watches
+}
+
+// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
+func (d *dentry) OnZeroWatches(context.Context) {}
+
+// inode represents a filesystem object.
+type inode struct {
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// A reference is held on all inodes as long as they are reachable in the
+	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
+	// nlink reaches 0.
+	refs inodeRefs
+
+	// xattrs implements extended attributes.
+	//
+	// TODO(b/148380782): Support xattrs other than user.*
+	xattrs memxattr.SimpleExtendedAttributes
+
+	// Inode metadata. Writing multiple fields atomically requires holding
+	// mu, othewise atomic operations can be used.
+	mu    sync.Mutex
+	mode  uint32 // file type and mode
+	nlink uint32 // protected by filesystem.mu instead of inode.mu
+	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32 // auth.KGID, but ...
+	ino   uint64 // immutable
+
+	// Linux's tmpfs has no concept of btime.
+	atime int64 // nanoseconds
+	ctime int64 // nanoseconds
+	mtime int64 // nanoseconds
+
+	locks vfs.FileLocks
+
+	// Inotify watches for this inode.
+	watches vfs.Watches
+
+	impl interface{} // immutable
+}
+
+const maxLinks = math.MaxUint32
+
+func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) {
+	if mode.FileType() == 0 {
+		panic("file type is required in FileMode")
+	}
+	i.fs = fs
+	i.mode = uint32(mode)
+	i.uid = uint32(kuid)
+	i.gid = uint32(kgid)
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	// Tmpfs creation sets atime, ctime, and mtime to current time.
+	now := fs.clock.Now().Nanoseconds()
+	i.atime = now
+	i.ctime = now
+	i.mtime = now
+	// i.nlink initialized by caller
+	i.impl = impl
+	i.refs.EnableLeakCheck()
+}
+
+// incLinksLocked increments i's link count.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
+func (i *inode) incLinksLocked() {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.incLinksLocked() called with no existing links")
+	}
+	if i.nlink == maxLinks {
+		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
+	}
+	atomic.AddUint32(&i.nlink, 1)
+}
+
+// decLinksLocked decrements i's link count. If the link count reaches 0, we
+// remove a reference on i as well.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+func (i *inode) decLinksLocked(ctx context.Context) {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.decLinksLocked() called with no existing links")
+	}
+	if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
+		i.decRef(ctx)
+	}
+}
+
+func (i *inode) incRef() {
+	i.refs.IncRef()
+}
+
+func (i *inode) tryIncRef() bool {
+	return i.refs.TryIncRef()
+}
+
+func (i *inode) decRef(ctx context.Context) {
+	i.refs.DecRef(func() {
+		i.watches.HandleDeletion(ctx)
+		if regFile, ok := i.impl.(*regularFile); ok {
+			// Release memory used by regFile to store data. Since regFile is
+			// no longer usable, we don't need to grab any locks or update any
+			// metadata.
+			regFile.data.DropAll(regFile.memFile)
+		}
+	})
+}
+
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
+}
+
+// Go won't inline this function, and returning linux.Statx (which is quite
+// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
+// output parameter.
+//
+// Note that Linux does not guarantee to return consistent data (in the case of
+// a concurrent modification), so we do not require holding inode.mu.
+func (i *inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
+		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
+		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
+		linux.STATX_MTIME
+	stat.Blksize = usermem.PageSize
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	stat.Atime = linux.NsecToStatxTimestamp(i.atime)
+	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime)
+	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime)
+	stat.DevMajor = linux.UNNAMED_MAJOR
+	stat.DevMinor = i.fs.devMinor
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(atomic.LoadUint64(&impl.size))
+		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
+		// too expensive to compute here. Cache it in regularFile.
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *directory:
+		// "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
+		stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren)))
+		// stat.Blocks is 0.
+	case *symlink:
+		stat.Size = uint64(len(impl.target))
+		// stat.Blocks is 0.
+	case *namedPipe, *socketFile:
+		// stat.Size and stat.Blocks are 0.
+	case *deviceFile:
+		// stat.Size and stat.Blocks are 0.
+		stat.RdevMajor = impl.major
+		stat.RdevMinor = impl.minor
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
+	stat := &opts.Stat
+	if stat.Mask == 0 {
+		return nil
+	}
+	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
+		return syserror.EPERM
+	}
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
+		return err
+	}
+	i.mu.Lock()
+	defer i.mu.Unlock()
+	var (
+		needsMtimeBump bool
+		needsCtimeBump bool
+	)
+	mask := stat.Mask
+	if mask&linux.STATX_MODE != 0 {
+		ft := atomic.LoadUint32(&i.mode) & linux.S_IFMT
+		atomic.StoreUint32(&i.mode, ft|uint32(stat.Mode&^linux.S_IFMT))
+		needsCtimeBump = true
+	}
+	if mask&linux.STATX_UID != 0 {
+		atomic.StoreUint32(&i.uid, stat.UID)
+		needsCtimeBump = true
+	}
+	if mask&linux.STATX_GID != 0 {
+		atomic.StoreUint32(&i.gid, stat.GID)
+		needsCtimeBump = true
+	}
+	if mask&linux.STATX_SIZE != 0 {
+		switch impl := i.impl.(type) {
+		case *regularFile:
+			updated, err := impl.truncateLocked(stat.Size)
+			if err != nil {
+				return err
+			}
+			if updated {
+				needsMtimeBump = true
+				needsCtimeBump = true
+			}
+		case *directory:
+			return syserror.EISDIR
+		default:
+			return syserror.EINVAL
+		}
+	}
+	now := i.fs.clock.Now().Nanoseconds()
+	if mask&linux.STATX_ATIME != 0 {
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.atime, now)
+		} else {
+			atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
+		}
+		needsCtimeBump = true
+	}
+	if mask&linux.STATX_MTIME != 0 {
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.mtime, now)
+		} else {
+			atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
+		}
+		needsCtimeBump = true
+		// Ignore the mtime bump, since we just set it ourselves.
+		needsMtimeBump = false
+	}
+	if mask&linux.STATX_CTIME != 0 {
+		if stat.Ctime.Nsec == linux.UTIME_NOW {
+			atomic.StoreInt64(&i.ctime, now)
+		} else {
+			atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
+		}
+		// Ignore the ctime bump, since we just set it ourselves.
+		needsCtimeBump = false
+	}
+	if needsMtimeBump {
+		atomic.StoreInt64(&i.mtime, now)
+	}
+	if needsCtimeBump {
+		atomic.StoreInt64(&i.ctime, now)
+	}
+
+	return nil
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+	return (size + 511) / 512
+}
+
+func (i *inode) direntType() uint8 {
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		return linux.DT_REG
+	case *directory:
+		return linux.DT_DIR
+	case *symlink:
+		return linux.DT_LNK
+	case *socketFile:
+		return linux.DT_SOCK
+	case *namedPipe:
+		return linux.DT_FIFO
+	case *deviceFile:
+		switch impl.kind {
+		case vfs.BlockDevice:
+			return linux.DT_BLK
+		case vfs.CharDevice:
+			return linux.DT_CHR
+		default:
+			panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind))
+		}
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+func (i *inode) isDir() bool {
+	return linux.FileMode(i.mode).FileType() == linux.S_IFDIR
+}
+
+func (i *inode) touchAtime(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now := i.fs.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.atime, now)
+	i.mu.Unlock()
+	mnt.EndWrite()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCtime() {
+	now := i.fs.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.ctime, now)
+	i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
+func (i *inode) touchCMtime() {
+	now := i.fs.clock.Now().Nanoseconds()
+	i.mu.Lock()
+	atomic.StoreInt64(&i.mtime, now)
+	atomic.StoreInt64(&i.ctime, now)
+	i.mu.Unlock()
+}
+
+// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
+// inode.mu.
+func (i *inode) touchCMtimeLocked() {
+	now := i.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&i.mtime, now)
+	atomic.StoreInt64(&i.ctime, now)
+}
+
+func (i *inode) listxattr(size uint64) ([]string, error) {
+	return i.xattrs.Listxattr(size)
+}
+
+func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+	if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+		return "", err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return "", syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return "", syserror.ENODATA
+	}
+	return i.xattrs.Getxattr(opts)
+}
+
+func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Setxattr(opts)
+}
+
+func (i *inode) removexattr(creds *auth.Credentials, name string) error {
+	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+		return err
+	}
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	if !i.userXattrSupported() {
+		return syserror.EPERM
+	}
+	return i.xattrs.Removexattr(name)
+}
+
+// Extended attributes in the user.* namespace are only supported for regular
+// files and directories.
+func (i *inode) userXattrSupported() bool {
+	filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
+	return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+}
+
+// fileDescription is embedded by tmpfs implementations of
+// vfs.FileDescriptionImpl.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+}
+
+func (fd *fileDescription) filesystem() *filesystem {
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
+}
+
+func (fd *fileDescription) dentry() *dentry {
+	return fd.vfsfd.Dentry().Impl().(*dentry)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.dentry().inode
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	d := fd.dentry()
+	if err := d.inode.setStat(ctx, creds, &opts); err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+	}
+	return nil
+}
+
+// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
+func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.inode().listxattr(size)
+}
+
+// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
+func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
+	return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
+func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+	d := fd.dentry()
+	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
+func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+	d := fd.dentry()
+	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+		return err
+	}
+
+	// Generate inotify events.
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// NewMemfd creates a new tmpfs regular file and file description that can back
+// an anonymous fd created by memfd_create.
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
+	fs, ok := mount.Filesystem().Impl().(*filesystem)
+	if !ok {
+		panic("NewMemfd() called with non-tmpfs mount")
+	}
+
+	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
+	// S_IRWXUGO.
+	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+	rf := inode.impl.(*regularFile)
+	if allowSeals {
+		rf.seals = 0
+	}
+
+	d := fs.newDentry(inode)
+	defer d.DecRef(ctx)
+	d.name = name
+
+	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
+	// FMODE_READ | FMODE_WRITE.
+	var fd regularFileFD
+	fd.Init(&inode.locks)
+	flags := uint32(linux.O_RDWR)
+	if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
+// filesystem state is in-memory.
+func (*fileDescription) Sync(context.Context) error {
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
new file mode 100644
index 000000000..6f3e3ae6f
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -0,0 +1,156 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// nextFileID is used to generate unique file names.
+var nextFileID int64
+
+// newTmpfsRoot creates a new tmpfs mount, and returns the root. If the error
+// is not nil, then cleanup should be called when the root is no longer needed.
+func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentry, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+	}
+
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+	}
+	root := mntns.Root()
+	return vfsObj, root, func() {
+		root.DecRef(ctx)
+		mntns.DecRef(ctx)
+	}, nil
+}
+
+// newFileFD creates a new file in a new tmpfs mount, and returns the FD. If
+// the returned err is not nil, then cleanup should be called when the FD is no
+// longer needed.
+func newFileFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	filename := fmt.Sprintf("tmpfs-test-file-%d", atomic.AddInt64(&nextFileID, 1))
+
+	// Create the file that will be write/read.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  linux.ModeRegular | mode,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+	}
+
+	return fd, cleanup, nil
+}
+
+// newDirFD is like newFileFD, but for directories.
+func newDirFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	dirname := fmt.Sprintf("tmpfs-test-dir-%d", atomic.AddInt64(&nextFileID, 1))
+
+	// Create the dir.
+	if err := vfsObj.MkdirAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dirname),
+	}, &vfs.MkdirOptions{
+		Mode: linux.ModeDirectory | mode,
+	}); err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create directory %q: %v", dirname, err)
+	}
+
+	// Open the dir and return it.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(dirname),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY | linux.O_DIRECTORY,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to open directory %q: %v", dirname, err)
+	}
+
+	return fd, cleanup, nil
+}
+
+// newPipeFD is like newFileFD, but for pipes.
+func newPipeFD(ctx context.Context, mode linux.FileMode) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+	vfsObj, root, cleanup, err := newTmpfsRoot(ctx)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	name := fmt.Sprintf("tmpfs-test-%d", atomic.AddInt64(&nextFileID, 1))
+
+	if err := vfsObj.MknodAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(name),
+	}, &vfs.MknodOptions{
+		Mode: linux.ModeNamedPipe | mode,
+	}); err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to create pipe %q: %v", name, err)
+	}
+
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(name),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		cleanup()
+		return nil, nil, fmt.Errorf("failed to open pipe %q: %v", name, err)
+	}
+
+	return fd, cleanup, nil
+}
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
new file mode 100644
index 000000000..28d2a4bcb
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -0,0 +1,23 @@
+load("//tools:defs.bzl", "go_library")
+
+licenses(["notice"])
+
+go_library(
+    name = "verity",
+    srcs = [
+        "filesystem.go",
+        "verity.go",
+    ],
+    visibility = ["//pkg/sentry:internal"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/socket/unix/transport",
+        "//pkg/sentry/vfs",
+        "//pkg/sync",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
new file mode 100644
index 000000000..78c6074bd
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -0,0 +1,333 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All files should be read-only.
+	return nil
+}
+
+var dentrySlicePool = sync.Pool{
+	New: func() interface{} {
+		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
+		return &ds
+	},
+}
+
+func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
+	if ds == nil {
+		ds = dentrySlicePool.Get().(*[]*dentry)
+	}
+	*ds = append(*ds, d)
+	return ds
+}
+
+// Preconditions: ds != nil.
+func putDentrySlice(ds *[]*dentry) {
+	// Allow dentries to be GC'd.
+	for i := range *ds {
+		(*ds)[i] = nil
+	}
+	*ds = (*ds)[:0]
+	dentrySlicePool.Put(ds)
+}
+
+// renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
+// dentry.checkDropLocked on all dentries in *ds with fs.renameMu locked for
+// writing.
+//
+// ds is a pointer-to-pointer since defer evaluates its arguments immediately,
+// but dentry slices are allocated lazily, and it's much easier to say "defer
+// fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
+// fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
+func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+	fs.renameMu.RUnlock()
+	if *ds == nil {
+		return
+	}
+	if len(**ds) != 0 {
+		fs.renameMu.Lock()
+		for _, d := range **ds {
+			d.checkDropLocked(ctx)
+		}
+		fs.renameMu.Unlock()
+	}
+	putDentrySlice(*ds)
+}
+
+func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
+	if *ds == nil {
+		fs.renameMu.Unlock()
+		return
+	}
+	for _, d := range **ds {
+		d.checkDropLocked(ctx)
+	}
+	fs.renameMu.Unlock()
+	putDentrySlice(*ds)
+}
+
+// resolveLocked resolves rp to an existing file.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	// TODO(b/159261227): Implement resolveLocked.
+	return nil, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// Preconditions: fs.renameMu must be locked. !rp.Done().
+func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
+	// TODO(b/159261227): Implement walkParentDirLocked.
+	return nil, nil
+}
+
+// AccessAt implements vfs.Filesystem.Impl.AccessAt.
+func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
+	// Verity file system is read-only.
+	if ats&vfs.MayWrite != 0 {
+		return syserror.EROFS
+	}
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return err
+	}
+	return d.checkPermissions(creds, ats)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable {
+		if !d.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	start := rp.Start().Impl().(*dentry)
+	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	//TODO(b/159261227): Implement OpenAt.
+	return nil, nil
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	//TODO(b/162787271): Provide integrity check for ReadlinkAt.
+	return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	})
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+
+	var stat linux.Statx
+	stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	}, &opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	// TODO(b/159261227): Implement StatFSAt.
+	return linux.Statfs{}, nil
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if _, err := fs.resolveLocked(ctx, rp, &ds); err != nil {
+		return nil, err
+	}
+	return nil, syserror.ECONNREFUSED
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return nil, err
+	}
+	lowerVD := d.lowerVD
+	return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, size)
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
+	if err != nil {
+		return "", err
+	}
+	lowerVD := d.lowerVD
+	return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &opts)
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	// Verity file system is read-only.
+	return syserror.EROFS
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.renameMu.RLock()
+	defer fs.renameMu.RUnlock()
+	mnt := vd.Mount()
+	d := vd.Dentry().Impl().(*dentry)
+	for {
+		if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() {
+			return vfs.PrependPathAtVFSRootError{}
+		}
+		if &d.vfsd == mnt.Root() {
+			return nil
+		}
+		if d.parent == nil {
+			return vfs.PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
new file mode 100644
index 000000000..cb29d33a5
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -0,0 +1,355 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package verity provides a filesystem implementation that is a wrapper of
+// another file system.
+// The verity file system provides integrity check for the underlying file
+// system by providing verification for path traversals and each read.
+// The verity file system is read-only, except for one case: when
+// allowRuntimeEnable is true, additional Merkle files can be generated using
+// the FS_IOC_ENABLE_VERITY ioctl.
+package verity
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Name is the default filesystem name.
+const Name = "verity"
+
+// testOnlyDebugging allows verity file system to return error instead of
+// crashing the application when a malicious action is detected. This should
+// only be set for tests.
+var testOnlyDebugging bool
+
+// FilesystemType implements vfs.FilesystemType.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// creds is a copy of the filesystem's creator's credentials, which are
+	// used for accesses to the underlying file system. creds is immutable.
+	creds *auth.Credentials
+
+	// allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY
+	// to build Merkle trees in the verity file system is allowed. If this
+	// is false, no new Merkle trees can be built, and only the files that
+	// had Merkle trees before startup (e.g. from a host filesystem mounted
+	// with gofer fs) can be verified.
+	allowRuntimeEnable bool
+
+	// lowerMount is the underlying file system mount.
+	lowerMount *vfs.Mount
+
+	// rootDentry is the mount root Dentry for this file system, which
+	// stores the root hash of the whole file system in bytes.
+	rootDentry *dentry
+
+	// renameMu synchronizes renaming with non-renaming operations in order
+	// to ensure consistent lock ordering between dentry.dirMu in different
+	// dentries.
+	renameMu sync.RWMutex
+}
+
+// InternalFilesystemOptions may be passed as
+// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+type InternalFilesystemOptions struct {
+	// RootMerkleFileName is the name of the verity root Merkle tree file.
+	RootMerkleFileName string
+
+	// LowerName is the name of the filesystem wrapped by verity fs.
+	LowerName string
+
+	// RootHash is the root hash of the overall verity file system.
+	RootHash []byte
+
+	// AllowRuntimeEnable specifies whether the verity file system allows
+	// enabling verification for files (i.e. building Merkle trees) during
+	// runtime.
+	AllowRuntimeEnable bool
+
+	// LowerGetFSOptions is the file system option for the lower layer file
+	// system wrapped by verity file system.
+	LowerGetFSOptions vfs.GetFilesystemOptions
+
+	// TestOnlyDebugging allows verity file system to return error instead
+	// of crashing the application when a malicious action is detected. This
+	// should only be set for tests.
+	TestOnlyDebugging bool
+}
+
+// Name implements vfs.FilesystemType.Name.
+func (FilesystemType) Name() string {
+	return Name
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	//TODO(b/159261227): Implement GetFilesystem.
+	return nil, nil, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release.
+func (fs *filesystem) Release(ctx context.Context) {
+	fs.lowerMount.DecRef(ctx)
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// mode, uid and gid are the file mode, owner, and group of the file in
+	// the underlying file system.
+	mode uint32
+	uid  uint32
+	gid  uint32
+
+	// parent is the dentry corresponding to this dentry's parent directory.
+	// name is this dentry's name in parent. If this dentry is a filesystem
+	// root, parent is nil and name is the empty string. parent and name are
+	// protected by fs.renameMu.
+	parent *dentry
+	name   string
+
+	// If this dentry represents a directory, children maps the names of
+	// children for which dentries have been instantiated to those dentries,
+	// and dirents (if not nil) is a cache of dirents as returned by
+	// directoryFDs representing this directory. children is protected by
+	// dirMu.
+	dirMu    sync.Mutex
+	children map[string]*dentry
+
+	// lowerVD is the VirtualDentry in the underlying file system.
+	lowerVD vfs.VirtualDentry
+
+	// lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree
+	// in the underlying file system.
+	lowerMerkleVD vfs.VirtualDentry
+
+	// rootHash is the rootHash for the current file or directory.
+	rootHash []byte
+}
+
+// newDentry creates a new dentry representing the given verity file. The
+// dentry initially has no references; it is the caller's responsibility to set
+// the dentry's reference count and/or call dentry.destroy() as appropriate.
+// The dentry is initially invalid in that it contains no underlying dentry;
+// the caller is responsible for setting them.
+func (fs *filesystem) newDentry() *dentry {
+	d := &dentry{
+		fs: fs,
+	}
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *dentry) IncRef() {
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *dentry) DecRef(ctx context.Context) {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked(ctx)
+		d.fs.renameMu.Unlock()
+	} else if refs < 0 {
+		panic("verity.dentry.DecRef() called without holding a reference")
+	}
+}
+
+// checkDropLocked should be called after d's reference count becomes 0 or it
+// becomes deleted.
+func (d *dentry) checkDropLocked(ctx context.Context) {
+	// Dentries with a positive reference count must be retained. Dentries
+	// with a negative reference count have already been destroyed.
+	if atomic.LoadInt64(&d.refs) != 0 {
+		return
+	}
+	// Refs is still zero; destroy it.
+	d.destroyLocked(ctx)
+	return
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+func (d *dentry) destroyLocked(ctx context.Context) {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("verity.dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("verity.dentry.destroyLocked() called with references on the dentry")
+	}
+
+	if d.lowerVD.Ok() {
+		d.lowerVD.DecRef(ctx)
+	}
+
+	if d.lowerMerkleVD.Ok() {
+		d.lowerMerkleVD.DecRef(ctx)
+	}
+
+	if d.parent != nil {
+		d.parent.dirMu.Lock()
+		if !d.vfsd.IsDead() {
+			delete(d.parent.children, d.name)
+		}
+		d.parent.dirMu.Unlock()
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.checkDropLocked(ctx)
+		} else if refs < 0 {
+			panic("verity.dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
+// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
+func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
+	//TODO(b/159261227): Implement InotifyWithParent.
+}
+
+// Watches implements vfs.DentryImpl.Watches.
+func (d *dentry) Watches() *vfs.Watches {
+	//TODO(b/159261227): Implement Watches.
+	return nil
+}
+
+// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
+func (d *dentry) OnZeroWatches(context.Context) {
+	//TODO(b/159261227): Implement OnZeroWatches.
+}
+
+func (d *dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) isDir() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
+}
+
+func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
+	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	})
+}
+
+// FileDescription implements vfs.FileDescriptionImpl for verity fds.
+// FileDescription is a wrapper of the underlying lowerFD, with support to build
+// Merkle trees through the Linux fs-verity API to verify contents read from
+// lowerFD.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	// d is the corresponding dentry to the fileDescription.
+	d *dentry
+
+	// isDir specifies whehter the fileDescription points to a directory.
+	isDir bool
+
+	// lowerFD is the FileDescription corresponding to the file in the
+	// underlying file system.
+	lowerFD *vfs.FileDescription
+
+	// merkleReader is the read-only FileDescription corresponding to the
+	// Merkle tree file in the underlying file system.
+	merkleReader *vfs.FileDescription
+
+	// merkleWriter is the FileDescription corresponding to the Merkle tree
+	// file in the underlying file system for writing. This should only be
+	// used when allowRuntimeEnable is set to true.
+	merkleWriter *vfs.FileDescription
+
+	// parentMerkleWriter is the FileDescription of the Merkle tree for the
+	// directory that contains the current file/directory. This is only used
+	// if allowRuntimeEnable is set to true.
+	parentMerkleWriter *vfs.FileDescription
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+	fd.lowerFD.DecRef(ctx)
+	fd.merkleReader.DecRef(ctx)
+	if fd.merkleWriter != nil {
+		fd.merkleWriter.DecRef(ctx)
+	}
+	if fd.parentMerkleWriter != nil {
+		fd.parentMerkleWriter.DecRef(ctx)
+	}
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	// TODO(b/162788573): Add integrity check for metadata.
+	stat, err := fd.lowerFD.Stat(ctx, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	// Verity files are read-only.
+	return syserror.EPERM
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+}
author	Ian Lewis <ianmlewis@gmail.com>	2020-08-17 21:44:31 -0400
committer	Ian Lewis <ianmlewis@gmail.com>	2020-08-17 21:44:31 -0400
commit	ac324f646ee3cb7955b0b45a7453aeb9671cbdf1 (patch)
tree	0cbc5018e8807421d701d190dc20525726c7ca76 /pkg/sentry/fsimpl
parent	352ae1022ce19de28fc72e034cc469872ad79d06 (diff)
parent	6d0c5803d557d453f15ac6f683697eeb46dab680 (diff)