144 files changed, 10298 insertions, 3247 deletions
diff --git a/pkg/sentry/fsimpl/devpts/BUILD b/pkg/sentry/fsimpl/devpts/BUILD
index 93512c9b6..6af3c3781 100644
--- a/pkg/sentry/fsimpl/devpts/BUILD
+++ b/pkg/sentry/fsimpl/devpts/BUILD
@@ -1,7 +1,19 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "root_inode_refs",
+    out = "root_inode_refs.go",
+    package = "devpts",
+    prefix = "rootInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "rootInode",
+    },
+)
+
 go_library(
     name = "devpts",
     srcs = [
@@ -9,15 +21,22 @@ go_library(
         "line_discipline.go",
         "master.go",
         "queue.go",
-        "slave.go",
+        "replica.go",
+        "root_inode_refs.go",
         "terminal.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
+        "//pkg/sentry/fs",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
diff --git a/pkg/sentry/fsimpl/devpts/devpts.go b/pkg/sentry/fsimpl/devpts/devpts.go
index 7169e91af..9185877f6 100644
--- a/pkg/sentry/fsimpl/devpts/devpts.go
+++ b/pkg/sentry/fsimpl/devpts/devpts.go
@@ -35,29 +35,56 @@ import (
 const Name = "devpts"
 
 // FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
+//
+// +stateify savable
+type FilesystemType struct {
+	initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1663): not yet supported.
+	initErr  error
+
+	// fs backs all mounts of this FilesystemType. root is fs' root. fs and root
+	// are immutable.
+	fs   *vfs.Filesystem
+	root *vfs.Dentry
+}
 
 // Name implements vfs.FilesystemType.Name.
-func (FilesystemType) Name() string {
+func (*FilesystemType) Name() string {
 	return Name
 }
 
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+func (fstype *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// No data allowed.
 	if opts.Data != "" {
 		return nil, nil, syserror.EINVAL
 	}
 
-	fs, root, err := fstype.newFilesystem(vfsObj, creds)
-	if err != nil {
-		return nil, nil, err
+	fstype.initOnce.Do(func() {
+		fs, root, err := fstype.newFilesystem(ctx, vfsObj, creds)
+		if err != nil {
+			fstype.initErr = err
+			return
+		}
+		fstype.fs = fs.VFSFilesystem()
+		fstype.root = root.VFSDentry()
+	})
+	if fstype.initErr != nil {
+		return nil, nil, fstype.initErr
+	}
+	fstype.fs.IncRef()
+	fstype.root.IncRef()
+	return fstype.fs, fstype.root, nil
+}
+
+// Release implements vfs.FilesystemType.Release.
+func (fstype *FilesystemType) Release(ctx context.Context) {
+	if fstype.fs != nil {
+		fstype.root.DecRef(ctx)
+		fstype.fs.DecRef(ctx)
 	}
-	return fs.Filesystem.VFSFilesystem(), root.VFSDentry(), nil
 }
 
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -66,7 +93,7 @@ type filesystem struct {
 
 // newFilesystem creates a new devpts filesystem with root directory and ptmx
 // master inode. It returns the filesystem and root Dentry.
-func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
+func (fstype *FilesystemType) newFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*filesystem, *kernfs.Dentry, error) {
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
@@ -79,27 +106,29 @@ func (fstype FilesystemType) newFilesystem(vfsObj *vfs.VirtualFilesystem, creds
 
 	// Construct the root directory. This is always inode id 1.
 	root := &rootInode{
-		slaves: make(map[uint32]*slaveInode),
+		replicas: make(map[uint32]*replicaInode),
 	}
-	root.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
+	root.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 1, linux.ModeDirectory|0555)
 	root.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	root.dentry.Init(root)
+	root.EnableLeakCheck()
+
+	var rootD kernfs.Dentry
+	rootD.Init(&fs.Filesystem, root)
 
 	// Construct the pts master inode and dentry. Linux always uses inode
 	// id 2 for ptmx. See fs/devpts/inode.c:mknod_ptmx.
 	master := &masterInode{
 		root: root,
 	}
-	master.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
-	master.dentry.Init(master)
+	master.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, devMinor, 2, linux.ModeCharacterDevice|0666)
 
 	// Add the master as a child of the root.
-	links := root.OrderedChildren.Populate(&root.dentry, map[string]*kernfs.Dentry{
-		"ptmx": &master.dentry,
+	links := root.OrderedChildren.Populate(map[string]kernfs.Inode{
+		"ptmx": master,
 	})
 	root.IncLinks(links)
 
-	return fs, &root.dentry, nil
+	return fs, &rootD, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -109,29 +138,28 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // rootInode is the root directory inode for the devpts mounts.
+//
+// +stateify savable
 type rootInode struct {
-	kernfs.AlwaysValid
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
 	kernfs.InodeDirectoryNoNewChildren
 	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
 	kernfs.OrderedChildren
+	rootInodeRefs
 
 	locks vfs.FileLocks
 
-	// Keep a reference to this inode's dentry.
-	dentry kernfs.Dentry
-
 	// master is the master pty inode. Immutable.
 	master *masterInode
 
-	// root is the root directory inode for this filesystem. Immutable.
-	root *rootInode
-
 	// mu protects the fields below.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
-	// slaves maps pty ids to slave inodes.
-	slaves map[uint32]*slaveInode
+	// replicas maps pty ids to replica inodes.
+	replicas map[uint32]*replicaInode
 
 	// nextIdx is the next pty index to use. Must be accessed atomically.
 	//
@@ -142,7 +170,7 @@ type rootInode struct {
 var _ kernfs.Inode = (*rootInode)(nil)
 
 // allocateTerminal creates a new Terminal and installs a pts node for it.
-func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error) {
+func (i *rootInode) allocateTerminal(ctx context.Context, creds *auth.Credentials) (*Terminal, error) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
 	if i.nextIdx == math.MaxUint32 {
@@ -151,41 +179,46 @@ func (i *rootInode) allocateTerminal(creds *auth.Credentials) (*Terminal, error)
 	idx := i.nextIdx
 	i.nextIdx++
 
-	// Sanity check that slave with idx does not exist.
-	if _, ok := i.slaves[idx]; ok {
+	// Sanity check that replica with idx does not exist.
+	if _, ok := i.replicas[idx]; ok {
 		panic(fmt.Sprintf("pty index collision; index %d already exists", idx))
 	}
 
-	// Create the new terminal and slave.
+	// Create the new terminal and replica.
 	t := newTerminal(idx)
-	slave := &slaveInode{
+	replica := &replicaInode{
 		root: i,
 		t:    t,
 	}
 	// Linux always uses pty index + 3 as the inode id. See
 	// fs/devpts/inode.c:devpts_pty_new().
-	slave.InodeAttrs.Init(creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
-	slave.dentry.Init(slave)
-	i.slaves[idx] = slave
+	replica.InodeAttrs.Init(ctx, creds, i.InodeAttrs.DevMajor(), i.InodeAttrs.DevMinor(), uint64(idx+3), linux.ModeCharacterDevice|0600)
+	i.replicas[idx] = replica
 
 	return t, nil
 }
 
 // masterClose is called when the master end of t is closed.
-func (i *rootInode) masterClose(t *Terminal) {
+func (i *rootInode) masterClose(ctx context.Context, t *Terminal) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
 
-	// Sanity check that slave with idx exists.
-	if _, ok := i.slaves[t.n]; !ok {
+	// Sanity check that replica with idx exists.
+	ri, ok := i.replicas[t.n]
+	if !ok {
 		panic(fmt.Sprintf("pty with index %d does not exist", t.n))
 	}
-	delete(i.slaves, t.n)
+
+	// Drop the ref on replica inode taken during rootInode.allocateTerminal.
+	ri.DecRef(ctx)
+	delete(i.replicas, t.n)
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -193,27 +226,34 @@ func (i *rootInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
 }
 
 // Lookup implements kernfs.Inode.Lookup.
-func (i *rootInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (i *rootInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	// Check if a static entry was looked up.
+	if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+		return d, nil
+	}
+
+	// Not a static entry.
 	idx, err := strconv.ParseUint(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
 	}
 	i.mu.Lock()
 	defer i.mu.Unlock()
-	if si, ok := i.slaves[uint32(idx)]; ok {
-		si.dentry.IncRef()
-		return si.dentry.VFSDentry(), nil
+	if ri, ok := i.replicas[uint32(idx)]; ok {
+		ri.IncRef() // This ref is passed to the dentry upon creation via Init.
+		return ri, nil
 
 	}
 	return nil, syserror.ENOENT
 }
 
 // IterDirents implements kernfs.Inode.IterDirents.
-func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+func (i *rootInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	i.mu.Lock()
 	defer i.mu.Unlock()
-	ids := make([]int, 0, len(i.slaves))
-	for id := range i.slaves {
+	i.InodeAttrs.TouchAtime(ctx, mnt)
+	ids := make([]int, 0, len(i.replicas))
+	for id := range i.replicas {
 		ids = append(ids, int(id))
 	}
 	sort.Ints(ids)
@@ -221,7 +261,7 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
 		dirent := vfs.Dirent{
 			Name:    strconv.FormatUint(uint64(id), 10),
 			Type:    linux.DT_CHR,
-			Ino:     i.slaves[uint32(id)].InodeAttrs.Ino(),
+			Ino:     i.replicas[uint32(id)].InodeAttrs.Ino(),
 			NextOff: offset + 1,
 		}
 		if err := cb.Handle(dirent); err != nil {
@@ -231,3 +271,16 @@ func (i *rootInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback,
 	}
 	return offset, nil
 }
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *rootInode) DecRef(ctx context.Context) {
+	i.rootInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.DEVPTS_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/devpts/devpts_test.go b/pkg/sentry/fsimpl/devpts/devpts_test.go
index b7c149047..448390cfe 100644
--- a/pkg/sentry/fsimpl/devpts/devpts_test.go
+++ b/pkg/sentry/fsimpl/devpts/devpts_test.go
@@ -22,8 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func TestSimpleMasterToSlave(t *testing.T) {
-	ld := newLineDiscipline(linux.DefaultSlaveTermios)
+func TestSimpleMasterToReplica(t *testing.T) {
+	ld := newLineDiscipline(linux.DefaultReplicaTermios)
 	ctx := contexttest.Context(t)
 	inBytes := []byte("hello, tty\n")
 	src := usermem.BytesIOSequence(inBytes)
diff --git a/pkg/sentry/fsimpl/devpts/line_discipline.go b/pkg/sentry/fsimpl/devpts/line_discipline.go
index f7bc325d1..ae95fdd08 100644
--- a/pkg/sentry/fsimpl/devpts/line_discipline.go
+++ b/pkg/sentry/fsimpl/devpts/line_discipline.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -41,7 +42,7 @@ const (
 )
 
 // lineDiscipline dictates how input and output are handled between the
-// pseudoterminal (pty) master and slave. It can be configured to alter I/O,
+// pseudoterminal (pty) master and replica. It can be configured to alter I/O,
 // modify control characters (e.g. Ctrl-C for SIGINT), etc. The following man
 // pages are good resources for how to affect the line discipline:
 //
@@ -52,8 +53,8 @@ const (
 //
 // lineDiscipline has a simple structure but supports a multitude of options
 // (see the above man pages). It consists of two queues of bytes: one from the
-// terminal master to slave (the input queue) and one from slave to master (the
-// output queue). When bytes are written to one end of the pty, the line
+// terminal master to replica (the input queue) and one from replica to master
+// (the output queue). When bytes are written to one end of the pty, the line
 // discipline reads the bytes, modifies them or takes special action if
 // required, and enqueues them to be read by the other end of the pty:
 //
@@ -62,7 +63,7 @@ const (
 //    |   (inputQueueWrite)     +-------------+     (inputQueueRead)      |
 //    |                                                                   |
 //    |                                                                   v
-// masterFD                                                            slaveFD
+// masterFD                                                           replicaFD
 //    ^                                                                   |
 //    |                                                                   |
 //    |   output to terminal   +--------------+    output from process    |
@@ -99,10 +100,10 @@ type lineDiscipline struct {
 	column int
 
 	// masterWaiter is used to wait on the master end of the TTY.
-	masterWaiter waiter.Queue `state:"zerovalue"`
+	masterWaiter waiter.Queue
 
-	// slaveWaiter is used to wait on the slave end of the TTY.
-	slaveWaiter waiter.Queue `state:"zerovalue"`
+	// replicaWaiter is used to wait on the replica end of the TTY.
+	replicaWaiter waiter.Queue
 }
 
 func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
@@ -113,27 +114,23 @@ func newLineDiscipline(termios linux.KernelTermios) *lineDiscipline {
 }
 
 // getTermios gets the linux.Termios for the tty.
-func (l *lineDiscipline) getTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) getTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	// We must copy a Termios struct, not KernelTermios.
 	t := l.termios.ToTermios()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // setTermios sets a linux.Termios for the tty.
-func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+func (l *lineDiscipline) setTermios(task *kernel.Task, args arch.SyscallArguments) (uintptr, error) {
 	l.termiosMu.Lock()
 	defer l.termiosMu.Unlock()
 	oldCanonEnabled := l.termios.LEnabled(linux.ICANON)
 	// We must copy a Termios struct, not KernelTermios.
 	var t linux.Termios
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &t, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := t.CopyIn(task, args[2].Pointer())
 	l.termios.FromTermios(t)
 
 	// If canonical mode is turned off, move bytes from inQueue's wait
@@ -144,27 +141,23 @@ func (l *lineDiscipline) setTermios(ctx context.Context, io usermem.IO, args arc
 		l.inQueue.pushWaitBufLocked(l)
 		l.inQueue.readable = true
 		l.inQueue.mu.Unlock()
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 	}
 
 	return 0, err
 }
 
-func (l *lineDiscipline) windowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) windowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyOut(t, args[2].Pointer())
 	return err
 }
 
-func (l *lineDiscipline) setWindowSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (l *lineDiscipline) setWindowSize(t *kernel.Task, args arch.SyscallArguments) error {
 	l.sizeMu.Lock()
 	defer l.sizeMu.Unlock()
-	_, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &l.size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := l.size.CopyIn(t, args[2].Pointer())
 	return err
 }
 
@@ -174,14 +167,14 @@ func (l *lineDiscipline) masterReadiness() waiter.EventMask {
 	return l.inQueue.writeReadiness(&linux.MasterTermios) | l.outQueue.readReadiness(&linux.MasterTermios)
 }
 
-func (l *lineDiscipline) slaveReadiness() waiter.EventMask {
+func (l *lineDiscipline) replicaReadiness() waiter.EventMask {
 	l.termiosMu.RLock()
 	defer l.termiosMu.RUnlock()
 	return l.outQueue.writeReadiness(&l.termios) | l.inQueue.readReadiness(&l.termios)
 }
 
-func (l *lineDiscipline) inputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.inQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) inputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+	return l.inQueue.readableSize(t, io, args)
 }
 
 func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -194,7 +187,7 @@ func (l *lineDiscipline) inputQueueRead(ctx context.Context, dst usermem.IOSeque
 	if n > 0 {
 		l.masterWaiter.Notify(waiter.EventOut)
 		if pushed {
-			l.slaveWaiter.Notify(waiter.EventIn)
+			l.replicaWaiter.Notify(waiter.EventIn)
 		}
 		return n, nil
 	}
@@ -209,14 +202,14 @@ func (l *lineDiscipline) inputQueueWrite(ctx context.Context, src usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventIn)
+		l.replicaWaiter.Notify(waiter.EventIn)
 		return n, nil
 	}
 	return 0, syserror.ErrWouldBlock
 }
 
-func (l *lineDiscipline) outputQueueReadSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
-	return l.outQueue.readableSize(ctx, io, args)
+func (l *lineDiscipline) outputQueueReadSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
+	return l.outQueue.readableSize(t, io, args)
 }
 
 func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequence) (int64, error) {
@@ -227,7 +220,7 @@ func (l *lineDiscipline) outputQueueRead(ctx context.Context, dst usermem.IOSequ
 		return 0, err
 	}
 	if n > 0 {
-		l.slaveWaiter.Notify(waiter.EventOut)
+		l.replicaWaiter.Notify(waiter.EventOut)
 		if pushed {
 			l.masterWaiter.Notify(waiter.EventIn)
 		}
diff --git a/pkg/sentry/fsimpl/devpts/master.go b/pkg/sentry/fsimpl/devpts/master.go
index 3bb397f71..e91fa26a4 100644
--- a/pkg/sentry/fsimpl/devpts/master.go
+++ b/pkg/sentry/fsimpl/devpts/master.go
@@ -17,9 +17,11 @@ package devpts
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/unimpl"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -29,7 +31,10 @@ import (
 )
 
 // masterInode is the inode for the master end of the Terminal.
+//
+// +stateify savable
 type masterInode struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
@@ -37,9 +42,6 @@ type masterInode struct {
 
 	locks vfs.FileLocks
 
-	// Keep a reference to this inode's dentry.
-	dentry kernfs.Dentry
-
 	// root is the devpts root inode.
 	root *rootInode
 }
@@ -47,20 +49,18 @@ type masterInode struct {
 var _ kernfs.Inode = (*masterInode)(nil)
 
 // Open implements kernfs.Inode.Open.
-func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	t, err := mi.root.allocateTerminal(rp.Credentials())
+func (mi *masterInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	t, err := mi.root.allocateTerminal(ctx, rp.Credentials())
 	if err != nil {
 		return nil, err
 	}
 
-	mi.IncRef()
 	fd := &masterFileDescription{
 		inode: mi,
 		t:     t,
 	}
 	fd.LockFD.Init(&mi.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-		mi.DecRef(ctx)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -86,6 +86,7 @@ func (mi *masterInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds
 	return mi.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
 }
 
+// +stateify savable
 type masterFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -99,8 +100,7 @@ var _ vfs.FileDescriptionImpl = (*masterFileDescription)(nil)
 
 // Release implements vfs.FileDescriptionImpl.Release.
 func (mfd *masterFileDescription) Release(ctx context.Context) {
-	mfd.inode.root.masterClose(mfd.t)
-	mfd.inode.DecRef(ctx)
+	mfd.inode.root.masterClose(ctx, mfd.t)
 }
 
 // EventRegister implements waiter.Waitable.EventRegister.
@@ -130,46 +130,51 @@ func (mfd *masterFileDescription) Write(ctx context.Context, src usermem.IOSeque
 
 // Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (mfd *masterFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
 	switch cmd := args[1].Uint(); cmd {
 	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
 		// Get the number of bytes in the output queue read buffer.
-		return 0, mfd.t.ld.outputQueueReadSize(ctx, io, args)
+		return 0, mfd.t.ld.outputQueueReadSize(t, io, args)
 	case linux.TCGETS:
 		// N.B. TCGETS on the master actually returns the configuration
-		// of the slave end.
-		return mfd.t.ld.getTermios(ctx, io, args)
+		// of the replica end.
+		return mfd.t.ld.getTermios(t, args)
 	case linux.TCSETS:
 		// N.B. TCSETS on the master actually affects the configuration
-		// of the slave end.
-		return mfd.t.ld.setTermios(ctx, io, args)
+		// of the replica end.
+		return mfd.t.ld.setTermios(t, args)
 	case linux.TCSETSW:
 		// TODO(b/29356795): This should drain the output queue first.
-		return mfd.t.ld.setTermios(ctx, io, args)
+		return mfd.t.ld.setTermios(t, args)
 	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(mfd.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		nP := primitive.Uint32(mfd.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
 		return 0, err
 	case linux.TIOCSPTLCK:
 		// TODO(b/29356795): Implement pty locking. For now just pretend we do.
 		return 0, nil
 	case linux.TIOCGWINSZ:
-		return 0, mfd.t.ld.windowSize(ctx, io, args)
+		return 0, mfd.t.ld.windowSize(t, args)
 	case linux.TIOCSWINSZ:
-		return 0, mfd.t.ld.setWindowSize(ctx, io, args)
+		return 0, mfd.t.ld.setWindowSize(t, args)
 	case linux.TIOCSCTTY:
 		// Make the given terminal the controlling terminal of the
 		// calling process.
-		return 0, mfd.t.setControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mfd.t.setControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCNOTTY:
 		// Release this process's controlling terminal.
-		return 0, mfd.t.releaseControllingTTY(ctx, io, args, true /* isMaster */)
+		return 0, mfd.t.releaseControllingTTY(ctx, args, true /* isMaster */)
 	case linux.TIOCGPGRP:
 		// Get the foreground process group.
-		return mfd.t.foregroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mfd.t.foregroundProcessGroup(ctx, args, true /* isMaster */)
 	case linux.TIOCSPGRP:
 		// Set the foreground process group.
-		return mfd.t.setForegroundProcessGroup(ctx, io, args, true /* isMaster */)
+		return mfd.t.setForegroundProcessGroup(ctx, args, true /* isMaster */)
 	default:
 		maybeEmitUnimplementedEvent(ctx, cmd)
 		return 0, syserror.ENOTTY
diff --git a/pkg/sentry/fsimpl/devpts/queue.go b/pkg/sentry/fsimpl/devpts/queue.go
index dffb4232c..55bff3e60 100644
--- a/pkg/sentry/fsimpl/devpts/queue.go
+++ b/pkg/sentry/fsimpl/devpts/queue.go
@@ -17,8 +17,10 @@ package devpts
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -30,7 +32,7 @@ import (
 const waitBufMaxBytes = 131072
 
 // queue represents one of the input or output queues between a pty master and
-// slave. Bytes written to a queue are added to the read buffer until it is
+// replica. Bytes written to a queue are added to the read buffer until it is
 // full, at which point they are written to the wait buffer. Bytes are
 // processed (i.e. undergo termios transformations) as they are added to the
 // read buffer. The read buffer is readable when its length is nonzero and
@@ -83,17 +85,15 @@ func (q *queue) writeReadiness(t *linux.KernelTermios) waiter.EventMask {
 }
 
 // readableSize writes the number of readable bytes to userspace.
-func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.SyscallArguments) error {
+func (q *queue) readableSize(t *kernel.Task, io usermem.IO, args arch.SyscallArguments) error {
 	q.mu.Lock()
 	defer q.mu.Unlock()
-	var size int32
+	size := primitive.Int32(0)
 	if q.readable {
-		size = int32(len(q.readBuf))
+		size = primitive.Int32(len(q.readBuf))
 	}
 
-	_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), size, usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	_, err := size.CopyOut(t, args[2].Pointer())
 	return err
 
 }
@@ -102,8 +102,7 @@ func (q *queue) readableSize(ctx context.Context, io usermem.IO, args arch.Sysca
 // as whether the read caused more readable data to become available (whether
 // data was pushed from the wait buffer to the read buffer).
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipline) (int64, bool, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -143,8 +142,7 @@ func (q *queue) read(ctx context.Context, dst usermem.IOSequence, l *lineDiscipl
 
 // write writes to q from userspace.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscipline) (int64, error) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
@@ -186,8 +184,7 @@ func (q *queue) write(ctx context.Context, src usermem.IOSequence, l *lineDiscip
 
 // writeBytes writes to q from b.
 //
-// Preconditions:
-// * l.termiosMu must be held for reading.
+// Preconditions: l.termiosMu must be held for reading.
 func (q *queue) writeBytes(b []byte, l *lineDiscipline) {
 	q.mu.Lock()
 	defer q.mu.Unlock()
diff --git a/pkg/sentry/fsimpl/devpts/replica.go b/pkg/sentry/fsimpl/devpts/replica.go
new file mode 100644
index 000000000..70c68cf0a
--- /dev/null
+++ b/pkg/sentry/fsimpl/devpts/replica.go
@@ -0,0 +1,201 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devpts
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+// replicaInode is the inode for the replica end of the Terminal.
+//
+// +stateify savable
+type replicaInode struct {
+	implStatFS
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+
+	locks vfs.FileLocks
+
+	// root is the devpts root inode.
+	root *rootInode
+
+	// t is the connected Terminal.
+	t *Terminal
+}
+
+var _ kernfs.Inode = (*replicaInode)(nil)
+
+// Open implements kernfs.Inode.Open.
+func (ri *replicaInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd := &replicaFileDescription{
+		inode: ri,
+	}
+	fd.LockFD.Init(&ri.locks)
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+
+}
+
+// Valid implements kernfs.Inode.Valid.
+func (ri *replicaInode) Valid(context.Context) bool {
+	// Return valid if the replica still exists.
+	ri.root.mu.Lock()
+	defer ri.root.mu.Unlock()
+	_, ok := ri.root.replicas[ri.t.n]
+	return ok
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (ri *replicaInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	statx, err := ri.InodeAttrs.Stat(ctx, vfsfs, opts)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	statx.Blksize = 1024
+	statx.RdevMajor = linux.UNIX98_PTY_REPLICA_MAJOR
+	statx.RdevMinor = ri.t.n
+	return statx, nil
+}
+
+// SetStat implements kernfs.Inode.SetStat
+func (ri *replicaInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
+		return syserror.EINVAL
+	}
+	return ri.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
+}
+
+// +stateify savable
+type replicaFileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.LockFD
+
+	inode *replicaInode
+}
+
+var _ vfs.FileDescriptionImpl = (*replicaFileDescription)(nil)
+
+// Release implements fs.FileOperations.Release.
+func (rfd *replicaFileDescription) Release(ctx context.Context) {}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (rfd *replicaFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	rfd.inode.t.ld.replicaWaiter.EventRegister(e, mask)
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (rfd *replicaFileDescription) EventUnregister(e *waiter.Entry) {
+	rfd.inode.t.ld.replicaWaiter.EventUnregister(e)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (rfd *replicaFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
+	return rfd.inode.t.ld.replicaReadiness()
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (rfd *replicaFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
+	return rfd.inode.t.ld.inputQueueRead(ctx, dst)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (rfd *replicaFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
+	return rfd.inode.t.ld.outputQueueWrite(ctx, src)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (rfd *replicaFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		// ioctl(2) may only be called from a task goroutine.
+		return 0, syserror.ENOTTY
+	}
+
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
+		// Get the number of bytes in the input queue read buffer.
+		return 0, rfd.inode.t.ld.inputQueueReadSize(t, io, args)
+	case linux.TCGETS:
+		return rfd.inode.t.ld.getTermios(t, args)
+	case linux.TCSETS:
+		return rfd.inode.t.ld.setTermios(t, args)
+	case linux.TCSETSW:
+		// TODO(b/29356795): This should drain the output queue first.
+		return rfd.inode.t.ld.setTermios(t, args)
+	case linux.TIOCGPTN:
+		nP := primitive.Uint32(rfd.inode.t.n)
+		_, err := nP.CopyOut(t, args[2].Pointer())
+		return 0, err
+	case linux.TIOCGWINSZ:
+		return 0, rfd.inode.t.ld.windowSize(t, args)
+	case linux.TIOCSWINSZ:
+		return 0, rfd.inode.t.ld.setWindowSize(t, args)
+	case linux.TIOCSCTTY:
+		// Make the given terminal the controlling terminal of the
+		// calling process.
+		return 0, rfd.inode.t.setControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCNOTTY:
+		// Release this process's controlling terminal.
+		return 0, rfd.inode.t.releaseControllingTTY(ctx, args, false /* isMaster */)
+	case linux.TIOCGPGRP:
+		// Get the foreground process group.
+		return rfd.inode.t.foregroundProcessGroup(ctx, args, false /* isMaster */)
+	case linux.TIOCSPGRP:
+		// Set the foreground process group.
+		return rfd.inode.t.setForegroundProcessGroup(ctx, args, false /* isMaster */)
+	default:
+		maybeEmitUnimplementedEvent(ctx, cmd)
+		return 0, syserror.ENOTTY
+	}
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (rfd *replicaFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return rfd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (rfd *replicaFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := rfd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return rfd.inode.Stat(ctx, fs, opts)
+}
+
+// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
+func (rfd *replicaFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
+	return rfd.Locks().LockPOSIX(ctx, &rfd.vfsfd, uid, t, start, length, whence, block)
+}
+
+// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
+func (rfd *replicaFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
+	return rfd.Locks().UnlockPOSIX(ctx, &rfd.vfsfd, uid, start, length, whence)
+}
diff --git a/pkg/sentry/fsimpl/devpts/slave.go b/pkg/sentry/fsimpl/devpts/slave.go
deleted file mode 100644
index 32e4e1908..000000000
--- a/pkg/sentry/fsimpl/devpts/slave.go
+++ /dev/null
@@ -1,197 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package devpts
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/arch"
-	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
-	"gvisor.dev/gvisor/pkg/waiter"
-)
-
-// slaveInode is the inode for the slave end of the Terminal.
-type slaveInode struct {
-	kernfs.InodeAttrs
-	kernfs.InodeNoopRefCount
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
-
-	locks vfs.FileLocks
-
-	// Keep a reference to this inode's dentry.
-	dentry kernfs.Dentry
-
-	// root is the devpts root inode.
-	root *rootInode
-
-	// t is the connected Terminal.
-	t *Terminal
-}
-
-var _ kernfs.Inode = (*slaveInode)(nil)
-
-// Open implements kernfs.Inode.Open.
-func (si *slaveInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	si.IncRef()
-	fd := &slaveFileDescription{
-		inode: si,
-	}
-	fd.LockFD.Init(&si.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-		si.DecRef(ctx)
-		return nil, err
-	}
-	return &fd.vfsfd, nil
-
-}
-
-// Valid implements kernfs.Inode.Valid.
-func (si *slaveInode) Valid(context.Context) bool {
-	// Return valid if the slave still exists.
-	si.root.mu.Lock()
-	defer si.root.mu.Unlock()
-	_, ok := si.root.slaves[si.t.n]
-	return ok
-}
-
-// Stat implements kernfs.Inode.Stat.
-func (si *slaveInode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	statx, err := si.InodeAttrs.Stat(ctx, vfsfs, opts)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	statx.Blksize = 1024
-	statx.RdevMajor = linux.UNIX98_PTY_SLAVE_MAJOR
-	statx.RdevMinor = si.t.n
-	return statx, nil
-}
-
-// SetStat implements kernfs.Inode.SetStat
-func (si *slaveInode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
-	if opts.Stat.Mask&linux.STATX_SIZE != 0 {
-		return syserror.EINVAL
-	}
-	return si.InodeAttrs.SetStat(ctx, vfsfs, creds, opts)
-}
-
-type slaveFileDescription struct {
-	vfsfd vfs.FileDescription
-	vfs.FileDescriptionDefaultImpl
-	vfs.LockFD
-
-	inode *slaveInode
-}
-
-var _ vfs.FileDescriptionImpl = (*slaveFileDescription)(nil)
-
-// Release implements fs.FileOperations.Release.
-func (sfd *slaveFileDescription) Release(ctx context.Context) {
-	sfd.inode.DecRef(ctx)
-}
-
-// EventRegister implements waiter.Waitable.EventRegister.
-func (sfd *slaveFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
-	sfd.inode.t.ld.slaveWaiter.EventRegister(e, mask)
-}
-
-// EventUnregister implements waiter.Waitable.EventUnregister.
-func (sfd *slaveFileDescription) EventUnregister(e *waiter.Entry) {
-	sfd.inode.t.ld.slaveWaiter.EventUnregister(e)
-}
-
-// Readiness implements waiter.Waitable.Readiness.
-func (sfd *slaveFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
-	return sfd.inode.t.ld.slaveReadiness()
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (sfd *slaveFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
-	return sfd.inode.t.ld.inputQueueRead(ctx, dst)
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (sfd *slaveFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
-	return sfd.inode.t.ld.outputQueueWrite(ctx, src)
-}
-
-// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
-func (sfd *slaveFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
-	switch cmd := args[1].Uint(); cmd {
-	case linux.FIONREAD: // linux.FIONREAD == linux.TIOCINQ
-		// Get the number of bytes in the input queue read buffer.
-		return 0, sfd.inode.t.ld.inputQueueReadSize(ctx, io, args)
-	case linux.TCGETS:
-		return sfd.inode.t.ld.getTermios(ctx, io, args)
-	case linux.TCSETS:
-		return sfd.inode.t.ld.setTermios(ctx, io, args)
-	case linux.TCSETSW:
-		// TODO(b/29356795): This should drain the output queue first.
-		return sfd.inode.t.ld.setTermios(ctx, io, args)
-	case linux.TIOCGPTN:
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), uint32(sfd.inode.t.n), usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
-		return 0, err
-	case linux.TIOCGWINSZ:
-		return 0, sfd.inode.t.ld.windowSize(ctx, io, args)
-	case linux.TIOCSWINSZ:
-		return 0, sfd.inode.t.ld.setWindowSize(ctx, io, args)
-	case linux.TIOCSCTTY:
-		// Make the given terminal the controlling terminal of the
-		// calling process.
-		return 0, sfd.inode.t.setControllingTTY(ctx, io, args, false /* isMaster */)
-	case linux.TIOCNOTTY:
-		// Release this process's controlling terminal.
-		return 0, sfd.inode.t.releaseControllingTTY(ctx, io, args, false /* isMaster */)
-	case linux.TIOCGPGRP:
-		// Get the foreground process group.
-		return sfd.inode.t.foregroundProcessGroup(ctx, io, args, false /* isMaster */)
-	case linux.TIOCSPGRP:
-		// Set the foreground process group.
-		return sfd.inode.t.setForegroundProcessGroup(ctx, io, args, false /* isMaster */)
-	default:
-		maybeEmitUnimplementedEvent(ctx, cmd)
-		return 0, syserror.ENOTTY
-	}
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (sfd *slaveFileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	creds := auth.CredentialsFromContext(ctx)
-	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.SetStat(ctx, fs, creds, opts)
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (sfd *slaveFileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	fs := sfd.vfsfd.VirtualDentry().Mount().Filesystem()
-	return sfd.inode.Stat(ctx, fs, opts)
-}
-
-// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
-func (sfd *slaveFileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
-	return sfd.Locks().LockPOSIX(ctx, &sfd.vfsfd, uid, t, start, length, whence, block)
-}
-
-// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
-func (sfd *slaveFileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
-	return sfd.Locks().UnlockPOSIX(ctx, &sfd.vfsfd, uid, start, length, whence)
-}
diff --git a/pkg/sentry/fsimpl/devpts/terminal.go b/pkg/sentry/fsimpl/devpts/terminal.go
index 7d2781c54..510bd6d89 100644
--- a/pkg/sentry/fsimpl/devpts/terminal.go
+++ b/pkg/sentry/fsimpl/devpts/terminal.go
@@ -17,9 +17,9 @@ package devpts
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Terminal is a pseudoterminal.
@@ -36,25 +36,25 @@ type Terminal struct {
 	// this terminal. This field is immutable.
 	masterKTTY *kernel.TTY
 
-	// slaveKTTY contains the controlling process of the slave end of this
+	// replicaKTTY contains the controlling process of the replica end of this
 	// terminal. This field is immutable.
-	slaveKTTY *kernel.TTY
+	replicaKTTY *kernel.TTY
 }
 
 func newTerminal(n uint32) *Terminal {
-	termios := linux.DefaultSlaveTermios
+	termios := linux.DefaultReplicaTermios
 	t := Terminal{
-		n:          n,
-		ld:         newLineDiscipline(termios),
-		masterKTTY: &kernel.TTY{Index: n},
-		slaveKTTY:  &kernel.TTY{Index: n},
+		n:           n,
+		ld:          newLineDiscipline(termios),
+		masterKTTY:  &kernel.TTY{Index: n},
+		replicaKTTY: &kernel.TTY{Index: n},
 	}
 	return &t
 }
 
 // setControllingTTY makes tm the controlling terminal of the calling thread
 // group.
-func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) setControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setControllingTTY must be called from a task context")
@@ -65,7 +65,7 @@ func (tm *Terminal) setControllingTTY(ctx context.Context, io usermem.IO, args a
 
 // releaseControllingTTY removes tm as the controlling terminal of the calling
 // thread group.
-func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) error {
+func (tm *Terminal) releaseControllingTTY(ctx context.Context, args arch.SyscallArguments, isMaster bool) error {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("releaseControllingTTY must be called from a task context")
@@ -75,7 +75,7 @@ func (tm *Terminal) releaseControllingTTY(ctx context.Context, io usermem.IO, ar
 }
 
 // foregroundProcessGroup gets the process group ID of tm's foreground process.
-func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) foregroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("foregroundProcessGroup must be called from a task context")
@@ -87,24 +87,21 @@ func (tm *Terminal) foregroundProcessGroup(ctx context.Context, io usermem.IO, a
 	}
 
 	// Write it out to *arg.
-	_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), int32(ret), usermem.IOOpts{
-		AddressSpaceActive: true,
-	})
+	retP := primitive.Int32(ret)
+	_, err = retP.CopyOut(task, args[2].Pointer())
 	return 0, err
 }
 
 // foregroundProcessGroup sets tm's foreground process.
-func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, io usermem.IO, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
+func (tm *Terminal) setForegroundProcessGroup(ctx context.Context, args arch.SyscallArguments, isMaster bool) (uintptr, error) {
 	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		panic("setForegroundProcessGroup must be called from a task context")
 	}
 
 	// Read in the process group ID.
-	var pgid int32
-	if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgid, usermem.IOOpts{
-		AddressSpaceActive: true,
-	}); err != nil {
+	var pgid primitive.Int32
+	if _, err := pgid.CopyIn(task, args[2].Pointer()); err != nil {
 		return 0, err
 	}
 
@@ -116,5 +113,5 @@ func (tm *Terminal) tty(isMaster bool) *kernel.TTY {
 	if isMaster {
 		return tm.masterKTTY
 	}
-	return tm.slaveKTTY
+	return tm.replicaKTTY
 }
diff --git a/pkg/sentry/fsimpl/devtmpfs/BUILD b/pkg/sentry/fsimpl/devtmpfs/BUILD
index aa0c2ad8c..e49a04c1b 100644
--- a/pkg/sentry/fsimpl/devtmpfs/BUILD
+++ b/pkg/sentry/fsimpl/devtmpfs/BUILD
@@ -4,7 +4,10 @@ licenses(["notice"])
 
 go_library(
     name = "devtmpfs",
-    srcs = ["devtmpfs.go"],
+    srcs = [
+        "devtmpfs.go",
+        "save_restore.go",
+    ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
@@ -24,6 +27,7 @@ go_test(
     library = ":devtmpfs",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/context",
         "//pkg/fspath",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/tmpfs",
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
index 2ed5fa8a9..e6fe0fc0d 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs.go
@@ -18,6 +18,7 @@ package devtmpfs
 
 import (
 	"fmt"
+	"path"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -32,8 +33,10 @@ import (
 const Name = "devtmpfs"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct {
-	initOnce sync.Once
+	initOnce sync.Once `state:"nosave"` // FIXME(gvisor.dev/issue/1664): not yet supported.
 	initErr  error
 
 	// fs is the tmpfs filesystem that backs all mounts of this FilesystemType.
@@ -68,6 +71,15 @@ func (fst *FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virtua
 	return fst.fs, fst.root, nil
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (fst *FilesystemType) Release(ctx context.Context) {
+	if fst.fs != nil {
+		// Release the original reference obtained when creating the filesystem.
+		fst.root.DecRef(ctx)
+		fst.fs.DecRef(ctx)
+	}
+}
+
 // Accessor allows devices to create device special files in devtmpfs.
 type Accessor struct {
 	vfsObj *vfs.VirtualFilesystem
@@ -79,14 +91,17 @@ type Accessor struct {
 // NewAccessor returns an Accessor that supports creation of device special
 // files in the devtmpfs instance registered with name fsTypeName in vfsObj.
 func NewAccessor(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, fsTypeName string) (*Accessor, error) {
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "devtmpfs" /* source */, fsTypeName, &vfs.MountOptions{})
 	if err != nil {
 		return nil, err
 	}
+	// Pass a reference on root to the Accessor.
+	root := mntns.Root()
+	root.IncRef()
 	return &Accessor{
 		vfsObj: vfsObj,
 		mntns:  mntns,
-		root:   mntns.Root(),
+		root:   root,
 		creds:  creds,
 	}, nil
 }
@@ -150,13 +165,11 @@ func (a *Accessor) CreateDeviceFile(ctx context.Context, pathname string, kind v
 
 	// Create any parent directories. See
 	// devtmpfs.c:handle_create()=>path_create().
-	for it := fspath.Parse(pathname).Begin; it.NextOk(); it = it.Next() {
-		pop := a.pathOperationAt(it.String())
-		if err := a.vfsObj.MkdirAt(actx, a.creds, pop, &vfs.MkdirOptions{
-			Mode: 0755,
-		}); err != nil {
-			return fmt.Errorf("failed to create directory %q: %v", it.String(), err)
-		}
+	parent := path.Dir(pathname)
+	if err := a.vfsObj.MkdirAllAt(ctx, parent, a.root, a.creds, &vfs.MkdirOptions{
+		Mode: 0755,
+	}); err != nil {
+		return fmt.Errorf("failed to create device parent directory %q: %v", parent, err)
 	}
 
 	// NOTE: Linux's devtmpfs refuses to automatically delete files it didn't
diff --git a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
index 747867cca..e058eda7a 100644
--- a/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
+++ b/pkg/sentry/fsimpl/devtmpfs/devtmpfs_test.go
@@ -15,9 +15,11 @@
 package devtmpfs
 
 import (
+	"path"
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
@@ -25,10 +27,13 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
-func TestDevtmpfs(t *testing.T) {
+const devPath = "/dev"
+
+func setupDevtmpfs(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry, func()) {
+	t.Helper()
+
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
-
 	vfsObj := &vfs.VirtualFilesystem{}
 	if err := vfsObj.Init(ctx); err != nil {
 		t.Fatalf("VFS init: %v", err)
@@ -43,14 +48,12 @@ func TestDevtmpfs(t *testing.T) {
 	})
 
 	// Create a test mount namespace with devtmpfs mounted at "/dev".
-	const devPath = "/dev"
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "tmpfs" /* source */, "tmpfs" /* fsTypeName */, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
-	defer mntns.DecRef(ctx)
 	root := mntns.Root()
-	defer root.DecRef(ctx)
+	root.IncRef()
 	devpop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
@@ -61,10 +64,20 @@ func TestDevtmpfs(t *testing.T) {
 	}); err != nil {
 		t.Fatalf("failed to create mount point: %v", err)
 	}
-	if err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
+	if _, err := vfsObj.MountAt(ctx, creds, "devtmpfs" /* source */, &devpop, "devtmpfs" /* fsTypeName */, &vfs.MountOptions{}); err != nil {
 		t.Fatalf("failed to mount devtmpfs: %v", err)
 	}
 
+	return ctx, creds, vfsObj, root, func() {
+		root.DecRef(ctx)
+		mntns.DecRef(ctx)
+	}
+}
+
+func TestUserspaceInit(t *testing.T) {
+	ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+	defer cleanup()
+
 	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
 	if err != nil {
 		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
@@ -75,48 +88,143 @@ func TestDevtmpfs(t *testing.T) {
 	if err := a.UserspaceInit(ctx); err != nil {
 		t.Fatalf("failed to userspace-initialize devtmpfs: %v", err)
 	}
+
 	// Created files should be visible in the test mount namespace.
-	abspath := devPath + "/fd"
-	target, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(abspath),
-	})
-	if want := "/proc/self/fd"; err != nil || target != want {
-		t.Fatalf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, target, err, want)
+	links := []struct {
+		source string
+		target string
+	}{
+		{
+			source: "fd",
+			target: "/proc/self/fd",
+		},
+		{
+			source: "stdin",
+			target: "/proc/self/fd/0",
+		},
+		{
+			source: "stdout",
+			target: "/proc/self/fd/1",
+		},
+		{
+			source: "stderr",
+			target: "/proc/self/fd/2",
+		},
+		{
+			source: "ptmx",
+			target: "pts/ptmx",
+		},
 	}
 
-	// Create a dummy device special file using a devtmpfs.Accessor.
-	const (
-		pathInDev = "dummy"
-		kind      = vfs.CharDevice
-		major     = 12
-		minor     = 34
-		perms     = 0600
-		wantMode  = linux.S_IFCHR | perms
-	)
-	if err := a.CreateDeviceFile(ctx, pathInDev, kind, major, minor, perms); err != nil {
-		t.Fatalf("failed to create device file: %v", err)
+	for _, link := range links {
+		abspath := path.Join(devPath, link.source)
+		if gotTarget, err := vfsObj.ReadlinkAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}); err != nil || gotTarget != link.target {
+			t.Errorf("readlink(%q): got (%q, %v), wanted (%q, nil)", abspath, gotTarget, err, link.target)
+		}
 	}
-	// The device special file should be visible in the test mount namespace.
-	abspath = devPath + "/" + pathInDev
-	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(abspath),
-	}, &vfs.StatOptions{
-		Mask: linux.STATX_TYPE | linux.STATX_MODE,
-	})
-	if err != nil {
-		t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+
+	dirs := []string{"shm", "pts"}
+	for _, dir := range dirs {
+		abspath := path.Join(devPath, dir)
+		statx, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_MODE,
+		})
+		if err != nil {
+			t.Errorf("stat(%q): got error %v ", abspath, err)
+			continue
+		}
+		if want := uint16(0755) | linux.S_IFDIR; statx.Mode != want {
+			t.Errorf("stat(%q): got mode %x, want %x", abspath, statx.Mode, want)
+		}
 	}
-	if stat.Mode != wantMode {
-		t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+}
+
+func TestCreateDeviceFile(t *testing.T) {
+	ctx, creds, vfsObj, root, cleanup := setupDevtmpfs(t)
+	defer cleanup()
+
+	a, err := NewAccessor(ctx, vfsObj, creds, "devtmpfs")
+	if err != nil {
+		t.Fatalf("failed to create devtmpfs.Accessor: %v", err)
 	}
-	if stat.RdevMajor != major {
-		t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, major)
+	defer a.Release(ctx)
+
+	devFiles := []struct {
+		path  string
+		kind  vfs.DeviceKind
+		major uint32
+		minor uint32
+		perms uint16
+	}{
+		{
+			path:  "dummy",
+			kind:  vfs.CharDevice,
+			major: 12,
+			minor: 34,
+			perms: 0600,
+		},
+		{
+			path:  "foo/bar",
+			kind:  vfs.BlockDevice,
+			major: 13,
+			minor: 35,
+			perms: 0660,
+		},
+		{
+			path:  "foo/baz",
+			kind:  vfs.CharDevice,
+			major: 12,
+			minor: 40,
+			perms: 0666,
+		},
+		{
+			path:  "a/b/c/d/e",
+			kind:  vfs.BlockDevice,
+			major: 12,
+			minor: 34,
+			perms: 0600,
+		},
 	}
-	if stat.RdevMinor != minor {
-		t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, minor)
+
+	for _, f := range devFiles {
+		if err := a.CreateDeviceFile(ctx, f.path, f.kind, f.major, f.minor, f.perms); err != nil {
+			t.Fatalf("failed to create device file: %v", err)
+		}
+		// The device special file should be visible in the test mount namespace.
+		abspath := path.Join(devPath, f.path)
+		stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+			Root:  root,
+			Start: root,
+			Path:  fspath.Parse(abspath),
+		}, &vfs.StatOptions{
+			Mask: linux.STATX_TYPE | linux.STATX_MODE,
+		})
+		if err != nil {
+			t.Fatalf("failed to stat device file at %q: %v", abspath, err)
+		}
+		if stat.RdevMajor != f.major {
+			t.Errorf("major device number: got %v, wanted %v", stat.RdevMajor, f.major)
+		}
+		if stat.RdevMinor != f.minor {
+			t.Errorf("minor device number: got %v, wanted %v", stat.RdevMinor, f.minor)
+		}
+		wantMode := f.perms
+		switch f.kind {
+		case vfs.CharDevice:
+			wantMode |= linux.S_IFCHR
+		case vfs.BlockDevice:
+			wantMode |= linux.S_IFBLK
+		}
+		if stat.Mode != wantMode {
+			t.Errorf("device file mode: got %v, wanted %v", stat.Mode, wantMode)
+		}
 	}
 }
diff --git a/pkg/sentry/fsimpl/devtmpfs/save_restore.go b/pkg/sentry/fsimpl/devtmpfs/save_restore.go
new file mode 100644
index 000000000..28832d850
--- /dev/null
+++ b/pkg/sentry/fsimpl/devtmpfs/save_restore.go
@@ -0,0 +1,23 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package devtmpfs
+
+// afterLoad is invoked by stateify.
+func (fst *FilesystemType) afterLoad() {
+	if fst.fs != nil {
+		// Ensure that we don't create another filesystem.
+		fst.initOnce.Do(func() {})
+	}
+}
diff --git a/pkg/sentry/fsimpl/eventfd/eventfd.go b/pkg/sentry/fsimpl/eventfd/eventfd.go
index 812171fa3..5b29f2358 100644
--- a/pkg/sentry/fsimpl/eventfd/eventfd.go
+++ b/pkg/sentry/fsimpl/eventfd/eventfd.go
@@ -30,9 +30,11 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// EventFileDescription implements FileDescriptionImpl for file-based event
+// EventFileDescription implements vfs.FileDescriptionImpl for file-based event
 // notification (eventfd). Eventfds are usually internal to the Sentry but in
 // certain situations they may be converted into a host-backed eventfd.
+//
+// +stateify savable
 type EventFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -41,7 +43,7 @@ type EventFileDescription struct {
 
 	// queue is used to notify interested parties when the event object
 	// becomes readable or writable.
-	queue waiter.Queue `state:"zerovalue"`
+	queue waiter.Queue
 
 	// mu protects the fields below.
 	mu sync.Mutex `state:"nosave"`
@@ -106,7 +108,7 @@ func (efd *EventFileDescription) HostFD() (int, error) {
 	return efd.hostfd, nil
 }
 
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
 func (efd *EventFileDescription) Release(context.Context) {
 	efd.mu.Lock()
 	defer efd.mu.Unlock()
@@ -119,7 +121,7 @@ func (efd *EventFileDescription) Release(context.Context) {
 	}
 }
 
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
 	if dst.NumBytes() < 8 {
 		return 0, syscall.EINVAL
@@ -130,7 +132,7 @@ func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequenc
 	return 8, nil
 }
 
-// Write implements FileDescriptionImpl.Write.
+// Write implements vfs.FileDescriptionImpl.Write.
 func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ vfs.WriteOptions) (int64, error) {
 	if src.NumBytes() < 8 {
 		return 0, syscall.EINVAL
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index abc610ef3..7b1eec3da 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -51,6 +51,8 @@ go_library(
         "//pkg/fd",
         "//pkg/fspath",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/marshal/primitive",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
@@ -86,9 +88,9 @@ go_test(
     library = ":ext",
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/marshal/primitive",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/ext/disklayout",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 8f7d5a9bb..2ee7cc7ac 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -59,13 +59,18 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
 	}
 
 	root := mntns.Root()
+	root.IncRef()
 
 	tearDown := func() {
 		root.DecRef(ctx)
@@ -90,7 +95,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+	if _, err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
 		GetFilesystemOptions: vfs.GetFilesystemOptions{
 			InternalData: int(f.Fd()),
 		},
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index 8bb104ff0..1165234f9 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -18,7 +18,7 @@ import (
 	"io"
 	"math"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
@@ -34,19 +34,19 @@ type blockMapFile struct {
 
 	// directBlks are the direct blocks numbers. The physical blocks pointed by
 	// these holds file data. Contains file blocks 0 to 11.
-	directBlks [numDirectBlks]uint32
+	directBlks [numDirectBlks]primitive.Uint32
 
 	// indirectBlk is the physical block which contains (blkSize/4) direct block
 	// numbers (as uint32 integers).
-	indirectBlk uint32
+	indirectBlk primitive.Uint32
 
 	// doubleIndirectBlk is the physical block which contains (blkSize/4) indirect
 	// block numbers (as uint32 integers).
-	doubleIndirectBlk uint32
+	doubleIndirectBlk primitive.Uint32
 
 	// tripleIndirectBlk is the physical block which contains (blkSize/4) doubly
 	// indirect block numbers (as uint32 integers).
-	tripleIndirectBlk uint32
+	tripleIndirectBlk primitive.Uint32
 
 	// coverage at (i)th index indicates the amount of file data a node at
 	// height (i) covers. Height 0 is the direct block.
@@ -68,10 +68,12 @@ func newBlockMapFile(args inodeArgs) (*blockMapFile, error) {
 	}
 
 	blkMap := file.regFile.inode.diskInode.Data()
-	binary.Unmarshal(blkMap[:numDirectBlks*4], binary.LittleEndian, &file.directBlks)
-	binary.Unmarshal(blkMap[numDirectBlks*4:(numDirectBlks+1)*4], binary.LittleEndian, &file.indirectBlk)
-	binary.Unmarshal(blkMap[(numDirectBlks+1)*4:(numDirectBlks+2)*4], binary.LittleEndian, &file.doubleIndirectBlk)
-	binary.Unmarshal(blkMap[(numDirectBlks+2)*4:(numDirectBlks+3)*4], binary.LittleEndian, &file.tripleIndirectBlk)
+	for i := 0; i < numDirectBlks; i++ {
+		file.directBlks[i].UnmarshalBytes(blkMap[i*4 : (i+1)*4])
+	}
+	file.indirectBlk.UnmarshalBytes(blkMap[numDirectBlks*4 : (numDirectBlks+1)*4])
+	file.doubleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+1)*4 : (numDirectBlks+2)*4])
+	file.tripleIndirectBlk.UnmarshalBytes(blkMap[(numDirectBlks+2)*4 : (numDirectBlks+3)*4])
 	return file, nil
 }
 
@@ -117,16 +119,16 @@ func (f *blockMapFile) ReadAt(dst []byte, off int64) (int, error) {
 		switch {
 		case offset < dirBlksEnd:
 			// Direct block.
-			curR, err = f.read(f.directBlks[offset/f.regFile.inode.blkSize], offset%f.regFile.inode.blkSize, 0, dst[read:])
+			curR, err = f.read(uint32(f.directBlks[offset/f.regFile.inode.blkSize]), offset%f.regFile.inode.blkSize, 0, dst[read:])
 		case offset < indirBlkEnd:
 			// Indirect block.
-			curR, err = f.read(f.indirectBlk, offset-dirBlksEnd, 1, dst[read:])
+			curR, err = f.read(uint32(f.indirectBlk), offset-dirBlksEnd, 1, dst[read:])
 		case offset < doubIndirBlkEnd:
 			// Doubly indirect block.
-			curR, err = f.read(f.doubleIndirectBlk, offset-indirBlkEnd, 2, dst[read:])
+			curR, err = f.read(uint32(f.doubleIndirectBlk), offset-indirBlkEnd, 2, dst[read:])
 		default:
 			// Triply indirect block.
-			curR, err = f.read(f.tripleIndirectBlk, offset-doubIndirBlkEnd, 3, dst[read:])
+			curR, err = f.read(uint32(f.tripleIndirectBlk), offset-doubIndirBlkEnd, 3, dst[read:])
 		}
 
 		read += curR
@@ -174,13 +176,13 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 	read := 0
 	curChildOff := relFileOff % childCov
 	for i := startIdx; i < endIdx; i++ {
-		var childPhyBlk uint32
+		var childPhyBlk primitive.Uint32
 		err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
 		if err != nil {
 			return read, err
 		}
 
-		n, err := f.read(childPhyBlk, curChildOff, height-1, dst[read:])
+		n, err := f.read(uint32(childPhyBlk), curChildOff, height-1, dst[read:])
 		read += n
 		if err != nil {
 			return read, err
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 6fa84e7aa..ed98b482e 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -20,7 +20,7 @@ import (
 	"testing"
 
 	"github.com/google/go-cmp/cmp"
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 )
 
@@ -87,29 +87,33 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	mockDisk := make([]byte, mockBMDiskSize)
 	var fileData []byte
 	blkNums := newBlkNumGen()
-	var data []byte
+	off := 0
+	data := make([]byte, (numDirectBlks+3)*(*primitive.Uint32)(nil).SizeBytes())
 
 	// Write the direct blocks.
 	for i := 0; i < numDirectBlks; i++ {
-		curBlkNum := blkNums.next()
-		data = binary.Marshal(data, binary.LittleEndian, curBlkNum)
-		fileData = append(fileData, writeFileDataToBlock(mockDisk, curBlkNum, 0, blkNums)...)
+		curBlkNum := primitive.Uint32(blkNums.next())
+		curBlkNum.MarshalBytes(data[off:])
+		off += curBlkNum.SizeBytes()
+		fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(curBlkNum), 0, blkNums)...)
 	}
 
 	// Write to indirect block.
-	indirectBlk := blkNums.next()
-	data = binary.Marshal(data, binary.LittleEndian, indirectBlk)
-	fileData = append(fileData, writeFileDataToBlock(mockDisk, indirectBlk, 1, blkNums)...)
-
-	// Write to indirect block.
-	doublyIndirectBlk := blkNums.next()
-	data = binary.Marshal(data, binary.LittleEndian, doublyIndirectBlk)
-	fileData = append(fileData, writeFileDataToBlock(mockDisk, doublyIndirectBlk, 2, blkNums)...)
-
-	// Write to indirect block.
-	triplyIndirectBlk := blkNums.next()
-	data = binary.Marshal(data, binary.LittleEndian, triplyIndirectBlk)
-	fileData = append(fileData, writeFileDataToBlock(mockDisk, triplyIndirectBlk, 3, blkNums)...)
+	indirectBlk := primitive.Uint32(blkNums.next())
+	indirectBlk.MarshalBytes(data[off:])
+	off += indirectBlk.SizeBytes()
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(indirectBlk), 1, blkNums)...)
+
+	// Write to double indirect block.
+	doublyIndirectBlk := primitive.Uint32(blkNums.next())
+	doublyIndirectBlk.MarshalBytes(data[off:])
+	off += doublyIndirectBlk.SizeBytes()
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(doublyIndirectBlk), 2, blkNums)...)
+
+	// Write to triple indirect block.
+	triplyIndirectBlk := primitive.Uint32(blkNums.next())
+	triplyIndirectBlk.MarshalBytes(data[off:])
+	fileData = append(fileData, writeFileDataToBlock(mockDisk, uint32(triplyIndirectBlk), 3, blkNums)...)
 
 	args := inodeArgs{
 		fs: &filesystem{
@@ -142,9 +146,9 @@ func writeFileDataToBlock(disk []byte, blkNum uint32, height uint, blkNums *blkN
 
 	var fileData []byte
 	for off := blkNum * mockBMBlkSize; off < (blkNum+1)*mockBMBlkSize; off += 4 {
-		curBlkNum := blkNums.next()
-		copy(disk[off:off+4], binary.Marshal(nil, binary.LittleEndian, curBlkNum))
-		fileData = append(fileData, writeFileDataToBlock(disk, curBlkNum, height-1, blkNums)...)
+		curBlkNum := primitive.Uint32(blkNums.next())
+		curBlkNum.MarshalBytes(disk[off : off+4])
+		fileData = append(fileData, writeFileDataToBlock(disk, uint32(curBlkNum), height-1, blkNums)...)
 	}
 	return fileData
 }
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 7a1b4219f..9bfed883a 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -20,6 +20,8 @@ import (
 )
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
diff --git a/pkg/sentry/fsimpl/ext/directory.go b/pkg/sentry/fsimpl/ext/directory.go
index 0fc01668d..0ad79b381 100644
--- a/pkg/sentry/fsimpl/ext/directory.go
+++ b/pkg/sentry/fsimpl/ext/directory.go
@@ -16,7 +16,6 @@ package ext
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -28,6 +27,8 @@ import (
 )
 
 // directory represents a directory inode. It holds the childList in memory.
+//
+// +stateify savable
 type directory struct {
 	inode inode
 
@@ -39,7 +40,7 @@ type directory struct {
 	// Lock Order (outermost locks must be taken first):
 	//   directory.mu
 	//     filesystem.mu
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// childList is a list containing (1) child dirents and (2) fake dirents
 	// (with diskDirent == nil) that represent the iteration position of
@@ -98,7 +99,7 @@ func newDirectory(args inodeArgs, newDirent bool) (*directory, error) {
 		} else {
 			curDirent.diskDirent = &disklayout.DirentOld{}
 		}
-		binary.Unmarshal(buf, binary.LittleEndian, curDirent.diskDirent)
+		curDirent.diskDirent.UnmarshalBytes(buf)
 
 		if curDirent.diskDirent.Inode() != 0 && len(curDirent.diskDirent.FileName()) != 0 {
 			// Inode number and name length fields being set to 0 is used to indicate
@@ -120,6 +121,8 @@ func (i *inode) isDir() bool {
 }
 
 // dirent is the directory.childList node.
+//
+// +stateify savable
 type dirent struct {
 	diskDirent disklayout.Dirent
 
@@ -129,6 +132,8 @@ type dirent struct {
 
 // directoryFD represents a directory file description. It implements
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/ext/disklayout/BUILD b/pkg/sentry/fsimpl/ext/disklayout/BUILD
index 9bd9c76c0..d98a05dd8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/BUILD
+++ b/pkg/sentry/fsimpl/ext/disklayout/BUILD
@@ -22,10 +22,11 @@ go_library(
         "superblock_old.go",
         "test_utils.go",
     ],
+    marshal = True,
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
+        "//pkg/marshal",
         "//pkg/sentry/fs",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/kernel/time",
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group.go b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
index ad6f4fef8..0d56ae9da 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group.go
@@ -14,6 +14,10 @@
 
 package disklayout
 
+import (
+	"gvisor.dev/gvisor/pkg/marshal"
+)
+
 // BlockGroup represents a Linux ext block group descriptor. An ext file system
 // is split into a series of block groups. This provides an access layer to
 // information needed to access and use a block group.
@@ -30,6 +34,8 @@ package disklayout
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#block-group-descriptors.
 type BlockGroup interface {
+	marshal.Marshallable
+
 	// InodeTable returns the absolute block number of the block containing the
 	// inode table. This points to an array of Inode structs. Inode tables are
 	// statically allocated at mkfs time. The superblock records the number of
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
index 3e16c76db..a35fa22a0 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_32.go
@@ -17,6 +17,8 @@ package disklayout
 // BlockGroup32Bit emulates the first half of struct ext4_group_desc in
 // fs/ext4/ext4.h. It is the block group descriptor struct for ext2, ext3 and
 // 32-bit ext4 filesystems. It implements BlockGroup interface.
+//
+// +marshal
 type BlockGroup32Bit struct {
 	BlockBitmapLo         uint32
 	InodeBitmapLo         uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
index 9a809197a..d54d1d345 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_64.go
@@ -18,6 +18,8 @@ package disklayout
 // It is the block group descriptor struct for 64-bit ext4 filesystems.
 // It implements BlockGroup interface. It is an extension of the 32-bit
 // version of BlockGroup.
+//
+// +marshal
 type BlockGroup64Bit struct {
 	// We embed the 32-bit struct here because 64-bit version is just an extension
 	// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
index 0ef4294c0..e4ce484e4 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/block_group_test.go
@@ -21,6 +21,8 @@ import (
 // TestBlockGroupSize tests that the block group descriptor structs are of the
 // correct size.
 func TestBlockGroupSize(t *testing.T) {
-	assertSize(t, BlockGroup32Bit{}, 32)
-	assertSize(t, BlockGroup64Bit{}, 64)
+	var bgSmall BlockGroup32Bit
+	assertSize(t, &bgSmall, 32)
+	var bgBig BlockGroup64Bit
+	assertSize(t, &bgBig, 64)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent.go b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
index 417b6cf65..568c8cb4c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent.go
@@ -15,6 +15,7 @@
 package disklayout
 
 import (
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
 )
 
@@ -51,6 +52,8 @@ var (
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#linear-classic-directories.
 type Dirent interface {
+	marshal.Marshallable
+
 	// Inode returns the absolute inode number of the underlying inode.
 	// Inode number 0 signifies an unused dirent.
 	Inode() uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
index 29ae4a5c2..51f9c2946 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_new.go
@@ -29,12 +29,14 @@ import (
 // Note: This struct can be of variable size on disk. The one described below
 // is of maximum size and the FileName beyond NameLength bytes might contain
 // garbage.
+//
+// +marshal
 type DirentNew struct {
 	InodeNumber  uint32
 	RecordLength uint16
 	NameLength   uint8
 	FileTypeRaw  uint8
-	FileNameRaw  [MaxFileName]byte
+	FileNameRaw  [MaxFileName]byte `marshal:"unaligned"`
 }
 
 // Compiles only if DirentNew implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
index 6fff12a6e..d4b19e086 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_old.go
@@ -22,11 +22,13 @@ import "gvisor.dev/gvisor/pkg/sentry/fs"
 // Note: This struct can be of variable size on disk. The one described below
 // is of maximum size and the FileName beyond NameLength bytes might contain
 // garbage.
+//
+// +marshal
 type DirentOld struct {
 	InodeNumber  uint32
 	RecordLength uint16
 	NameLength   uint16
-	FileNameRaw  [MaxFileName]byte
+	FileNameRaw  [MaxFileName]byte `marshal:"unaligned"`
 }
 
 // Compiles only if DirentOld implements Dirent.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
index 934919f8a..3486864dc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/dirent_test.go
@@ -21,6 +21,8 @@ import (
 // TestDirentSize tests that the dirent structs are of the correct
 // size.
 func TestDirentSize(t *testing.T) {
-	assertSize(t, DirentOld{}, uintptr(DirentSize))
-	assertSize(t, DirentNew{}, uintptr(DirentSize))
+	var dOld DirentOld
+	assertSize(t, &dOld, DirentSize)
+	var dNew DirentNew
+	assertSize(t, &dNew, DirentSize)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
index bdf4e2132..0834e9ba8 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/disklayout.go
@@ -36,8 +36,6 @@
 //     escape analysis on an unknown implementation at compile time.
 //
 // Notes:
-//   - All fields in these structs are exported because binary.Read would
-//     panic otherwise.
 //   - All structures on disk are in little-endian order. Only jbd2 (journal)
 //     structures are in big-endian order.
 //   - All OS dependent fields in these structures will be interpretted using
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent.go b/pkg/sentry/fsimpl/ext/disklayout/extent.go
index 4110649ab..b13999bfc 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent.go
@@ -14,6 +14,10 @@
 
 package disklayout
 
+import (
+	"gvisor.dev/gvisor/pkg/marshal"
+)
+
 // Extents were introduced in ext4 and provide huge performance gains in terms
 // data locality and reduced metadata block usage. Extents are organized in
 // extent trees. The root node is contained in inode.BlocksRaw.
@@ -64,6 +68,8 @@ type ExtentNode struct {
 // ExtentEntry represents an extent tree node entry. The entry can either be
 // an ExtentIdx or Extent itself. This exists to simplify navigation logic.
 type ExtentEntry interface {
+	marshal.Marshallable
+
 	// FileBlock returns the first file block number covered by this entry.
 	FileBlock() uint32
 
@@ -75,6 +81,8 @@ type ExtentEntry interface {
 // tree node begins with this and is followed by `NumEntries` number of:
 //   - Extent         if `Depth` == 0
 //   - ExtentIdx      otherwise
+//
+// +marshal
 type ExtentHeader struct {
 	// Magic in the extent magic number, must be 0xf30a.
 	Magic uint16
@@ -96,6 +104,8 @@ type ExtentHeader struct {
 // internal nodes. Sorted in ascending order based on FirstFileBlock since
 // Linux does a binary search on this. This points to a block containing the
 // child node.
+//
+// +marshal
 type ExtentIdx struct {
 	FirstFileBlock uint32
 	ChildBlockLo   uint32
@@ -121,6 +131,8 @@ func (ei *ExtentIdx) PhysicalBlock() uint64 {
 // nodes. Sorted in ascending order based on FirstFileBlock since Linux does a
 // binary search on this. This points to an array of data blocks containing the
 // file data. It covers `Length` data blocks starting from `StartBlock`.
+//
+// +marshal
 type Extent struct {
 	FirstFileBlock uint32
 	Length         uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
index 8762b90db..c96002e19 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/extent_test.go
@@ -21,7 +21,10 @@ import (
 // TestExtentSize tests that the extent structs are of the correct
 // size.
 func TestExtentSize(t *testing.T) {
-	assertSize(t, ExtentHeader{}, ExtentHeaderSize)
-	assertSize(t, ExtentIdx{}, ExtentEntrySize)
-	assertSize(t, Extent{}, ExtentEntrySize)
+	var h ExtentHeader
+	assertSize(t, &h, ExtentHeaderSize)
+	var i ExtentIdx
+	assertSize(t, &i, ExtentEntrySize)
+	var e Extent
+	assertSize(t, &e, ExtentEntrySize)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode.go b/pkg/sentry/fsimpl/ext/disklayout/inode.go
index 88ae913f5..ef25040a9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode.go
@@ -16,6 +16,7 @@ package disklayout
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
 )
@@ -38,6 +39,8 @@ const (
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#index-nodes.
 type Inode interface {
+	marshal.Marshallable
+
 	// Mode returns the linux file mode which is majorly used to extract
 	// information like:
 	// - File permissions (read/write/execute by user/group/others).
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
index 8f9f574ce..a4503f5cf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_new.go
@@ -27,6 +27,8 @@ import "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 // are used to provide nanoscond precision. Hence, these timestamps will now
 // overflow in May 2446.
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/dynamic.html#inode-timestamps.
+//
+// +marshal
 type InodeNew struct {
 	InodeOld
 
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
index db25b11b6..e6b28babf 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_old.go
@@ -30,6 +30,8 @@ const (
 //
 // All fields representing time are in seconds since the epoch. Which means that
 // they will overflow in January 2038.
+//
+// +marshal
 type InodeOld struct {
 	ModeRaw uint16
 	UIDLo   uint16
diff --git a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
index dd03ee50e..90744e956 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/inode_test.go
@@ -24,10 +24,12 @@ import (
 
 // TestInodeSize tests that the inode structs are of the correct size.
 func TestInodeSize(t *testing.T) {
-	assertSize(t, InodeOld{}, OldInodeSize)
+	var iOld InodeOld
+	assertSize(t, &iOld, OldInodeSize)
 
 	// This was updated from 156 bytes to 160 bytes in Oct 2015.
-	assertSize(t, InodeNew{}, 160)
+	var iNew InodeNew
+	assertSize(t, &iNew, 160)
 }
 
 // TestTimestampSeconds tests that the seconds part of [a/c/m] timestamps in
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock.go b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
index 8bb327006..70948ebe9 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock.go
@@ -14,6 +14,10 @@
 
 package disklayout
 
+import (
+	"gvisor.dev/gvisor/pkg/marshal"
+)
+
 const (
 	// SbOffset is the absolute offset at which the superblock is placed.
 	SbOffset = 1024
@@ -38,6 +42,8 @@ const (
 //
 // See https://www.kernel.org/doc/html/latest/filesystems/ext4/globals.html#super-block.
 type SuperBlock interface {
+	marshal.Marshallable
+
 	// InodesCount returns the total number of inodes in this filesystem.
 	InodesCount() uint32
 
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
index 53e515fd3..4dc6080fb 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_32.go
@@ -17,6 +17,8 @@ package disklayout
 // SuperBlock32Bit implements SuperBlock and represents the 32-bit version of
 // the ext4_super_block struct in fs/ext4/ext4.h. Should be used only if
 // RevLevel = DynamicRev and 64-bit feature is disabled.
+//
+// +marshal
 type SuperBlock32Bit struct {
 	// We embed the old superblock struct here because the 32-bit version is just
 	// an extension of the old version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
index 7c1053fb4..2c9039327 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_64.go
@@ -19,6 +19,8 @@ package disklayout
 // 1024 bytes (smallest possible block size) and hence the superblock always
 // fits in no more than one data block. Should only be used when the 64-bit
 // feature is set.
+//
+// +marshal
 type SuperBlock64Bit struct {
 	// We embed the 32-bit struct here because 64-bit version is just an extension
 	// of the 32-bit version.
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
index 9221e0251..e4709f23c 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_old.go
@@ -16,6 +16,8 @@ package disklayout
 
 // SuperBlockOld implements SuperBlock and represents the old version of the
 // superblock struct. Should be used only if RevLevel = OldRev.
+//
+// +marshal
 type SuperBlockOld struct {
 	InodesCountRaw      uint32
 	BlocksCountLo       uint32
diff --git a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
index 463b5ba21..b734b6987 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/superblock_test.go
@@ -21,7 +21,10 @@ import (
 // TestSuperBlockSize tests that the superblock structs are of the correct
 // size.
 func TestSuperBlockSize(t *testing.T) {
-	assertSize(t, SuperBlockOld{}, 84)
-	assertSize(t, SuperBlock32Bit{}, 336)
-	assertSize(t, SuperBlock64Bit{}, 1024)
+	var sbOld SuperBlockOld
+	assertSize(t, &sbOld, 84)
+	var sb32 SuperBlock32Bit
+	assertSize(t, &sb32, 336)
+	var sb64 SuperBlock64Bit
+	assertSize(t, &sb64, 1024)
 }
diff --git a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
index 9c63f04c0..a4bc08411 100644
--- a/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
+++ b/pkg/sentry/fsimpl/ext/disklayout/test_utils.go
@@ -18,13 +18,13 @@ import (
 	"reflect"
 	"testing"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal"
 )
 
-func assertSize(t *testing.T, v interface{}, want uintptr) {
+func assertSize(t *testing.T, v marshal.Marshallable, want int) {
 	t.Helper()
 
-	if got := binary.Size(v); got != want {
+	if got := v.SizeBytes(); got != want {
 		t.Errorf("struct %s should be exactly %d bytes but is %d bytes", reflect.TypeOf(v).Name(), want, got)
 	}
 }
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index 08ffc2834..38fb7962b 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -34,11 +34,10 @@ import (
 const Name = "ext"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
-// Compiles only if FilesystemType implements vfs.FilesystemType.
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
 // getDeviceFd returns an io.ReaderAt to the underlying device.
 // Currently there are two ways of mounting an ext(2/3/4) fs:
 //   1. Specify a mount with our internal special MountType in the OCI spec.
@@ -99,6 +98,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 2dbaee287..d9fd4590c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -71,13 +71,18 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
 	}
 
 	root := mntns.Root()
+	root.IncRef()
 
 	tearDown := func() {
 		root.DecRef(ctx)
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index c36225a7c..778460107 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -18,12 +18,13 @@ import (
 	"io"
 	"sort"
 
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // extentFile is a type of regular file which uses extents to store file data.
+//
+// +stateify savable
 type extentFile struct {
 	regFile regularFile
 
@@ -58,7 +59,7 @@ func newExtentFile(args inodeArgs) (*extentFile, error) {
 func (f *extentFile) buildExtTree() error {
 	rootNodeData := f.regFile.inode.diskInode.Data()
 
-	binary.Unmarshal(rootNodeData[:disklayout.ExtentHeaderSize], binary.LittleEndian, &f.root.Header)
+	f.root.Header.UnmarshalBytes(rootNodeData[:disklayout.ExtentHeaderSize])
 
 	// Root node can not have more than 4 entries: 60 bytes = 1 header + 4 entries.
 	if f.root.Header.NumEntries > 4 {
@@ -77,7 +78,7 @@ func (f *extentFile) buildExtTree() error {
 			// Internal node.
 			curEntry = &disklayout.ExtentIdx{}
 		}
-		binary.Unmarshal(rootNodeData[off:off+disklayout.ExtentEntrySize], binary.LittleEndian, curEntry)
+		curEntry.UnmarshalBytes(rootNodeData[off : off+disklayout.ExtentEntrySize])
 		f.root.Entries[i].Entry = curEntry
 	}
 
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index cd10d46ee..985f76ac0 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -21,7 +21,6 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 )
 
@@ -202,13 +201,14 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
 // writeTree writes the tree represented by `root` to the inode and disk. It
 // also writes random file data on disk.
 func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBlkSize uint64) []byte {
-	rootData := binary.Marshal(nil, binary.LittleEndian, root.Header)
+	rootData := in.diskInode.Data()
+	root.Header.MarshalBytes(rootData)
+	off := root.Header.SizeBytes()
 	for _, ep := range root.Entries {
-		rootData = binary.Marshal(rootData, binary.LittleEndian, ep.Entry)
+		ep.Entry.MarshalBytes(rootData[off:])
+		off += ep.Entry.SizeBytes()
 	}
 
-	copy(in.diskInode.Data(), rootData)
-
 	var fileData []byte
 	for _, ep := range root.Entries {
 		if root.Header.Height == 0 {
@@ -223,13 +223,14 @@ func writeTree(in *inode, disk []byte, root *disklayout.ExtentNode, mockExtentBl
 // writeTreeToDisk is the recursive step for writeTree which writes the tree
 // on the disk only. Also writes random file data on disk.
 func writeTreeToDisk(disk []byte, curNode disklayout.ExtentEntryPair) []byte {
-	nodeData := binary.Marshal(nil, binary.LittleEndian, curNode.Node.Header)
+	nodeData := disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:]
+	curNode.Node.Header.MarshalBytes(nodeData)
+	off := curNode.Node.Header.SizeBytes()
 	for _, ep := range curNode.Node.Entries {
-		nodeData = binary.Marshal(nodeData, binary.LittleEndian, ep.Entry)
+		ep.Entry.MarshalBytes(nodeData[off:])
+		off += ep.Entry.SizeBytes()
 	}
 
-	copy(disk[curNode.Entry.PhysicalBlock()*mockExtentBlkSize:], nodeData)
-
 	var fileData []byte
 	for _, ep := range curNode.Node.Entries {
 		if curNode.Node.Header.Height == 0 {
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index c714ddf73..917f1873d 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -38,11 +38,13 @@ var (
 )
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
 	// mu serializes changes to the Dentry tree.
-	mu sync.RWMutex
+	mu sync.RWMutex `state:"nosave"`
 
 	// dev represents the underlying fs device. It does not require protection
 	// because io.ReaderAt permits concurrent read calls to it. It translates to
@@ -81,9 +83,9 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
-//     - !rp.Done().
-//     - inode == vfsd.Impl().(*Dentry).inode.
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
+// * inode == vfsd.Impl().(*Dentry).inode.
 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
 	if !inode.isDir() {
 		return nil, nil, syserror.ENOTDIR
@@ -166,7 +168,7 @@ func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, in
 // walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
+// * filesystem.mu must be locked (for writing if write param is true).
 func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
@@ -194,8 +196,8 @@ func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.De
 // walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
 //
 // Preconditions:
-//     - filesystem.mu must be locked (for writing if write param is true).
-//     - !rp.Done().
+// * filesystem.mu must be locked (for writing if write param is true).
+// * !rp.Done().
 func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
 	vfsd := rp.Start()
 	inode := vfsd.Impl().(*dentry).inode
@@ -490,7 +492,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	_, inode, err := fs.walk(ctx, rp, false)
 	if err != nil {
@@ -504,8 +506,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return nil, err
@@ -513,8 +515,8 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	return nil, syserror.ENOTSUP
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return "", err
@@ -522,8 +524,8 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
@@ -531,8 +533,8 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return syserror.ENOTSUP
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	_, _, err := fs.walk(ctx, rp, false)
 	if err != nil {
 		return err
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 30636cf66..9009ba3c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -37,6 +37,8 @@ import (
 //           |-- regular--
 //                       |-- extent file
 //                       |-- block map file
+//
+// +stateify savable
 type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory operations.
 	refs int64
diff --git a/pkg/sentry/fsimpl/ext/regular_file.go b/pkg/sentry/fsimpl/ext/regular_file.go
index e73e740d6..4a5539b37 100644
--- a/pkg/sentry/fsimpl/ext/regular_file.go
+++ b/pkg/sentry/fsimpl/ext/regular_file.go
@@ -31,6 +31,8 @@ import (
 // regularFile represents a regular file's inode. This too follows the
 // inheritance pattern prevelant in the vfs layer described in
 // pkg/sentry/vfs/README.md.
+//
+// +stateify savable
 type regularFile struct {
 	inode inode
 
@@ -67,6 +69,8 @@ func (in *inode) isRegular() bool {
 
 // directoryFD represents a directory file description. It implements
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type regularFileFD struct {
 	fileDescription
 	vfs.LockFD
@@ -75,7 +79,7 @@ type regularFileFD struct {
 	off int64
 
 	// offMu serializes operations that may mutate off.
-	offMu sync.Mutex
+	offMu sync.Mutex `state:"nosave"`
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
diff --git a/pkg/sentry/fsimpl/ext/symlink.go b/pkg/sentry/fsimpl/ext/symlink.go
index 2fd0d1fa8..5e2bcc837 100644
--- a/pkg/sentry/fsimpl/ext/symlink.go
+++ b/pkg/sentry/fsimpl/ext/symlink.go
@@ -23,6 +23,8 @@ import (
 )
 
 // symlink represents a symlink inode.
+//
+// +stateify savable
 type symlink struct {
 	inode  inode
 	target string // immutable
@@ -61,9 +63,11 @@ func (in *inode) isSymlink() bool {
 	return ok
 }
 
-// symlinkFD represents a symlink file description and implements implements
+// symlinkFD represents a symlink file description and implements
 // vfs.FileDescriptionImpl. which may only be used if open options contains
 // O_PATH. For this reason most of the functions return EBADF.
+//
+// +stateify savable
 type symlinkFD struct {
 	fileDescription
 	vfs.NoLockFD
diff --git a/pkg/sentry/fsimpl/ext/utils.go b/pkg/sentry/fsimpl/ext/utils.go
index d8b728f8c..58ef7b9b8 100644
--- a/pkg/sentry/fsimpl/ext/utils.go
+++ b/pkg/sentry/fsimpl/ext/utils.go
@@ -17,21 +17,21 @@ package ext
 import (
 	"io"
 
-	"gvisor.dev/gvisor/pkg/binary"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // readFromDisk performs a binary read from disk into the given struct from
 // the absolute offset provided.
-func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
-	n := binary.Size(v)
+func readFromDisk(dev io.ReaderAt, abOff int64, v marshal.Marshallable) error {
+	n := v.SizeBytes()
 	buf := make([]byte, n)
 	if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
 		return syserror.EIO
 	}
 
-	binary.Unmarshal(buf, binary.LittleEndian, v)
+	v.UnmarshalBytes(buf)
 	return nil
 }
 
diff --git a/pkg/sentry/fsimpl/fuse/BUILD b/pkg/sentry/fsimpl/fuse/BUILD
index 999111deb..2158b1bbc 100644
--- a/pkg/sentry/fsimpl/fuse/BUILD
+++ b/pkg/sentry/fsimpl/fuse/BUILD
@@ -15,21 +15,42 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "inode_refs",
+    out = "inode_refs.go",
+    package = "fuse",
+    prefix = "inode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "inode",
+    },
+)
+
 go_library(
     name = "fuse",
     srcs = [
         "connection.go",
+        "connection_control.go",
         "dev.go",
+        "directory.go",
+        "file.go",
         "fusefs.go",
-        "init.go",
+        "inode_refs.go",
+        "read_write.go",
         "register.go",
+        "regular_file.go",
         "request_list.go",
+        "request_response.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/log",
+        "//pkg/marshal",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
+        "//pkg/safemem",
         "//pkg/sentry/fsimpl/devtmpfs",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
@@ -39,7 +60,6 @@ go_library(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
         "@org_golang_x_sys//unix:go_default_library",
     ],
 )
@@ -47,10 +67,15 @@ go_library(
 go_test(
     name = "fuse_test",
     size = "small",
-    srcs = ["dev_test.go"],
+    srcs = [
+        "connection_test.go",
+        "dev_test.go",
+        "utils_test.go",
+    ],
     library = ":fuse",
     deps = [
         "//pkg/abi/linux",
+        "//pkg/marshal",
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
@@ -58,6 +83,5 @@ go_test(
         "//pkg/syserror",
         "//pkg/usermem",
         "//pkg/waiter",
-        "//tools/go_marshal/marshal",
     ],
 )
diff --git a/pkg/sentry/fsimpl/fuse/connection.go b/pkg/sentry/fsimpl/fuse/connection.go
index 6df2728ab..8ccda1264 100644
--- a/pkg/sentry/fsimpl/fuse/connection.go
+++ b/pkg/sentry/fsimpl/fuse/connection.go
@@ -15,31 +15,17 @@
 package fuse
 
 import (
-	"errors"
-	"fmt"
 	"sync"
-	"sync/atomic"
-	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
-// maxActiveRequestsDefault is the default setting controlling the upper bound
-// on the number of active requests at any given time.
-const maxActiveRequestsDefault = 10000
-
-// Ordinary requests have even IDs, while interrupts IDs are odd.
-// Used to increment the unique ID for each FUSE request.
-var reqIDStep uint64 = 2
-
 const (
 	// fuseDefaultMaxBackground is the default value for MaxBackground.
 	fuseDefaultMaxBackground = 12
@@ -52,43 +38,39 @@ const (
 	fuseDefaultMaxPagesPerReq = 32
 )
 
-// Request represents a FUSE operation request that hasn't been sent to the
-// server yet.
+// connection is the struct by which the sentry communicates with the FUSE server daemon.
 //
-// +stateify savable
-type Request struct {
-	requestEntry
-
-	id   linux.FUSEOpID
-	hdr  *linux.FUSEHeaderIn
-	data []byte
-}
-
-// Response represents an actual response from the server, including the
-// response payload.
+// Lock order:
+// - conn.fd.mu
+// - conn.mu
+// - conn.asyncMu
 //
 // +stateify savable
-type Response struct {
-	opcode linux.FUSEOpcode
-	hdr    linux.FUSEHeaderOut
-	data   []byte
-}
-
-// connection is the struct by which the sentry communicates with the FUSE server daemon.
 type connection struct {
 	fd *DeviceFD
 
+	// mu protects access to struct memebers.
+	mu sync.Mutex `state:"nosave"`
+
+	// attributeVersion is the version of connection's attributes.
+	attributeVersion uint64
+
+	// We target FUSE 7.23.
 	// The following FUSE_INIT flags are currently unsupported by this implementation:
-	// - FUSE_ATOMIC_O_TRUNC: requires open(..., O_TRUNC)
 	// - FUSE_EXPORT_SUPPORT
-	// - FUSE_HANDLE_KILLPRIV
 	// - FUSE_POSIX_LOCKS: requires POSIX locks
 	// - FUSE_FLOCK_LOCKS: requires POSIX locks
 	// - FUSE_AUTO_INVAL_DATA: requires page caching eviction
-	// - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction
 	// - FUSE_DO_READDIRPLUS/FUSE_READDIRPLUS_AUTO: requires FUSE_READDIRPLUS implementation
 	// - FUSE_ASYNC_DIO
-	// - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler
+	// - FUSE_PARALLEL_DIROPS (7.25)
+	// - FUSE_HANDLE_KILLPRIV (7.26)
+	// - FUSE_POSIX_ACL: affects defaultPermissions, posixACL, xattr handler (7.26)
+	// - FUSE_ABORT_ERROR (7.27)
+	// - FUSE_CACHE_SYMLINKS (7.28)
+	// - FUSE_NO_OPENDIR_SUPPORT (7.29)
+	// - FUSE_EXPLICIT_INVAL_DATA: requires page caching eviction (7.30)
+	// - FUSE_MAP_ALIGNMENT (7.31)
 
 	// initialized after receiving FUSE_INIT reply.
 	// Until it's set, suspend sending FUSE requests.
@@ -96,11 +78,7 @@ type connection struct {
 	initialized int32
 
 	// initializedChan is used to block requests before initialization.
-	initializedChan chan struct{}
-
-	// blocked when there are too many outstading backgrounds requests (NumBackground == MaxBackground).
-	// TODO(gvisor.dev/issue/3185): update the numBackground accordingly; use a channel to block.
-	blocked bool
+	initializedChan chan struct{} `state:".(bool)"`
 
 	// connected (connection established) when a new FUSE file system is created.
 	// Set to false when:
@@ -109,48 +87,55 @@ type connection struct {
 	//   device release.
 	connected bool
 
-	// aborted via sysfs.
-	// TODO(gvisor.dev/issue/3185): abort all queued requests.
-	aborted bool
-
 	// connInitError if FUSE_INIT encountered error (major version mismatch).
 	// Only set in INIT.
 	connInitError bool
 
 	// connInitSuccess if FUSE_INIT is successful.
 	// Only set in INIT.
-	// Used for destory.
+	// Used for destory (not yet implemented).
 	connInitSuccess bool
 
-	// TODO(gvisor.dev/issue/3185): All the queue logic are working in progress.
-
-	// NumberBackground is the number of requests in the background.
-	numBackground uint16
+	// aborted via sysfs, and will send ECONNABORTED to read after disconnection (instead of ENODEV).
+	// Set only if abortErr is true and via fuse control fs (not yet implemented).
+	// TODO(gvisor.dev/issue/3525): set this to true when user aborts.
+	aborted bool
 
-	// congestionThreshold for NumBackground.
-	// Negotiated in FUSE_INIT.
-	congestionThreshold uint16
+	// numWating is the number of requests waiting to be
+	// sent to FUSE device or being processed by FUSE daemon.
+	numWaiting uint32
 
-	// maxBackground is the maximum number of NumBackground.
-	// Block connection when it is reached.
-	// Negotiated in FUSE_INIT.
-	maxBackground uint16
+	// Terminology note:
+	//
+	// - `asyncNumMax` is the `MaxBackground` in the FUSE_INIT_IN struct.
+	//
+	// - `asyncCongestionThreshold` is the `CongestionThreshold` in the FUSE_INIT_IN struct.
+	//
+	// We call the "background" requests in unix term as async requests.
+	// The "async requests" in unix term is our async requests that expect a reply,
+	// i.e. `!request.noReply`
 
-	// numActiveBackground is the number of requests in background and has being marked as active.
-	numActiveBackground uint16
+	// asyncMu protects the async request fields.
+	asyncMu sync.Mutex `state:"nosave"`
 
-	// numWating is the number of requests waiting for completion.
-	numWaiting uint32
+	// asyncNum is the number of async requests.
+	// Protected by asyncMu.
+	asyncNum uint16
 
-	// TODO(gvisor.dev/issue/3185): BgQueue
-	// some queue for background queued requests.
+	// asyncCongestionThreshold the number of async requests.
+	// Negotiated in FUSE_INIT as "CongestionThreshold".
+	// TODO(gvisor.dev/issue/3529): add congestion control.
+	// Protected by asyncMu.
+	asyncCongestionThreshold uint16
 
-	// bgLock protects:
-	// MaxBackground, CongestionThreshold, NumBackground,
-	// NumActiveBackground, BgQueue, Blocked.
-	bgLock sync.Mutex
+	// asyncNumMax is the maximum number of asyncNum.
+	// Connection blocks the async requests when it is reached.
+	// Negotiated in FUSE_INIT as "MaxBackground".
+	// Protected by asyncMu.
+	asyncNumMax uint16
 
 	// maxRead is the maximum size of a read buffer in in bytes.
+	// Initialized from a fuse fs parameter.
 	maxRead uint32
 
 	// maxWrite is the maximum size of a write buffer in bytes.
@@ -165,23 +150,20 @@ type connection struct {
 	// Negotiated and only set in INIT.
 	minor uint32
 
-	// asyncRead if read pages asynchronously.
+	// atomicOTrunc is true when FUSE does not send a separate SETATTR request
+	// before open with O_TRUNC flag.
 	// Negotiated and only set in INIT.
-	asyncRead bool
+	atomicOTrunc bool
 
-	// abortErr is true if kernel need to return an unique read error after abort.
+	// asyncRead if read pages asynchronously.
 	// Negotiated and only set in INIT.
-	abortErr bool
+	asyncRead bool
 
 	// writebackCache is true for write-back cache policy,
 	// false for write-through policy.
 	// Negotiated and only set in INIT.
 	writebackCache bool
 
-	// cacheSymlinks if filesystem needs to cache READLINK responses in page cache.
-	// Negotiated and only set in INIT.
-	cacheSymlinks bool
-
 	// bigWrites if doing multi-page cached writes.
 	// Negotiated and only set in INIT.
 	bigWrites bool
@@ -189,116 +171,86 @@ type connection struct {
 	// dontMask if filestestem does not apply umask to creation modes.
 	// Negotiated in INIT.
 	dontMask bool
+
+	// noOpen if FUSE server doesn't support open operation.
+	// This flag only influence performance, not correctness of the program.
+	noOpen bool
+}
+
+func (conn *connection) saveInitializedChan() bool {
+	select {
+	case <-conn.initializedChan:
+		return true // Closed.
+	default:
+		return false // Not closed.
+	}
+}
+
+func (conn *connection) loadInitializedChan(closed bool) {
+	conn.initializedChan = make(chan struct{}, 1)
+	if closed {
+		close(conn.initializedChan)
+	}
 }
 
 // newFUSEConnection creates a FUSE connection to fd.
-func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, maxInFlightRequests uint64) (*connection, error) {
+func newFUSEConnection(_ context.Context, fd *vfs.FileDescription, opts *filesystemOptions) (*connection, error) {
 	// Mark the device as ready so it can be used. /dev/fuse can only be used if the FD was used to
 	// mount a FUSE filesystem.
 	fuseFD := fd.Impl().(*DeviceFD)
-	fuseFD.mounted = true
 
 	// Create the writeBuf for the header to be stored in.
 	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
 	fuseFD.writeBuf = make([]byte, hdrLen)
 	fuseFD.completions = make(map[linux.FUSEOpID]*futureResponse)
-	fuseFD.fullQueueCh = make(chan struct{}, maxInFlightRequests)
+	fuseFD.fullQueueCh = make(chan struct{}, opts.maxActiveRequests)
 	fuseFD.writeCursor = 0
 
 	return &connection{
-		fd:                  fuseFD,
-		maxBackground:       fuseDefaultMaxBackground,
-		congestionThreshold: fuseDefaultCongestionThreshold,
-		maxPages:            fuseDefaultMaxPagesPerReq,
-		initializedChan:     make(chan struct{}),
-		connected:           true,
-	}, nil
-}
-
-// SetInitialized atomically sets the connection as initialized.
-func (conn *connection) SetInitialized() {
-	// Unblock the requests sent before INIT.
-	close(conn.initializedChan)
-
-	// Close the channel first to avoid the non-atomic situation
-	// where conn.initialized is true but there are
-	// tasks being blocked on the channel.
-	// And it prevents the newer tasks from gaining
-	// unnecessary higher chance to be issued before the blocked one.
-
-	atomic.StoreInt32(&(conn.initialized), int32(1))
-}
-
-// IsInitialized atomically check if the connection is initialized.
-// pairs with SetInitialized().
-func (conn *connection) Initialized() bool {
-	return atomic.LoadInt32(&(conn.initialized)) != 0
-}
-
-// NewRequest creates a new request that can be sent to the FUSE server.
-func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
-	conn.fd.mu.Lock()
-	defer conn.fd.mu.Unlock()
-	conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
-
-	hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
-	hdr := linux.FUSEHeaderIn{
-		Len:    uint32(hdrLen + payload.SizeBytes()),
-		Opcode: opcode,
-		Unique: conn.fd.nextOpID,
-		NodeID: ino,
-		UID:    uint32(creds.EffectiveKUID),
-		GID:    uint32(creds.EffectiveKGID),
-		PID:    pid,
-	}
-
-	buf := make([]byte, hdr.Len)
-	hdr.MarshalUnsafe(buf[:hdrLen])
-	payload.MarshalUnsafe(buf[hdrLen:])
-
-	return &Request{
-		id:   hdr.Unique,
-		hdr:  &hdr,
-		data: buf,
+		fd:                       fuseFD,
+		asyncNumMax:              fuseDefaultMaxBackground,
+		asyncCongestionThreshold: fuseDefaultCongestionThreshold,
+		maxRead:                  opts.maxRead,
+		maxPages:                 fuseDefaultMaxPagesPerReq,
+		initializedChan:          make(chan struct{}),
+		connected:                true,
 	}, nil
 }
 
-// Call makes a request to the server and blocks the invoking task until a
-// server responds with a response. Task should never be nil.
-// Requests will not be sent before the connection is initialized.
-// For async tasks, use CallAsync().
-func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
-	// Block requests sent before connection is initalized.
-	if !conn.Initialized() {
-		if err := t.Block(conn.initializedChan); err != nil {
-			return nil, err
-		}
-	}
-
-	return conn.call(t, r)
+// CallAsync makes an async (aka background) request.
+// It's a simple wrapper around Call().
+func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+	r.async = true
+	_, err := conn.Call(t, r)
+	return err
 }
 
-// CallAsync makes an async (aka background) request.
-// Those requests either do not expect a response (e.g. release) or
-// the response should be handled by others (e.g. init).
-// Return immediately unless the connection is blocked (before initialization).
-// Async call example: init, release, forget, aio, interrupt.
+// Call makes a request to the server.
+// Block before the connection is initialized.
 // When the Request is FUSE_INIT, it will not be blocked before initialization.
-func (conn *connection) CallAsync(t *kernel.Task, r *Request) error {
+// Task should never be nil.
+//
+// For a sync request, it blocks the invoking task until
+// a server responds with a response.
+//
+// For an async request (that do not expect a response immediately),
+// it returns directly unless being blocked either before initialization
+// or when there are too many async requests ongoing.
+//
+// Example for async request:
+// init, readahead, write, async read/write, fuse_notify_reply,
+// non-sync release, interrupt, forget.
+//
+// The forget request does not have a reply,
+// as documented in include/uapi/linux/fuse.h:FUSE_FORGET.
+func (conn *connection) Call(t *kernel.Task, r *Request) (*Response, error) {
 	// Block requests sent before connection is initalized.
 	if !conn.Initialized() && r.hdr.Opcode != linux.FUSE_INIT {
 		if err := t.Block(conn.initializedChan); err != nil {
-			return err
+			return nil, err
 		}
 	}
 
-	// This should be the only place that invokes call() with a nil task.
-	_, err := conn.call(nil, r)
-	return err
-}
-
-// call makes a call without blocking checks.
-func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
 	if !conn.connected {
 		return nil, syserror.ENOTCONN
 	}
@@ -315,31 +267,6 @@ func (conn *connection) call(t *kernel.Task, r *Request) (*Response, error) {
 	return fut.resolve(t)
 }
 
-// Error returns the error of the FUSE call.
-func (r *Response) Error() error {
-	errno := r.hdr.Error
-	if errno >= 0 {
-		return nil
-	}
-
-	sysErrNo := syscall.Errno(-errno)
-	return error(sysErrNo)
-}
-
-// UnmarshalPayload unmarshals the response data into m.
-func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
-	hdrLen := r.hdr.SizeBytes()
-	haveDataLen := r.hdr.Len - uint32(hdrLen)
-	wantDataLen := uint32(m.SizeBytes())
-
-	if haveDataLen < wantDataLen {
-		return fmt.Errorf("payload too small. Minimum data lenth required: %d,  but got data length %d", wantDataLen, haveDataLen)
-	}
-
-	m.UnmarshalUnsafe(r.data[hdrLen:])
-	return nil
-}
-
 // callFuture makes a request to the server and returns a future response.
 // Call resolve() when the response needs to be fulfilled.
 func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse, error) {
@@ -358,11 +285,6 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
 	// if there are always too many ongoing requests all the time. The
 	// supported maxActiveRequests setting should be really high to avoid this.
 	for conn.fd.numActiveRequests == conn.fd.fs.opts.maxActiveRequests {
-		if t == nil {
-			// Since there is no task that is waiting. We must error out.
-			return nil, errors.New("FUSE request queue full")
-		}
-
 		log.Infof("Blocking request %v from being queued. Too many active requests: %v",
 			r.id, conn.fd.numActiveRequests)
 		conn.fd.mu.Unlock()
@@ -378,9 +300,19 @@ func (conn *connection) callFuture(t *kernel.Task, r *Request) (*futureResponse,
 
 // callFutureLocked makes a request to the server and returns a future response.
 func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureResponse, error) {
+	// Check connected again holding conn.mu.
+	conn.mu.Lock()
+	if !conn.connected {
+		conn.mu.Unlock()
+		// we checked connected before,
+		// this must be due to aborted connection.
+		return nil, syserror.ECONNABORTED
+	}
+	conn.mu.Unlock()
+
 	conn.fd.queue.PushBack(r)
-	conn.fd.numActiveRequests += 1
-	fut := newFutureResponse(r.hdr.Opcode)
+	conn.fd.numActiveRequests++
+	fut := newFutureResponse(r)
 	conn.fd.completions[r.id] = fut
 
 	// Signal the readers that there is something to read.
@@ -388,50 +320,3 @@ func (conn *connection) callFutureLocked(t *kernel.Task, r *Request) (*futureRes
 
 	return fut, nil
 }
-
-// futureResponse represents an in-flight request, that may or may not have
-// completed yet. Convert it to a resolved Response by calling Resolve, but note
-// that this may block.
-//
-// +stateify savable
-type futureResponse struct {
-	opcode linux.FUSEOpcode
-	ch     chan struct{}
-	hdr    *linux.FUSEHeaderOut
-	data   []byte
-}
-
-// newFutureResponse creates a future response to a FUSE request.
-func newFutureResponse(opcode linux.FUSEOpcode) *futureResponse {
-	return &futureResponse{
-		opcode: opcode,
-		ch:     make(chan struct{}),
-	}
-}
-
-// resolve blocks the task until the server responds to its corresponding request,
-// then returns a resolved response.
-func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
-	// If there is no Task associated with this request  - then we don't try to resolve
-	// the response.  Instead, the task writing the response (proxy to the server) will
-	// process the response on our behalf.
-	if t == nil {
-		log.Infof("fuse.Response.resolve: Not waiting on a response from server.")
-		return nil, nil
-	}
-
-	if err := t.Block(f.ch); err != nil {
-		return nil, err
-	}
-
-	return f.getResponse(), nil
-}
-
-// getResponse creates a Response from the data the futureResponse has.
-func (f *futureResponse) getResponse() *Response {
-	return &Response{
-		opcode: f.opcode,
-		hdr:    *f.hdr,
-		data:   f.data,
-	}
-}
diff --git a/pkg/sentry/fsimpl/fuse/init.go b/pkg/sentry/fsimpl/fuse/connection_control.go
index 779c2bd3f..bfde78559 100644
--- a/pkg/sentry/fsimpl/fuse/init.go
+++ b/pkg/sentry/fsimpl/fuse/connection_control.go
@@ -15,7 +15,11 @@
 package fuse
 
 import (
+	"sync/atomic"
+	"syscall"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
@@ -29,9 +33,10 @@ const (
 	// Follow the same behavior as unix fuse implementation.
 	fuseMaxTimeGranNs = 1000000000
 
-	// Minimum value for MaxWrite.
+	// Minimum value for MaxWrite and MaxRead.
 	// Follow the same behavior as unix fuse implementation.
 	fuseMinMaxWrite = 4096
+	fuseMinMaxRead  = 4096
 
 	// Temporary default value for max readahead, 128kb.
 	fuseDefaultMaxReadahead = 131072
@@ -49,6 +54,26 @@ var (
 	MaxUserCongestionThreshold uint16 = fuseDefaultCongestionThreshold
 )
 
+// SetInitialized atomically sets the connection as initialized.
+func (conn *connection) SetInitialized() {
+	// Unblock the requests sent before INIT.
+	close(conn.initializedChan)
+
+	// Close the channel first to avoid the non-atomic situation
+	// where conn.initialized is true but there are
+	// tasks being blocked on the channel.
+	// And it prevents the newer tasks from gaining
+	// unnecessary higher chance to be issued before the blocked one.
+
+	atomic.StoreInt32(&(conn.initialized), int32(1))
+}
+
+// IsInitialized atomically check if the connection is initialized.
+// pairs with SetInitialized().
+func (conn *connection) Initialized() bool {
+	return atomic.LoadInt32(&(conn.initialized)) != 0
+}
+
 // InitSend sends a FUSE_INIT request.
 func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
 	in := linux.FUSEInitIn{
@@ -70,29 +95,31 @@ func (conn *connection) InitSend(creds *auth.Credentials, pid uint32) error {
 }
 
 // InitRecv receives a FUSE_INIT reply and process it.
+//
+// Preconditions: conn.asyncMu must not be held if minor verion is newer than 13.
 func (conn *connection) InitRecv(res *Response, hasSysAdminCap bool) error {
 	if err := res.Error(); err != nil {
 		return err
 	}
 
-	var out linux.FUSEInitOut
-	if err := res.UnmarshalPayload(&out); err != nil {
+	initRes := fuseInitRes{initLen: res.DataLen()}
+	if err := res.UnmarshalPayload(&initRes); err != nil {
 		return err
 	}
 
-	return conn.initProcessReply(&out, hasSysAdminCap)
+	return conn.initProcessReply(&initRes.initOut, hasSysAdminCap)
 }
 
 // Process the FUSE_INIT reply from the FUSE server.
+// It tries to acquire the conn.asyncMu lock if minor version is newer than 13.
 func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap bool) error {
+	// No matter error or not, always set initialzied.
+	// to unblock the blocked requests.
+	defer conn.SetInitialized()
+
 	// No support for old major fuse versions.
 	if out.Major != linux.FUSE_KERNEL_VERSION {
 		conn.connInitError = true
-
-		// Set the connection as initialized and unblock the blocked requests
-		// (i.e. return error for them).
-		conn.SetInitialized()
-
 		return nil
 	}
 
@@ -100,29 +127,14 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 	conn.connInitSuccess = true
 	conn.minor = out.Minor
 
-	// No support for limits before minor version 13.
-	if out.Minor >= 13 {
-		conn.bgLock.Lock()
-
-		if out.MaxBackground > 0 {
-			conn.maxBackground = out.MaxBackground
-
-			if !hasSysAdminCap &&
-				conn.maxBackground > MaxUserBackgroundRequest {
-				conn.maxBackground = MaxUserBackgroundRequest
-			}
-		}
-
-		if out.CongestionThreshold > 0 {
-			conn.congestionThreshold = out.CongestionThreshold
-
-			if !hasSysAdminCap &&
-				conn.congestionThreshold > MaxUserCongestionThreshold {
-				conn.congestionThreshold = MaxUserCongestionThreshold
-			}
-		}
-
-		conn.bgLock.Unlock()
+	// No support for negotiating MaxWrite before minor version 5.
+	if out.Minor >= 5 {
+		conn.maxWrite = out.MaxWrite
+	} else {
+		conn.maxWrite = fuseMinMaxWrite
+	}
+	if conn.maxWrite < fuseMinMaxWrite {
+		conn.maxWrite = fuseMinMaxWrite
 	}
 
 	// No support for the following flags before minor version 6.
@@ -131,8 +143,6 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 		conn.bigWrites = out.Flags&linux.FUSE_BIG_WRITES != 0
 		conn.dontMask = out.Flags&linux.FUSE_DONT_MASK != 0
 		conn.writebackCache = out.Flags&linux.FUSE_WRITEBACK_CACHE != 0
-		conn.cacheSymlinks = out.Flags&linux.FUSE_CACHE_SYMLINKS != 0
-		conn.abortErr = out.Flags&linux.FUSE_ABORT_ERROR != 0
 
 		// TODO(gvisor.dev/issue/3195): figure out how to use TimeGran (0 < TimeGran <= fuseMaxTimeGranNs).
 
@@ -148,19 +158,90 @@ func (conn *connection) initProcessReply(out *linux.FUSEInitOut, hasSysAdminCap
 		}
 	}
 
-	// No support for negotiating MaxWrite before minor version 5.
-	if out.Minor >= 5 {
-		conn.maxWrite = out.MaxWrite
-	} else {
-		conn.maxWrite = fuseMinMaxWrite
+	// No support for limits before minor version 13.
+	if out.Minor >= 13 {
+		conn.asyncMu.Lock()
+
+		if out.MaxBackground > 0 {
+			conn.asyncNumMax = out.MaxBackground
+
+			if !hasSysAdminCap &&
+				conn.asyncNumMax > MaxUserBackgroundRequest {
+				conn.asyncNumMax = MaxUserBackgroundRequest
+			}
+		}
+
+		if out.CongestionThreshold > 0 {
+			conn.asyncCongestionThreshold = out.CongestionThreshold
+
+			if !hasSysAdminCap &&
+				conn.asyncCongestionThreshold > MaxUserCongestionThreshold {
+				conn.asyncCongestionThreshold = MaxUserCongestionThreshold
+			}
+		}
+
+		conn.asyncMu.Unlock()
 	}
-	if conn.maxWrite < fuseMinMaxWrite {
-		conn.maxWrite = fuseMinMaxWrite
+
+	return nil
+}
+
+// Abort this FUSE connection.
+// It tries to acquire conn.fd.mu, conn.lock, conn.bgLock in order.
+// All possible requests waiting or blocking will be aborted.
+//
+// Preconditions: conn.fd.mu is locked.
+func (conn *connection) Abort(ctx context.Context) {
+	conn.mu.Lock()
+	conn.asyncMu.Lock()
+
+	if !conn.connected {
+		conn.asyncMu.Unlock()
+		conn.mu.Unlock()
+		conn.fd.mu.Unlock()
+		return
 	}
 
-	// Set connection as initialized and unblock the requests
-	// issued before init.
-	conn.SetInitialized()
+	conn.connected = false
 
-	return nil
+	// Empty the `fd.queue` that holds the requests
+	// not yet read by the FUSE daemon yet.
+	// These are a subset of the requests in `fuse.completion` map.
+	for !conn.fd.queue.Empty() {
+		req := conn.fd.queue.Front()
+		conn.fd.queue.Remove(req)
+	}
+
+	var terminate []linux.FUSEOpID
+
+	// 2. Collect the requests have not been sent to FUSE daemon,
+	// or have not received a reply.
+	for unique := range conn.fd.completions {
+		terminate = append(terminate, unique)
+	}
+
+	// Release locks to avoid deadlock.
+	conn.asyncMu.Unlock()
+	conn.mu.Unlock()
+
+	// 1. The requets blocked before initialization.
+	// Will reach call() `connected` check and return.
+	if !conn.Initialized() {
+		conn.SetInitialized()
+	}
+
+	// 2. Terminate the requests collected above.
+	// Set ECONNABORTED error.
+	// sendError() will remove them from `fd.completion` map.
+	// Will enter the path of a normally received error.
+	for _, toTerminate := range terminate {
+		conn.fd.sendError(ctx, -int32(syscall.ECONNABORTED), toTerminate)
+	}
+
+	// 3. The requests not yet written to FUSE device.
+	// Early terminate.
+	// Will reach callFutureLocked() `connected` check and return.
+	close(conn.fd.fullQueueCh)
+
+	// TODO(gvisor.dev/issue/3528): Forget all pending forget reqs.
 }
diff --git a/pkg/sentry/fsimpl/fuse/connection_test.go b/pkg/sentry/fsimpl/fuse/connection_test.go
new file mode 100644
index 000000000..91d16c1cf
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/connection_test.go
@@ -0,0 +1,117 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"math/rand"
+	"syscall"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// TestConnectionInitBlock tests if initialization
+// correctly blocks and unblocks the connection.
+// Since it's unfeasible to test kernelTask.Block() in unit test,
+// the code in Call() are not tested here.
+func TestConnectionInitBlock(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+
+	conn, _, err := newTestConnection(s, k, maxActiveRequestsDefault)
+	if err != nil {
+		t.Fatalf("newTestConnection: %v", err)
+	}
+
+	select {
+	case <-conn.initializedChan:
+		t.Fatalf("initializedChan should be blocking before SetInitialized")
+	default:
+	}
+
+	conn.SetInitialized()
+
+	select {
+	case <-conn.initializedChan:
+	default:
+		t.Fatalf("initializedChan should not be blocking after SetInitialized")
+	}
+}
+
+func TestConnectionAbort(t *testing.T) {
+	s := setup(t)
+	defer s.Destroy()
+
+	k := kernel.KernelFromContext(s.Ctx)
+	creds := auth.CredentialsFromContext(s.Ctx)
+	task := kernel.TaskFromContext(s.Ctx)
+
+	const numRequests uint64 = 256
+
+	conn, _, err := newTestConnection(s, k, numRequests)
+	if err != nil {
+		t.Fatalf("newTestConnection: %v", err)
+	}
+
+	testObj := &testPayload{
+		data: rand.Uint32(),
+	}
+
+	var futNormal []*futureResponse
+
+	for i := 0; i < int(numRequests); i++ {
+		req, err := conn.NewRequest(creds, uint32(i), uint64(i), 0, testObj)
+		if err != nil {
+			t.Fatalf("NewRequest creation failed: %v", err)
+		}
+		fut, err := conn.callFutureLocked(task, req)
+		if err != nil {
+			t.Fatalf("callFutureLocked failed: %v", err)
+		}
+		futNormal = append(futNormal, fut)
+	}
+
+	conn.Abort(s.Ctx)
+
+	// Abort should unblock the initialization channel.
+	// Note: no test requests are actually blocked on `conn.initializedChan`.
+	select {
+	case <-conn.initializedChan:
+	default:
+		t.Fatalf("initializedChan should not be blocking after SetInitialized")
+	}
+
+	// Abort will return ECONNABORTED error to unblocked requests.
+	for _, fut := range futNormal {
+		if fut.getResponse().hdr.Error != -int32(syscall.ECONNABORTED) {
+			t.Fatalf("Incorrect error code received for aborted connection: %v", fut.getResponse().hdr.Error)
+		}
+	}
+
+	// After abort, Call() should return directly with ENOTCONN.
+	req, err := conn.NewRequest(creds, 0, 0, 0, testObj)
+	if err != nil {
+		t.Fatalf("NewRequest creation failed: %v", err)
+	}
+	_, err = conn.Call(task, req)
+	if err != syserror.ENOTCONN {
+		t.Fatalf("Incorrect error code received for Call() after connection aborted")
+	}
+
+}
diff --git a/pkg/sentry/fsimpl/fuse/dev.go b/pkg/sentry/fsimpl/fuse/dev.go
index e522ff9a0..1b86a4b4c 100644
--- a/pkg/sentry/fsimpl/fuse/dev.go
+++ b/pkg/sentry/fsimpl/fuse/dev.go
@@ -19,7 +19,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -32,6 +31,8 @@ import (
 const fuseDevMinor = 229
 
 // fuseDevice implements vfs.Device for /dev/fuse.
+//
+// +stateify savable
 type fuseDevice struct{}
 
 // Open implements vfs.Device.Open.
@@ -50,15 +51,14 @@ func (fuseDevice) Open(ctx context.Context, mnt *vfs.Mount, vfsd *vfs.Dentry, op
 }
 
 // DeviceFD implements vfs.FileDescriptionImpl for /dev/fuse.
+//
+// +stateify savable
 type DeviceFD struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
 	vfs.NoLockFD
 
-	// mounted specifies whether a FUSE filesystem was mounted using the DeviceFD.
-	mounted bool
-
 	// nextOpID is used to create new requests.
 	nextOpID linux.FUSEOpID
 
@@ -83,7 +83,7 @@ type DeviceFD struct {
 	writeCursorFR *futureResponse
 
 	// mu protects all the queues, maps, buffers and cursors and nextOpID.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// waitQueue is used to notify interested parties when the device becomes
 	// readable or writable.
@@ -92,21 +92,36 @@ type DeviceFD struct {
 	// fullQueueCh is a channel used to synchronize the readers with the writers.
 	// Writers (inbound requests to the filesystem) block if there are too many
 	// unprocessed in-flight requests.
-	fullQueueCh chan struct{}
+	fullQueueCh chan struct{} `state:".(int)"`
 
 	// fs is the FUSE filesystem that this FD is being used for.
 	fs *filesystem
 }
 
+func (fd *DeviceFD) saveFullQueueCh() int {
+	return cap(fd.fullQueueCh)
+}
+
+func (fd *DeviceFD) loadFullQueueCh(capacity int) {
+	fd.fullQueueCh = make(chan struct{}, capacity)
+}
+
 // Release implements vfs.FileDescriptionImpl.Release.
-func (fd *DeviceFD) Release(context.Context) {
-	fd.fs.conn.connected = false
+func (fd *DeviceFD) Release(ctx context.Context) {
+	if fd.fs != nil {
+		fd.fs.conn.mu.Lock()
+		fd.fs.conn.connected = false
+		fd.fs.conn.mu.Unlock()
+
+		fd.fs.VFSFilesystem().DecRef(ctx)
+		fd.fs = nil
+	}
 }
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
@@ -116,10 +131,16 @@ func (fd *DeviceFD) PRead(ctx context.Context, dst usermem.IOSequence, offset in
 // Read implements vfs.FileDescriptionImpl.Read.
 func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
+	// Return ENODEV if the filesystem is umounted.
+	if fd.fs.umounted {
+		// TODO(gvisor.dev/issue/3525): return ECONNABORTED if aborted via fuse control fs.
+		return 0, syserror.ENODEV
+	}
+
 	// We require that any Read done on this filesystem have a sane minimum
 	// read buffer. It must have the capacity for the fixed parts of any request
 	// header (Linux uses the request header and the FUSEWriteIn header for this
@@ -143,58 +164,82 @@ func (fd *DeviceFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.R
 }
 
 // readLocked implements the reading of the fuse device while locked with DeviceFD.mu.
+//
+// Preconditions: dst is large enough for any reasonable request.
 func (fd *DeviceFD) readLocked(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	if fd.queue.Empty() {
-		return 0, syserror.ErrWouldBlock
-	}
+	var req *Request
 
-	var readCursor uint32
-	var bytesRead int64
-	for {
-		req := fd.queue.Front()
-		if dst.NumBytes() < int64(req.hdr.Len) {
-			// The request is too large. Cannot process it. All requests must be smaller than the
-			// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
-			// handshake.
-			errno := -int32(syscall.EIO)
-			if req.hdr.Opcode == linux.FUSE_SETXATTR {
-				errno = -int32(syscall.E2BIG)
-			}
+	// Find the first valid request.
+	// For the normal case this loop only execute once.
+	for !fd.queue.Empty() {
+		req = fd.queue.Front()
 
-			// Return the error to the calling task.
-			if err := fd.sendError(ctx, errno, req); err != nil {
-				return 0, err
-			}
+		if int64(req.hdr.Len)+int64(len(req.payload)) <= dst.NumBytes() {
+			break
+		}
 
-			// We're done with this request.
-			fd.queue.Remove(req)
+		// The request is too large. Cannot process it. All requests must be smaller than the
+		// negotiated size as specified by Connection.MaxWrite set as part of the FUSE_INIT
+		// handshake.
+		errno := -int32(syscall.EIO)
+		if req.hdr.Opcode == linux.FUSE_SETXATTR {
+			errno = -int32(syscall.E2BIG)
+		}
 
-			// Restart the read as this request was invalid.
-			log.Warningf("fuse.DeviceFD.Read: request found was too large. Restarting read.")
-			return fd.readLocked(ctx, dst, opts)
+		// Return the error to the calling task.
+		if err := fd.sendError(ctx, errno, req.hdr.Unique); err != nil {
+			return 0, err
 		}
 
-		n, err := dst.CopyOut(ctx, req.data[readCursor:])
+		// We're done with this request.
+		fd.queue.Remove(req)
+		req = nil
+	}
+
+	if req == nil {
+		return 0, syserror.ErrWouldBlock
+	}
+
+	// We already checked the size: dst must be able to fit the whole request.
+	// Now we write the marshalled header, the payload,
+	// and the potential additional payload
+	// to the user memory IOSequence.
+
+	n, err := dst.CopyOut(ctx, req.data)
+	if err != nil {
+		return 0, err
+	}
+	if n != len(req.data) {
+		return 0, syserror.EIO
+	}
+
+	if req.hdr.Opcode == linux.FUSE_WRITE {
+		written, err := dst.DropFirst(n).CopyOut(ctx, req.payload)
 		if err != nil {
 			return 0, err
 		}
-		readCursor += uint32(n)
-		bytesRead += int64(n)
-
-		if readCursor >= req.hdr.Len {
-			// Fully done with this req, remove it from the queue.
-			fd.queue.Remove(req)
-			break
+		if written != len(req.payload) {
+			return 0, syserror.EIO
 		}
+		n += int(written)
 	}
 
-	return bytesRead, nil
+	// Fully done with this req, remove it from the queue.
+	fd.queue.Remove(req)
+
+	// Remove noReply ones from map of requests expecting a reply.
+	if req.noReply {
+		fd.numActiveRequests -= 1
+		delete(fd.completions, req.hdr.Unique)
+	}
+
+	return int64(n), nil
 }
 
 // PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (fd *DeviceFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
@@ -211,10 +256,15 @@ func (fd *DeviceFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.
 // writeLocked implements writing to the fuse device while locked with DeviceFD.mu.
 func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
+	// Return ENODEV if the filesystem is umounted.
+	if fd.fs.umounted {
+		return 0, syserror.ENODEV
+	}
+
 	var cn, n int64
 	hdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
 
@@ -276,7 +326,8 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
 
 			fut, ok := fd.completions[hdr.Unique]
 			if !ok {
-				// Server sent us a response for a request we never sent?
+				// Server sent us a response for a request we never sent,
+				// or for which we already received a reply (e.g. aborted), an unlikely event.
 				return 0, syserror.EINVAL
 			}
 
@@ -307,8 +358,23 @@ func (fd *DeviceFD) writeLocked(ctx context.Context, src usermem.IOSequence, opt
 
 // Readiness implements vfs.FileDescriptionImpl.Readiness.
 func (fd *DeviceFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	return fd.readinessLocked(mask)
+}
+
+// readinessLocked implements checking the readiness of the fuse device while
+// locked with DeviceFD.mu.
+func (fd *DeviceFD) readinessLocked(mask waiter.EventMask) waiter.EventMask {
 	var ready waiter.EventMask
-	ready |= waiter.EventOut // FD is always writable
+
+	if fd.fs.umounted {
+		ready |= waiter.EventErr
+		return ready & mask
+	}
+
+	// FD is always writable.
+	ready |= waiter.EventOut
 	if !fd.queue.Empty() {
 		// Have reqs available, FD is readable.
 		ready |= waiter.EventIn
@@ -330,7 +396,7 @@ func (fd *DeviceFD) EventUnregister(e *waiter.Entry) {
 // Seek implements vfs.FileDescriptionImpl.Seek.
 func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
 	// Operations on /dev/fuse don't make sense until a FUSE filesystem is mounted.
-	if !fd.mounted {
+	if fd.fs == nil {
 		return 0, syserror.EPERM
 	}
 
@@ -338,59 +404,59 @@ func (fd *DeviceFD) Seek(ctx context.Context, offset int64, whence int32) (int64
 }
 
 // sendResponse sends a response to the waiting task (if any).
+//
+// Preconditions: fd.mu must be held.
 func (fd *DeviceFD) sendResponse(ctx context.Context, fut *futureResponse) error {
-	// See if the running task need to perform some action before returning.
-	// Since we just finished writing the future, we can be sure that
-	// getResponse generates a populated response.
-	if err := fd.noReceiverAction(ctx, fut.getResponse()); err != nil {
-		return err
-	}
+	// Signal the task waiting on a response if any.
+	defer close(fut.ch)
 
 	// Signal that the queue is no longer full.
 	select {
 	case fd.fullQueueCh <- struct{}{}:
 	default:
 	}
-	fd.numActiveRequests -= 1
+	fd.numActiveRequests--
+
+	if fut.async {
+		return fd.asyncCallBack(ctx, fut.getResponse())
+	}
 
-	// Signal the task waiting on a response.
-	close(fut.ch)
 	return nil
 }
 
-// sendError sends an error response to the waiting task (if any).
-func (fd *DeviceFD) sendError(ctx context.Context, errno int32, req *Request) error {
+// sendError sends an error response to the waiting task (if any) by calling sendResponse().
+//
+// Preconditions: fd.mu must be held.
+func (fd *DeviceFD) sendError(ctx context.Context, errno int32, unique linux.FUSEOpID) error {
 	// Return the error to the calling task.
 	outHdrLen := uint32((*linux.FUSEHeaderOut)(nil).SizeBytes())
 	respHdr := linux.FUSEHeaderOut{
 		Len:    outHdrLen,
 		Error:  errno,
-		Unique: req.hdr.Unique,
+		Unique: unique,
 	}
 
 	fut, ok := fd.completions[respHdr.Unique]
 	if !ok {
-		// Server sent us a response for a request we never sent?
+		// A response for a request we never sent,
+		// or for which we already received a reply (e.g. aborted).
 		return syserror.EINVAL
 	}
 	delete(fd.completions, respHdr.Unique)
 
 	fut.hdr = &respHdr
-	if err := fd.sendResponse(ctx, fut); err != nil {
-		return err
-	}
-
-	return nil
+	return fd.sendResponse(ctx, fut)
 }
 
-// noReceiverAction has the calling kernel.Task do some action if its known that no
-// receiver is going to be waiting on the future channel. This is to be used by:
-// FUSE_INIT.
-func (fd *DeviceFD) noReceiverAction(ctx context.Context, r *Response) error {
-	if r.opcode == linux.FUSE_INIT {
+// asyncCallBack executes pre-defined callback function for async requests.
+// Currently used by: FUSE_INIT.
+func (fd *DeviceFD) asyncCallBack(ctx context.Context, r *Response) error {
+	switch r.opcode {
+	case linux.FUSE_INIT:
 		creds := auth.CredentialsFromContext(ctx)
 		rootUserNs := kernel.KernelFromContext(ctx).RootUserNamespace()
 		return fd.fs.conn.InitRecv(r, creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, rootUserNs))
+		// TODO(gvisor.dev/issue/3247): support async read: correctly process the response.
 	}
 
 	return nil
diff --git a/pkg/sentry/fsimpl/fuse/dev_test.go b/pkg/sentry/fsimpl/fuse/dev_test.go
index 1ffe7ccd2..5986133e9 100644
--- a/pkg/sentry/fsimpl/fuse/dev_test.go
+++ b/pkg/sentry/fsimpl/fuse/dev_test.go
@@ -16,7 +16,6 @@ package fuse
 
 import (
 	"fmt"
-	"io"
 	"math/rand"
 	"testing"
 
@@ -28,17 +27,12 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
-	"gvisor.dev/gvisor/tools/go_marshal/marshal"
 )
 
 // echoTestOpcode is the Opcode used during testing. The server used in tests
 // will simply echo the payload back with the appropriate headers.
 const echoTestOpcode linux.FUSEOpcode = 1000
 
-type testPayload struct {
-	data uint32
-}
-
 // TestFUSECommunication tests that the communication layer between the Sentry and the
 // FUSE server daemon works as expected.
 func TestFUSECommunication(t *testing.T) {
@@ -327,102 +321,3 @@ func fuseServerRun(t *testing.T, s *testutil.System, k *kernel.Kernel, fd *vfs.F
 		}
 	}
 }
-
-func setup(t *testing.T) *testutil.System {
-	k, err := testutil.Boot()
-	if err != nil {
-		t.Fatalf("Error creating kernel: %v", err)
-	}
-
-	ctx := k.SupervisorContext()
-	creds := auth.CredentialsFromContext(ctx)
-
-	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-		AllowUserList:  true,
-		AllowUserMount: true,
-	})
-
-	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
-	if err != nil {
-		t.Fatalf("NewMountNamespace(): %v", err)
-	}
-
-	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
-}
-
-// newTestConnection creates a fuse connection that the sentry can communicate with
-// and the FD for the server to communicate with.
-func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
-	vfsObj := &vfs.VirtualFilesystem{}
-	fuseDev := &DeviceFD{}
-
-	if err := vfsObj.Init(system.Ctx); err != nil {
-		return nil, nil, err
-	}
-
-	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
-	defer vd.DecRef(system.Ctx)
-	if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
-		return nil, nil, err
-	}
-
-	fsopts := filesystemOptions{
-		maxActiveRequests: maxActiveRequests,
-	}
-	fs, err := NewFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
-	if err != nil {
-		return nil, nil, err
-	}
-
-	return fs.conn, &fuseDev.vfsfd, nil
-}
-
-// SizeBytes implements marshal.Marshallable.SizeBytes.
-func (t *testPayload) SizeBytes() int {
-	return 4
-}
-
-// MarshalBytes implements marshal.Marshallable.MarshalBytes.
-func (t *testPayload) MarshalBytes(dst []byte) {
-	usermem.ByteOrder.PutUint32(dst[:4], t.data)
-}
-
-// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
-func (t *testPayload) UnmarshalBytes(src []byte) {
-	*t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
-}
-
-// Packed implements marshal.Marshallable.Packed.
-func (t *testPayload) Packed() bool {
-	return true
-}
-
-// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
-func (t *testPayload) MarshalUnsafe(dst []byte) {
-	t.MarshalBytes(dst)
-}
-
-// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
-func (t *testPayload) UnmarshalUnsafe(src []byte) {
-	t.UnmarshalBytes(src)
-}
-
-// CopyOutN implements marshal.Marshallable.CopyOutN.
-func (t *testPayload) CopyOutN(task marshal.Task, addr usermem.Addr, limit int) (int, error) {
-	panic("not implemented")
-}
-
-// CopyOut implements marshal.Marshallable.CopyOut.
-func (t *testPayload) CopyOut(task marshal.Task, addr usermem.Addr) (int, error) {
-	panic("not implemented")
-}
-
-// CopyIn implements marshal.Marshallable.CopyIn.
-func (t *testPayload) CopyIn(task marshal.Task, addr usermem.Addr) (int, error) {
-	panic("not implemented")
-}
-
-// WriteTo implements io.WriterTo.WriteTo.
-func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
-	panic("not implemented")
-}
diff --git a/pkg/sentry/fsimpl/fuse/directory.go b/pkg/sentry/fsimpl/fuse/directory.go
new file mode 100644
index 000000000..8f220a04b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/directory.go
@@ -0,0 +1,105 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type directoryFD struct {
+	fileDescription
+}
+
+// Allocate implements directoryFD.Allocate.
+func (*directoryFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	return syserror.EISDIR
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (*directoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (*directoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (*directoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (*directoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EISDIR
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (dir *directoryFD) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback) error {
+	fusefs := dir.inode().fs
+	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSEReadIn{
+		Fh:     dir.Fh,
+		Offset: uint64(atomic.LoadInt64(&dir.off)),
+		Size:   linux.FUSE_PAGE_SIZE,
+		Flags:  dir.statusFlags(),
+	}
+
+	// TODO(gVisor.dev/issue/3404): Support FUSE_READDIRPLUS.
+	req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), dir.inode().nodeID, linux.FUSE_READDIR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := fusefs.conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	if err := res.Error(); err != nil {
+		return err
+	}
+
+	var out linux.FUSEDirents
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return err
+	}
+
+	for _, fuseDirent := range out.Dirents {
+		nextOff := int64(fuseDirent.Meta.Off)
+		dirent := vfs.Dirent{
+			Name:    fuseDirent.Name,
+			Type:    uint8(fuseDirent.Meta.Type),
+			Ino:     fuseDirent.Meta.Ino,
+			NextOff: nextOff,
+		}
+
+		if err := callback.Handle(dirent); err != nil {
+			return err
+		}
+		atomic.StoreInt64(&dir.off, nextOff)
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/file.go b/pkg/sentry/fsimpl/fuse/file.go
new file mode 100644
index 000000000..83f2816b7
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/file.go
@@ -0,0 +1,133 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fileDescription implements vfs.FileDescriptionImpl for fuse.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+	vfs.DentryMetadataFileDescriptionImpl
+	vfs.NoLockFD
+
+	// the file handle used in userspace.
+	Fh uint64
+
+	// Nonseekable is indicate cannot perform seek on a file.
+	Nonseekable bool
+
+	// DirectIO suggest fuse to use direct io operation.
+	DirectIO bool
+
+	// OpenFlag is the flag returned by open.
+	OpenFlag uint32
+
+	// off is the file offset.
+	off int64
+}
+
+func (fd *fileDescription) dentry() *kernfs.Dentry {
+	return fd.vfsfd.Dentry().Impl().(*kernfs.Dentry)
+}
+
+func (fd *fileDescription) inode() *inode {
+	return fd.dentry().Inode().(*inode)
+}
+
+func (fd *fileDescription) filesystem() *vfs.Filesystem {
+	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
+}
+
+func (fd *fileDescription) statusFlags() uint32 {
+	return fd.vfsfd.StatusFlags()
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *fileDescription) Release(ctx context.Context) {
+	// no need to release if FUSE server doesn't implement Open.
+	conn := fd.inode().fs.conn
+	if conn.noOpen {
+		return
+	}
+
+	in := linux.FUSEReleaseIn{
+		Fh:    fd.Fh,
+		Flags: fd.statusFlags(),
+	}
+	// TODO(gvisor.dev/issue/3245): add logic when we support file lock owner.
+	var opcode linux.FUSEOpcode
+	if fd.inode().Mode().IsDir() {
+		opcode = linux.FUSE_RELEASEDIR
+	} else {
+		opcode = linux.FUSE_RELEASE
+	}
+	kernelTask := kernel.TaskFromContext(ctx)
+	// ignoring errors and FUSE server reply is analogous to Linux's behavior.
+	req, err := conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), fd.inode().nodeID, opcode, &in)
+	if err != nil {
+		// No way to invoke Call() with an errored request.
+		return
+	}
+	// The reply will be ignored since no callback is defined in asyncCallBack().
+	conn.CallAsync(kernelTask, req)
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	return 0, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	return 0, nil
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, nil
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return 0, nil
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	fs := fd.filesystem()
+	inode := fd.inode()
+	return inode.Stat(ctx, fs, opts)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	fs := fd.filesystem()
+	creds := auth.CredentialsFromContext(ctx)
+	return fd.inode().setAttr(ctx, fs, creds, opts, true, fd.Fh)
+}
diff --git a/pkg/sentry/fsimpl/fuse/fusefs.go b/pkg/sentry/fsimpl/fuse/fusefs.go
index 83c24ec25..e7ef5998e 100644
--- a/pkg/sentry/fsimpl/fuse/fusefs.go
+++ b/pkg/sentry/fsimpl/fuse/fusefs.go
@@ -16,24 +16,36 @@
 package fuse
 
 import (
+	"math"
 	"strconv"
+	"sync"
+	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/marshal"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/waiter"
 )
 
 // Name is the default filesystem name.
 const Name = "fuse"
 
+// maxActiveRequestsDefault is the default setting controlling the upper bound
+// on the number of active requests at any given time.
+const maxActiveRequestsDefault = 10000
+
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
+// +stateify savable
 type filesystemOptions struct {
 	// userID specifies the numeric uid of the mount owner.
 	// This option should not be specified by the filesystem owner.
@@ -56,9 +68,16 @@ type filesystemOptions struct {
 	// exist at any time. Any further requests will block when trying to
 	// Call the server.
 	maxActiveRequests uint64
+
+	// maxRead is the max number of bytes to read,
+	// specified as "max_read" in fs parameters.
+	// If not specified by user, use math.MaxUint32 as default value.
+	maxRead uint32
 }
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 	devMinor uint32
@@ -69,6 +88,9 @@ type filesystem struct {
 
 	// opts is the options the fusefs is initialized with.
 	opts *filesystemOptions
+
+	// umounted is true if filesystem.Release() has been called.
+	umounted bool
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -76,6 +98,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
@@ -142,14 +167,29 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	// Set the maxInFlightRequests option.
 	fsopts.maxActiveRequests = maxActiveRequestsDefault
 
+	if maxReadStr, ok := mopts["max_read"]; ok {
+		delete(mopts, "max_read")
+		maxRead, err := strconv.ParseUint(maxReadStr, 10, 32)
+		if err != nil {
+			log.Warningf("%s.GetFilesystem: invalid max_read: max_read=%s", fsType.Name(), maxReadStr)
+			return nil, nil, syserror.EINVAL
+		}
+		if maxRead < fuseMinMaxRead {
+			maxRead = fuseMinMaxRead
+		}
+		fsopts.maxRead = uint32(maxRead)
+	} else {
+		fsopts.maxRead = math.MaxUint32
+	}
+
 	// Check for unparsed options.
 	if len(mopts) != 0 {
-		log.Warningf("%s.GetFilesystem: unknown options: %v", fsType.Name(), mopts)
+		log.Warningf("%s.GetFilesystem: unsupported or unknown options: %v", fsType.Name(), mopts)
 		return nil, nil, syserror.EINVAL
 	}
 
 	// Create a new FUSE filesystem.
-	fs, err := NewFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
+	fs, err := newFUSEFilesystem(ctx, devMinor, &fsopts, fuseFd)
 	if err != nil {
 		log.Warningf("%s.NewFUSEFilesystem: failed with error: %v", fsType.Name(), err)
 		return nil, nil, err
@@ -165,26 +205,28 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	}
 
 	// root is the fusefs root directory.
-	root := fs.newInode(creds, fsopts.rootMode)
+	root := fs.newRootInode(ctx, creds, fsopts.rootMode)
 
 	return fs.VFSFilesystem(), root.VFSDentry(), nil
 }
 
-// NewFUSEFilesystem creates a new FUSE filesystem.
-func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
-	fs := &filesystem{
-		devMinor: devMinor,
-		opts:     opts,
-	}
-
-	conn, err := newFUSEConnection(ctx, device, opts.maxActiveRequests)
+// newFUSEFilesystem creates a new FUSE filesystem.
+func newFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOptions, device *vfs.FileDescription) (*filesystem, error) {
+	conn, err := newFUSEConnection(ctx, device, opts)
 	if err != nil {
 		log.Warningf("fuse.NewFUSEFilesystem: NewFUSEConnection failed with error: %v", err)
 		return nil, syserror.EINVAL
 	}
 
-	fs.conn = conn
 	fuseFD := device.Impl().(*DeviceFD)
+
+	fs := &filesystem{
+		devMinor: devMinor,
+		opts:     opts,
+		conn:     conn,
+	}
+
+	fs.VFSFilesystem().IncRef()
 	fuseFD.fs = fs
 
 	return fs, nil
@@ -192,39 +234,361 @@ func NewFUSEFilesystem(ctx context.Context, devMinor uint32, opts *filesystemOpt
 
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release(ctx context.Context) {
+	fs.conn.fd.mu.Lock()
+
+	fs.umounted = true
+	fs.conn.Abort(ctx)
+	// Notify all the waiters on this fd.
+	fs.conn.fd.waitQueue.Notify(waiter.EventIn)
+
+	fs.conn.fd.mu.Unlock()
+
 	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 	fs.Filesystem.Release(ctx)
 }
 
 // inode implements kernfs.Inode.
+//
+// +stateify savable
 type inode struct {
+	inodeRefs
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
-	kernfs.InodeNoDynamicLookup
-	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
 	kernfs.OrderedChildren
 
+	// the owning filesystem. fs is immutable.
+	fs *filesystem
+
+	// metaDataMu protects the metadata of this inode.
+	metadataMu sync.Mutex
+
+	nodeID uint64
+
 	locks vfs.FileLocks
 
-	dentry kernfs.Dentry
+	// size of the file.
+	size uint64
+
+	// attributeVersion is the version of inode's attributes.
+	attributeVersion uint64
+
+	// attributeTime is the remaining vaild time of attributes.
+	attributeTime uint64
+
+	// version of the inode.
+	version uint64
+
+	// link is result of following a symbolic link.
+	link string
 }
 
-func (fs *filesystem) newInode(creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
-	i := &inode{}
-	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+func (fs *filesystem) newRootInode(ctx context.Context, creds *auth.Credentials, mode linux.FileMode) *kernfs.Dentry {
+	i := &inode{fs: fs, nodeID: 1}
+	i.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, 1, linux.ModeDirectory|0755)
 	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	i.dentry.Init(i)
+	i.EnableLeakCheck()
 
-	return &i.dentry
+	var d kernfs.Dentry
+	d.Init(&fs.Filesystem, i)
+	return &d
+}
+
+func (fs *filesystem) newInode(ctx context.Context, nodeID uint64, attr linux.FUSEAttr) kernfs.Inode {
+	i := &inode{fs: fs, nodeID: nodeID}
+	creds := auth.Credentials{EffectiveKGID: auth.KGID(attr.UID), EffectiveKUID: auth.KUID(attr.UID)}
+	i.InodeAttrs.Init(ctx, &creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.FileMode(attr.Mode))
+	atomic.StoreUint64(&i.size, attr.Size)
+	i.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	i.EnableLeakCheck()
+	return i
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	isDir := i.InodeAttrs.Mode().IsDir()
+	// return error if specified to open directory but inode is not a directory.
+	if !isDir && opts.Mode.IsDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if opts.Flags&linux.O_LARGEFILE == 0 && atomic.LoadUint64(&i.size) > linux.MAX_NON_LFS {
+		return nil, syserror.EOVERFLOW
+	}
+
+	var fd *fileDescription
+	var fdImpl vfs.FileDescriptionImpl
+	if isDir {
+		directoryFD := &directoryFD{}
+		fd = &(directoryFD.fileDescription)
+		fdImpl = directoryFD
+	} else {
+		regularFD := &regularFileFD{}
+		fd = &(regularFD.fileDescription)
+		fdImpl = regularFD
+	}
+	// FOPEN_KEEP_CACHE is the defualt flag for noOpen.
+	fd.OpenFlag = linux.FOPEN_KEEP_CACHE
+
+	// Only send open request when FUSE server support open or is opening a directory.
+	if !i.fs.conn.noOpen || isDir {
+		kernelTask := kernel.TaskFromContext(ctx)
+		if kernelTask == nil {
+			log.Warningf("fusefs.Inode.Open: couldn't get kernel task from context")
+			return nil, syserror.EINVAL
+		}
+
+		// Build the request.
+		var opcode linux.FUSEOpcode
+		if isDir {
+			opcode = linux.FUSE_OPENDIR
+		} else {
+			opcode = linux.FUSE_OPEN
+		}
+
+		in := linux.FUSEOpenIn{Flags: opts.Flags & ^uint32(linux.O_CREAT|linux.O_EXCL|linux.O_NOCTTY)}
+		if !i.fs.conn.atomicOTrunc {
+			in.Flags &= ^uint32(linux.O_TRUNC)
+		}
+
+		req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, &in)
+		if err != nil {
+			return nil, err
+		}
+
+		// Send the request and receive the reply.
+		res, err := i.fs.conn.Call(kernelTask, req)
+		if err != nil {
+			return nil, err
+		}
+		if err := res.Error(); err == syserror.ENOSYS && !isDir {
+			i.fs.conn.noOpen = true
+		} else if err != nil {
+			return nil, err
+		} else {
+			out := linux.FUSEOpenOut{}
+			if err := res.UnmarshalPayload(&out); err != nil {
+				return nil, err
+			}
+
+			// Process the reply.
+			fd.OpenFlag = out.OpenFlag
+			if isDir {
+				fd.OpenFlag &= ^uint32(linux.FOPEN_DIRECT_IO)
+			}
+
+			fd.Fh = out.Fh
+		}
+	}
+
+	// TODO(gvisor.dev/issue/3234): invalidate mmap after implemented it for FUSE Inode
+	fd.DirectIO = fd.OpenFlag&linux.FOPEN_DIRECT_IO != 0
+	fdOptions := &vfs.FileDescriptionOptions{}
+	if fd.OpenFlag&linux.FOPEN_NONSEEKABLE != 0 {
+		fdOptions.DenyPRead = true
+		fdOptions.DenyPWrite = true
+		fd.Nonseekable = true
+	}
+
+	// If we don't send SETATTR before open (which is indicated by atomicOTrunc)
+	// and O_TRUNC is set, update the inode's version number and clean existing data
+	// by setting the file size to 0.
+	if i.fs.conn.atomicOTrunc && opts.Flags&linux.O_TRUNC != 0 {
+		i.fs.conn.mu.Lock()
+		i.fs.conn.attributeVersion++
+		i.attributeVersion = i.fs.conn.attributeVersion
+		atomic.StoreUint64(&i.size, 0)
+		i.fs.conn.mu.Unlock()
+		i.attributeTime = 0
+	}
+
+	if err := fd.vfsfd.Init(fdImpl, opts.Flags, rp.Mount(), d.VFSDentry(), fdOptions); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// Lookup implements kernfs.Inode.Lookup.
+func (i *inode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	in := linux.FUSELookupIn{Name: name}
+	return i.newEntry(ctx, name, 0, linux.FUSE_LOOKUP, &in)
+}
+
+// Keep implements kernfs.Inode.Keep.
+func (i *inode) Keep() bool {
+	// Return true so that kernfs keeps the new dentry pointing to this
+	// inode in the dentry tree. This is needed because inodes created via
+	// Lookup are not temporary. They might refer to existing files on server
+	// that can be Unlink'd/Rmdir'd.
+	return true
+}
+
+// IterDirents implements kernfs.Inode.IterDirents.
+func (*inode) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return offset, nil
+}
+
+// NewFile implements kernfs.Inode.NewFile.
+func (i *inode) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.NewFile: couldn't get kernel task from context", i.nodeID)
+		return nil, syserror.EINVAL
+	}
+	in := linux.FUSECreateIn{
+		CreateMeta: linux.FUSECreateMeta{
+			Flags: opts.Flags,
+			Mode:  uint32(opts.Mode) | linux.S_IFREG,
+			Umask: uint32(kernelTask.FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, linux.S_IFREG, linux.FUSE_CREATE, &in)
+}
+
+// NewNode implements kernfs.Inode.NewNode.
+func (i *inode) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (kernfs.Inode, error) {
+	in := linux.FUSEMknodIn{
+		MknodMeta: linux.FUSEMknodMeta{
+			Mode:  uint32(opts.Mode),
+			Rdev:  linux.MakeDeviceID(uint16(opts.DevMajor), opts.DevMinor),
+			Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, opts.Mode.FileType(), linux.FUSE_MKNOD, &in)
+}
+
+// NewSymlink implements kernfs.Inode.NewSymlink.
+func (i *inode) NewSymlink(ctx context.Context, name, target string) (kernfs.Inode, error) {
+	in := linux.FUSESymLinkIn{
+		Name:   name,
+		Target: target,
+	}
+	return i.newEntry(ctx, name, linux.S_IFLNK, linux.FUSE_SYMLINK, &in)
+}
+
+// Unlink implements kernfs.Inode.Unlink.
+func (i *inode) Unlink(ctx context.Context, name string, child kernfs.Inode) error {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+		return syserror.EINVAL
+	}
+	in := linux.FUSEUnlinkIn{Name: name}
+	req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_UNLINK, &in)
 	if err != nil {
+		return err
+	}
+	res, err := i.fs.conn.Call(kernelTask, req)
+	if err != nil {
+		return err
+	}
+	// only return error, discard res.
+	return res.Error()
+}
+
+// NewDir implements kernfs.Inode.NewDir.
+func (i *inode) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
+	in := linux.FUSEMkdirIn{
+		MkdirMeta: linux.FUSEMkdirMeta{
+			Mode:  uint32(opts.Mode),
+			Umask: uint32(kernel.TaskFromContext(ctx).FSContext().Umask()),
+		},
+		Name: name,
+	}
+	return i.newEntry(ctx, name, linux.S_IFDIR, linux.FUSE_MKDIR, &in)
+}
+
+// RmDir implements kernfs.Inode.RmDir.
+func (i *inode) RmDir(ctx context.Context, name string, child kernfs.Inode) error {
+	fusefs := i.fs
+	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSERmDirIn{Name: name}
+	req, err := fusefs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_RMDIR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := i.fs.conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	return res.Error()
+}
+
+// newEntry calls FUSE server for entry creation and allocates corresponding entry according to response.
+// Shared by FUSE_MKNOD, FUSE_MKDIR, FUSE_SYMLINK, FUSE_LINK and FUSE_LOOKUP.
+func (i *inode) newEntry(ctx context.Context, name string, fileType linux.FileMode, opcode linux.FUSEOpcode, payload marshal.Marshallable) (kernfs.Inode, error) {
+	kernelTask := kernel.TaskFromContext(ctx)
+	if kernelTask == nil {
+		log.Warningf("fusefs.Inode.newEntry: couldn't get kernel task from context", i.nodeID)
+		return nil, syserror.EINVAL
+	}
+	req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, opcode, payload)
+	if err != nil {
+		return nil, err
+	}
+	res, err := i.fs.conn.Call(kernelTask, req)
+	if err != nil {
+		return nil, err
+	}
+	if err := res.Error(); err != nil {
+		return nil, err
+	}
+	out := linux.FUSEEntryOut{}
+	if err := res.UnmarshalPayload(&out); err != nil {
 		return nil, err
 	}
-	return fd.VFSFileDescription(), nil
+	if opcode != linux.FUSE_LOOKUP && ((out.Attr.Mode&linux.S_IFMT)^uint32(fileType) != 0 || out.NodeID == 0 || out.NodeID == linux.FUSE_ROOT_ID) {
+		return nil, syserror.EIO
+	}
+	child := i.fs.newInode(ctx, out.NodeID, out.Attr)
+	return child, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (i *inode) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	path, err := i.Readlink(ctx, mnt)
+	return vfs.VirtualDentry{}, path, err
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (i *inode) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
+	if i.Mode().FileType()&linux.S_IFLNK == 0 {
+		return "", syserror.EINVAL
+	}
+	if len(i.link) == 0 {
+		kernelTask := kernel.TaskFromContext(ctx)
+		if kernelTask == nil {
+			log.Warningf("fusefs.Inode.Readlink: couldn't get kernel task from context")
+			return "", syserror.EINVAL
+		}
+		req, err := i.fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(kernelTask.ThreadID()), i.nodeID, linux.FUSE_READLINK, &linux.FUSEEmptyIn{})
+		if err != nil {
+			return "", err
+		}
+		res, err := i.fs.conn.Call(kernelTask, req)
+		if err != nil {
+			return "", err
+		}
+		i.link = string(res.data[res.hdr.SizeBytes():])
+		if !mnt.Options().ReadOnly {
+			i.attributeTime = 0
+		}
+	}
+	return i.link, nil
+}
+
+// getFUSEAttr returns a linux.FUSEAttr of this inode stored in local cache.
+// TODO(gvisor.dev/issue/3679): Add support for other fields.
+func (i *inode) getFUSEAttr() linux.FUSEAttr {
+	return linux.FUSEAttr{
+		Ino:  i.Ino(),
+		Size: atomic.LoadUint64(&i.size),
+		Mode: uint32(i.Mode()),
+	}
 }
 
 // statFromFUSEAttr makes attributes from linux.FUSEAttr to linux.Statx. The
@@ -280,45 +644,179 @@ func statFromFUSEAttr(attr linux.FUSEAttr, mask, devMinor uint32) linux.Statx {
 	return stat
 }
 
-// Stat implements kernfs.Inode.Stat.
-func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
-	fusefs := fs.Impl().(*filesystem)
-	conn := fusefs.conn
-	task, creds := kernel.TaskFromContext(ctx), auth.CredentialsFromContext(ctx)
+// getAttr gets the attribute of this inode by issuing a FUSE_GETATTR request
+// or read from local cache. It updates the corresponding attributes if
+// necessary.
+func (i *inode) getAttr(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions, flags uint32, fh uint64) (linux.FUSEAttr, error) {
+	attributeVersion := atomic.LoadUint64(&i.fs.conn.attributeVersion)
+
+	// TODO(gvisor.dev/issue/3679): send the request only if
+	// - invalid local cache for fields specified in the opts.Mask
+	// - forced update
+	// - i.attributeTime expired
+	// If local cache is still valid, return local cache.
+	// Currently we always send a request,
+	// and we always set the metadata with the new result,
+	// unless attributeVersion has changed.
+
+	task := kernel.TaskFromContext(ctx)
 	if task == nil {
 		log.Warningf("couldn't get kernel task from context")
-		return linux.Statx{}, syserror.EINVAL
+		return linux.FUSEAttr{}, syserror.EINVAL
 	}
 
-	var in linux.FUSEGetAttrIn
-	// We don't set any attribute in the request, because in VFS2 fstat(2) will
-	// finally be translated into vfs.FilesystemImpl.StatAt() (see
-	// pkg/sentry/syscalls/linux/vfs2/stat.go), resulting in the same flow
-	// as stat(2). Thus GetAttrFlags and Fh variable will never be used in VFS2.
-	req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.Ino(), linux.FUSE_GETATTR, &in)
+	creds := auth.CredentialsFromContext(ctx)
+
+	in := linux.FUSEGetAttrIn{
+		GetAttrFlags: flags,
+		Fh:           fh,
+	}
+	req, err := i.fs.conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_GETATTR, &in)
 	if err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
 	}
 
-	res, err := conn.Call(task, req)
+	res, err := i.fs.conn.Call(task, req)
 	if err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
 	}
 	if err := res.Error(); err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
 	}
 
 	var out linux.FUSEGetAttrOut
 	if err := res.UnmarshalPayload(&out); err != nil {
-		return linux.Statx{}, err
+		return linux.FUSEAttr{}, err
+	}
+
+	// Local version is newer, return the local one.
+	// Skip the update.
+	if attributeVersion != 0 && atomic.LoadUint64(&i.attributeVersion) > attributeVersion {
+		return i.getFUSEAttr(), nil
 	}
 
-	// Set all metadata into kernfs.InodeAttrs.
-	if err := i.SetStat(ctx, fs, creds, vfs.SetStatOptions{
-		Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, fusefs.devMinor),
+	// Set the metadata of kernfs.InodeAttrs.
+	if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+		Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
 	}); err != nil {
+		return linux.FUSEAttr{}, err
+	}
+
+	// Set the size if no error (after SetStat() check).
+	atomic.StoreUint64(&i.size, out.Attr.Size)
+
+	return out.Attr, nil
+}
+
+// reviseAttr attempts to update the attributes for internal purposes
+// by calling getAttr with a pre-specified mask.
+// Used by read, write, lseek.
+func (i *inode) reviseAttr(ctx context.Context, flags uint32, fh uint64) error {
+	// Never need atime for internal purposes.
+	_, err := i.getAttr(ctx, i.fs.VFSFilesystem(), vfs.StatOptions{
+		Mask: linux.STATX_BASIC_STATS &^ linux.STATX_ATIME,
+	}, flags, fh)
+	return err
+}
+
+// Stat implements kernfs.Inode.Stat.
+func (i *inode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
+	attr, err := i.getAttr(ctx, fs, opts, 0, 0)
+	if err != nil {
 		return linux.Statx{}, err
 	}
 
-	return statFromFUSEAttr(out.Attr, opts.Mask, fusefs.devMinor), nil
+	return statFromFUSEAttr(attr, opts.Mask, i.fs.devMinor), nil
+}
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *inode) DecRef(ctx context.Context) {
+	i.inodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	// TODO(gvisor.dev/issues/3413): Complete the implementation of statfs.
+	return vfs.GenericStatFS(linux.FUSE_SUPER_MAGIC), nil
+}
+
+// fattrMaskFromStats converts vfs.SetStatOptions.Stat.Mask to linux stats mask
+// aligned with the attribute mask defined in include/linux/fs.h.
+func fattrMaskFromStats(mask uint32) uint32 {
+	var fuseAttrMask uint32
+	maskMap := map[uint32]uint32{
+		linux.STATX_MODE:  linux.FATTR_MODE,
+		linux.STATX_UID:   linux.FATTR_UID,
+		linux.STATX_GID:   linux.FATTR_GID,
+		linux.STATX_SIZE:  linux.FATTR_SIZE,
+		linux.STATX_ATIME: linux.FATTR_ATIME,
+		linux.STATX_MTIME: linux.FATTR_MTIME,
+		linux.STATX_CTIME: linux.FATTR_CTIME,
+	}
+	for statxMask, fattrMask := range maskMap {
+		if mask&statxMask != 0 {
+			fuseAttrMask |= fattrMask
+		}
+	}
+	return fuseAttrMask
+}
+
+// SetStat implements kernfs.Inode.SetStat.
+func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
+	return i.setAttr(ctx, fs, creds, opts, false, 0)
+}
+
+func (i *inode) setAttr(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions, useFh bool, fh uint64) error {
+	conn := i.fs.conn
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		log.Warningf("couldn't get kernel task from context")
+		return syserror.EINVAL
+	}
+
+	// We should retain the original file type when assigning new mode.
+	fileType := uint16(i.Mode()) & linux.S_IFMT
+	fattrMask := fattrMaskFromStats(opts.Stat.Mask)
+	if useFh {
+		fattrMask |= linux.FATTR_FH
+	}
+	in := linux.FUSESetAttrIn{
+		Valid:     fattrMask,
+		Fh:        fh,
+		Size:      opts.Stat.Size,
+		Atime:     uint64(opts.Stat.Atime.Sec),
+		Mtime:     uint64(opts.Stat.Mtime.Sec),
+		Ctime:     uint64(opts.Stat.Ctime.Sec),
+		AtimeNsec: opts.Stat.Atime.Nsec,
+		MtimeNsec: opts.Stat.Mtime.Nsec,
+		CtimeNsec: opts.Stat.Ctime.Nsec,
+		Mode:      uint32(fileType | opts.Stat.Mode),
+		UID:       opts.Stat.UID,
+		GID:       opts.Stat.GID,
+	}
+	req, err := conn.NewRequest(creds, uint32(task.ThreadID()), i.nodeID, linux.FUSE_SETATTR, &in)
+	if err != nil {
+		return err
+	}
+
+	res, err := conn.Call(task, req)
+	if err != nil {
+		return err
+	}
+	if err := res.Error(); err != nil {
+		return err
+	}
+	out := linux.FUSEGetAttrOut{}
+	if err := res.UnmarshalPayload(&out); err != nil {
+		return err
+	}
+
+	// Set the metadata of kernfs.InodeAttrs.
+	if err := i.InodeAttrs.SetStat(ctx, fs, creds, vfs.SetStatOptions{
+		Stat: statFromFUSEAttr(out.Attr, linux.STATX_ALL, i.fs.devMinor),
+	}); err != nil {
+		return err
+	}
+
+	return nil
 }
diff --git a/pkg/sentry/fsimpl/fuse/read_write.go b/pkg/sentry/fsimpl/fuse/read_write.go
new file mode 100644
index 000000000..2d396e84c
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/read_write.go
@@ -0,0 +1,244 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// ReadInPages sends FUSE_READ requests for the size after round it up to
+// a multiple of page size, blocks on it for reply, processes the reply
+// and returns the payload (or joined payloads) as a byte slice.
+// This is used for the general purpose reading.
+// We do not support direct IO (which read the exact number of bytes)
+// at this moment.
+func (fs *filesystem) ReadInPages(ctx context.Context, fd *regularFileFD, off uint64, size uint32) ([][]byte, uint32, error) {
+	attributeVersion := atomic.LoadUint64(&fs.conn.attributeVersion)
+
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		log.Warningf("fusefs.Read: couldn't get kernel task from context")
+		return nil, 0, syserror.EINVAL
+	}
+
+	// Round up to a multiple of page size.
+	readSize, _ := usermem.PageRoundUp(uint64(size))
+
+	// One request cannnot exceed either maxRead or maxPages.
+	maxPages := fs.conn.maxRead >> usermem.PageShift
+	if maxPages > uint32(fs.conn.maxPages) {
+		maxPages = uint32(fs.conn.maxPages)
+	}
+
+	var outs [][]byte
+	var sizeRead uint32
+
+	// readSize is a multiple of usermem.PageSize.
+	// Always request bytes as a multiple of pages.
+	pagesRead, pagesToRead := uint32(0), uint32(readSize>>usermem.PageShift)
+
+	// Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+	in := linux.FUSEReadIn{
+		Fh:        fd.Fh,
+		LockOwner: 0, // TODO(gvisor.dev/issue/3245): file lock
+		ReadFlags: 0, // TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+		Flags:     fd.statusFlags(),
+	}
+
+	// This loop is intended for fragmented read where the bytes to read is
+	// larger than either the maxPages or maxRead.
+	// For the majority of reads with normal size, this loop should only
+	// execute once.
+	for pagesRead < pagesToRead {
+		pagesCanRead := pagesToRead - pagesRead
+		if pagesCanRead > maxPages {
+			pagesCanRead = maxPages
+		}
+
+		in.Offset = off + (uint64(pagesRead) << usermem.PageShift)
+		in.Size = pagesCanRead << usermem.PageShift
+
+		req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), fd.inode().nodeID, linux.FUSE_READ, &in)
+		if err != nil {
+			return nil, 0, err
+		}
+
+		// TODO(gvisor.dev/issue/3247): support async read.
+
+		res, err := fs.conn.Call(t, req)
+		if err != nil {
+			return nil, 0, err
+		}
+		if err := res.Error(); err != nil {
+			return nil, 0, err
+		}
+
+		// Not enough bytes in response,
+		// either we reached EOF,
+		// or the FUSE server sends back a response
+		// that cannot even fit the hdr.
+		if len(res.data) <= res.hdr.SizeBytes() {
+			// We treat both case as EOF here for now
+			// since there is no reliable way to detect
+			// the over-short hdr case.
+			break
+		}
+
+		// Directly using the slice to avoid extra copy.
+		out := res.data[res.hdr.SizeBytes():]
+
+		outs = append(outs, out)
+		sizeRead += uint32(len(out))
+
+		pagesRead += pagesCanRead
+	}
+
+	defer fs.ReadCallback(ctx, fd, off, size, sizeRead, attributeVersion)
+
+	// No bytes returned: offset >= EOF.
+	if len(outs) == 0 {
+		return nil, 0, io.EOF
+	}
+
+	return outs, sizeRead, nil
+}
+
+// ReadCallback updates several information after receiving a read response.
+// Due to readahead, sizeRead can be larger than size.
+func (fs *filesystem) ReadCallback(ctx context.Context, fd *regularFileFD, off uint64, size uint32, sizeRead uint32, attributeVersion uint64) {
+	// TODO(gvisor.dev/issue/3247): support async read.
+	// If this is called by an async read, correctly process it.
+	// May need to update the signature.
+
+	i := fd.inode()
+	i.InodeAttrs.TouchAtime(ctx, fd.vfsfd.Mount())
+
+	// Reached EOF.
+	if sizeRead < size {
+		// TODO(gvisor.dev/issue/3630): If we have writeback cache, then we need to fill this hole.
+		// Might need to update the buf to be returned from the Read().
+
+		// Update existing size.
+		newSize := off + uint64(sizeRead)
+		fs.conn.mu.Lock()
+		if attributeVersion == i.attributeVersion && newSize < atomic.LoadUint64(&i.size) {
+			fs.conn.attributeVersion++
+			i.attributeVersion = i.fs.conn.attributeVersion
+			atomic.StoreUint64(&i.size, newSize)
+		}
+		fs.conn.mu.Unlock()
+	}
+}
+
+// Write sends FUSE_WRITE requests and return the bytes
+// written according to the response.
+//
+// Preconditions: len(data) == size.
+func (fs *filesystem) Write(ctx context.Context, fd *regularFileFD, off uint64, size uint32, data []byte) (uint32, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		log.Warningf("fusefs.Read: couldn't get kernel task from context")
+		return 0, syserror.EINVAL
+	}
+
+	// One request cannnot exceed either maxWrite or maxPages.
+	maxWrite := uint32(fs.conn.maxPages) << usermem.PageShift
+	if maxWrite > fs.conn.maxWrite {
+		maxWrite = fs.conn.maxWrite
+	}
+
+	// Reuse the same struct for unmarshalling to avoid unnecessary memory allocation.
+	in := linux.FUSEWriteIn{
+		Fh: fd.Fh,
+		// TODO(gvisor.dev/issue/3245): file lock
+		LockOwner: 0,
+		// TODO(gvisor.dev/issue/3245): |= linux.FUSE_READ_LOCKOWNER
+		// TODO(gvisor.dev/issue/3237): |= linux.FUSE_WRITE_CACHE (not added yet)
+		WriteFlags: 0,
+		Flags:      fd.statusFlags(),
+	}
+
+	inode := fd.inode()
+	var written uint32
+
+	// This loop is intended for fragmented write where the bytes to write is
+	// larger than either the maxWrite or maxPages or when bigWrites is false.
+	// Unless a small value for max_write is explicitly used, this loop
+	// is expected to execute only once for the majority of the writes.
+	for written < size {
+		toWrite := size - written
+
+		// Limit the write size to one page.
+		// Note that the bigWrites flag is obsolete,
+		// latest libfuse always sets it on.
+		if !fs.conn.bigWrites && toWrite > usermem.PageSize {
+			toWrite = usermem.PageSize
+		}
+
+		// Limit the write size to maxWrite.
+		if toWrite > maxWrite {
+			toWrite = maxWrite
+		}
+
+		in.Offset = off + uint64(written)
+		in.Size = toWrite
+
+		req, err := fs.conn.NewRequest(auth.CredentialsFromContext(ctx), uint32(t.ThreadID()), inode.nodeID, linux.FUSE_WRITE, &in)
+		if err != nil {
+			return 0, err
+		}
+
+		req.payload = data[written : written+toWrite]
+
+		// TODO(gvisor.dev/issue/3247): support async write.
+
+		res, err := fs.conn.Call(t, req)
+		if err != nil {
+			return 0, err
+		}
+		if err := res.Error(); err != nil {
+			return 0, err
+		}
+
+		out := linux.FUSEWriteOut{}
+		if err := res.UnmarshalPayload(&out); err != nil {
+			return 0, err
+		}
+
+		// Write more than requested? EIO.
+		if out.Size > toWrite {
+			return 0, syserror.EIO
+		}
+
+		written += out.Size
+
+		// Break if short write. Not necessarily an error.
+		if out.Size != toWrite {
+			break
+		}
+	}
+	inode.InodeAttrs.TouchCMtime(ctx)
+
+	return written, nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/regular_file.go b/pkg/sentry/fsimpl/fuse/regular_file.go
new file mode 100644
index 000000000..5bdd096c3
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/regular_file.go
@@ -0,0 +1,230 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type regularFileFD struct {
+	fileDescription
+
+	// off is the file offset.
+	off int64
+	// offMu protects off.
+	offMu sync.Mutex
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
+	size := dst.NumBytes()
+	if size == 0 {
+		// Early return if count is 0.
+		return 0, nil
+	} else if size > math.MaxUint32 {
+		// FUSE only supports uint32 for size.
+		// Overflow.
+		return 0, syserror.EINVAL
+	}
+
+	// TODO(gvisor.dev/issue/3678): Add direct IO support.
+
+	inode := fd.inode()
+
+	// Reading beyond EOF, update file size if outdated.
+	if uint64(offset+size) > atomic.LoadUint64(&inode.size) {
+		if err := inode.reviseAttr(ctx, linux.FUSE_GETATTR_FH, fd.Fh); err != nil {
+			return 0, err
+		}
+		// If the offset after update is still too large, return error.
+		if uint64(offset) >= atomic.LoadUint64(&inode.size) {
+			return 0, io.EOF
+		}
+	}
+
+	// Truncate the read with updated file size.
+	fileSize := atomic.LoadUint64(&inode.size)
+	if uint64(offset+size) > fileSize {
+		size = int64(fileSize) - offset
+	}
+
+	buffers, n, err := inode.fs.ReadInPages(ctx, fd, uint64(offset), uint32(size))
+	if err != nil {
+		return 0, err
+	}
+
+	// TODO(gvisor.dev/issue/3237): support indirect IO (e.g. caching),
+	// store the bytes that were read ahead.
+
+	// Update the number of bytes to copy for short read.
+	if n < uint32(size) {
+		size = int64(n)
+	}
+
+	// Copy the bytes read to the dst.
+	// This loop is intended for fragmented reads.
+	// For the majority of reads, this loop only execute once.
+	var copied int64
+	for _, buffer := range buffers {
+		toCopy := int64(len(buffer))
+		if copied+toCopy > size {
+			toCopy = size - copied
+		}
+		cp, err := dst.DropFirst64(copied).CopyOut(ctx, buffer[:toCopy])
+		if err != nil {
+			return 0, err
+		}
+		if int64(cp) != toCopy {
+			return 0, syserror.EIO
+		}
+		copied += toCopy
+	}
+
+	return copied, nil
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	n, _, err := fd.pwrite(ctx, src, offset, opts)
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock()
+	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
+	fd.off = off
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// pwrite returns the number of bytes written, final offset and error. The
+// final offset should be ignored by PWrite.
+func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
+	if offset < 0 {
+		return 0, offset, syserror.EINVAL
+	}
+
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, offset, syserror.EOPNOTSUPP
+	}
+
+	inode := fd.inode()
+	inode.metadataMu.Lock()
+	defer inode.metadataMu.Unlock()
+
+	// If the file is opened with O_APPEND, update offset to file size.
+	// Note: since our Open() implements the interface of kernfs,
+	// and kernfs currently does not support O_APPEND, this will never
+	// be true before we switch out from kernfs.
+	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
+		// Locking inode.metadataMu is sufficient for reading size
+		offset = int64(inode.size)
+	}
+
+	srclen := src.NumBytes()
+
+	if srclen > math.MaxUint32 {
+		// FUSE only supports uint32 for size.
+		// Overflow.
+		return 0, offset, syserror.EINVAL
+	}
+	if end := offset + srclen; end < offset {
+		// Overflow.
+		return 0, offset, syserror.EINVAL
+	}
+
+	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
+	if err != nil {
+		return 0, offset, err
+	}
+
+	if srclen == 0 {
+		// Return before causing any side effects.
+		return 0, offset, nil
+	}
+
+	src = src.TakeFirst64(srclen)
+
+	// TODO(gvisor.dev/issue/3237): Add cache support:
+	// buffer cache. Ideally we write from src to our buffer cache first.
+	// The slice passed to fs.Write() should be a slice from buffer cache.
+	data := make([]byte, srclen)
+	// Reason for making a copy here: connection.Call() blocks on kerneltask,
+	// which in turn acquires mm.activeMu lock. Functions like CopyInTo() will
+	// attemp to acquire the mm.activeMu lock as well -> deadlock.
+	// We must finish reading from the userspace memory before
+	// t.Block() deactivates it.
+	cp, err := src.CopyIn(ctx, data)
+	if err != nil {
+		return 0, offset, err
+	}
+	if int64(cp) != srclen {
+		return 0, offset, syserror.EIO
+	}
+
+	n, err := fd.inode().fs.Write(ctx, fd, uint64(offset), uint32(srclen), data)
+	if err != nil {
+		return 0, offset, err
+	}
+
+	if n == 0 {
+		// We have checked srclen != 0 previously.
+		// If err == nil, then it's a short write and we return EIO.
+		return 0, offset, syserror.EIO
+	}
+
+	written = int64(n)
+	finalOff = offset + written
+
+	if finalOff > int64(inode.size) {
+		atomic.StoreUint64(&inode.size, uint64(finalOff))
+		atomic.AddUint64(&inode.fs.conn.attributeVersion, 1)
+	}
+
+	return
+}
diff --git a/pkg/sentry/fsimpl/fuse/request_response.go b/pkg/sentry/fsimpl/fuse/request_response.go
new file mode 100644
index 000000000..7fa00569b
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/request_response.go
@@ -0,0 +1,229 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"fmt"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// fuseInitRes is a variable-length wrapper of linux.FUSEInitOut. The FUSE
+// server may implement an older version of FUSE protocol, which contains a
+// linux.FUSEInitOut with less attributes.
+//
+// Dynamically-sized objects cannot be marshalled.
+type fuseInitRes struct {
+	marshal.StubMarshallable
+
+	// initOut contains the response from the FUSE server.
+	initOut linux.FUSEInitOut
+
+	// initLen is the total length of bytes of the response.
+	initLen uint32
+}
+
+// UnmarshalBytes deserializes src to the initOut attribute in a fuseInitRes.
+func (r *fuseInitRes) UnmarshalBytes(src []byte) {
+	out := &r.initOut
+
+	// Introduced before FUSE kernel version 7.13.
+	out.Major = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.Minor = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.MaxReadahead = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.Flags = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+	out.MaxBackground = uint16(usermem.ByteOrder.Uint16(src[:2]))
+	src = src[2:]
+	out.CongestionThreshold = uint16(usermem.ByteOrder.Uint16(src[:2]))
+	src = src[2:]
+	out.MaxWrite = uint32(usermem.ByteOrder.Uint32(src[:4]))
+	src = src[4:]
+
+	// Introduced in FUSE kernel version 7.23.
+	if len(src) >= 4 {
+		out.TimeGran = uint32(usermem.ByteOrder.Uint32(src[:4]))
+		src = src[4:]
+	}
+	// Introduced in FUSE kernel version 7.28.
+	if len(src) >= 2 {
+		out.MaxPages = uint16(usermem.ByteOrder.Uint16(src[:2]))
+		src = src[2:]
+	}
+}
+
+// SizeBytes is the size of the payload of the FUSE_INIT response.
+func (r *fuseInitRes) SizeBytes() int {
+	return int(r.initLen)
+}
+
+// Ordinary requests have even IDs, while interrupts IDs are odd.
+// Used to increment the unique ID for each FUSE request.
+var reqIDStep uint64 = 2
+
+// Request represents a FUSE operation request that hasn't been sent to the
+// server yet.
+//
+// +stateify savable
+type Request struct {
+	requestEntry
+
+	id   linux.FUSEOpID
+	hdr  *linux.FUSEHeaderIn
+	data []byte
+
+	// payload for this request: extra bytes to write after
+	// the data slice. Used by FUSE_WRITE.
+	payload []byte
+
+	// If this request is async.
+	async bool
+	// If we don't care its response.
+	// Manually set by the caller.
+	noReply bool
+}
+
+// NewRequest creates a new request that can be sent to the FUSE server.
+func (conn *connection) NewRequest(creds *auth.Credentials, pid uint32, ino uint64, opcode linux.FUSEOpcode, payload marshal.Marshallable) (*Request, error) {
+	conn.fd.mu.Lock()
+	defer conn.fd.mu.Unlock()
+	conn.fd.nextOpID += linux.FUSEOpID(reqIDStep)
+
+	hdrLen := (*linux.FUSEHeaderIn)(nil).SizeBytes()
+	hdr := linux.FUSEHeaderIn{
+		Len:    uint32(hdrLen + payload.SizeBytes()),
+		Opcode: opcode,
+		Unique: conn.fd.nextOpID,
+		NodeID: ino,
+		UID:    uint32(creds.EffectiveKUID),
+		GID:    uint32(creds.EffectiveKGID),
+		PID:    pid,
+	}
+
+	buf := make([]byte, hdr.Len)
+
+	// TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+	hdr.MarshalBytes(buf[:hdrLen])
+	payload.MarshalBytes(buf[hdrLen:])
+
+	return &Request{
+		id:   hdr.Unique,
+		hdr:  &hdr,
+		data: buf,
+	}, nil
+}
+
+// futureResponse represents an in-flight request, that may or may not have
+// completed yet. Convert it to a resolved Response by calling Resolve, but note
+// that this may block.
+//
+// +stateify savable
+type futureResponse struct {
+	opcode linux.FUSEOpcode
+	ch     chan struct{}
+	hdr    *linux.FUSEHeaderOut
+	data   []byte
+
+	// If this request is async.
+	async bool
+}
+
+// newFutureResponse creates a future response to a FUSE request.
+func newFutureResponse(req *Request) *futureResponse {
+	return &futureResponse{
+		opcode: req.hdr.Opcode,
+		ch:     make(chan struct{}),
+		async:  req.async,
+	}
+}
+
+// resolve blocks the task until the server responds to its corresponding request,
+// then returns a resolved response.
+func (f *futureResponse) resolve(t *kernel.Task) (*Response, error) {
+	// Return directly for async requests.
+	if f.async {
+		return nil, nil
+	}
+
+	if err := t.Block(f.ch); err != nil {
+		return nil, err
+	}
+
+	return f.getResponse(), nil
+}
+
+// getResponse creates a Response from the data the futureResponse has.
+func (f *futureResponse) getResponse() *Response {
+	return &Response{
+		opcode: f.opcode,
+		hdr:    *f.hdr,
+		data:   f.data,
+	}
+}
+
+// Response represents an actual response from the server, including the
+// response payload.
+//
+// +stateify savable
+type Response struct {
+	opcode linux.FUSEOpcode
+	hdr    linux.FUSEHeaderOut
+	data   []byte
+}
+
+// Error returns the error of the FUSE call.
+func (r *Response) Error() error {
+	errno := r.hdr.Error
+	if errno >= 0 {
+		return nil
+	}
+
+	sysErrNo := syscall.Errno(-errno)
+	return error(sysErrNo)
+}
+
+// DataLen returns the size of the response without the header.
+func (r *Response) DataLen() uint32 {
+	return r.hdr.Len - uint32(r.hdr.SizeBytes())
+}
+
+// UnmarshalPayload unmarshals the response data into m.
+func (r *Response) UnmarshalPayload(m marshal.Marshallable) error {
+	hdrLen := r.hdr.SizeBytes()
+	haveDataLen := r.hdr.Len - uint32(hdrLen)
+	wantDataLen := uint32(m.SizeBytes())
+
+	if haveDataLen < wantDataLen {
+		return fmt.Errorf("payload too small. Minimum data lenth required: %d,  but got data length %d", wantDataLen, haveDataLen)
+	}
+
+	// The response data is empty unless there is some payload. And so, doesn't
+	// need to be unmarshalled.
+	if r.data == nil {
+		return nil
+	}
+
+	// TODO(gVisor.dev/issue/3698): Use the unsafe version once go_marshal is safe to use again.
+	m.UnmarshalBytes(r.data[hdrLen:])
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/fuse/utils_test.go b/pkg/sentry/fsimpl/fuse/utils_test.go
new file mode 100644
index 000000000..e1d9e3365
--- /dev/null
+++ b/pkg/sentry/fsimpl/fuse/utils_test.go
@@ -0,0 +1,132 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package fuse
+
+import (
+	"io"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/marshal"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func setup(t *testing.T) *testutil.System {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("Error creating kernel: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+	creds := auth.CredentialsFromContext(ctx)
+
+	k.VFS().MustRegisterFilesystemType(Name, &FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserList:  true,
+		AllowUserMount: true,
+	})
+
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
+	if err != nil {
+		t.Fatalf("NewMountNamespace(): %v", err)
+	}
+
+	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
+}
+
+// newTestConnection creates a fuse connection that the sentry can communicate with
+// and the FD for the server to communicate with.
+func newTestConnection(system *testutil.System, k *kernel.Kernel, maxActiveRequests uint64) (*connection, *vfs.FileDescription, error) {
+	vfsObj := &vfs.VirtualFilesystem{}
+	fuseDev := &DeviceFD{}
+
+	if err := vfsObj.Init(system.Ctx); err != nil {
+		return nil, nil, err
+	}
+
+	vd := vfsObj.NewAnonVirtualDentry("genCountFD")
+	defer vd.DecRef(system.Ctx)
+	if err := fuseDev.vfsfd.Init(fuseDev, linux.O_RDWR|linux.O_CREAT, vd.Mount(), vd.Dentry(), &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, nil, err
+	}
+
+	fsopts := filesystemOptions{
+		maxActiveRequests: maxActiveRequests,
+	}
+	fs, err := newFUSEFilesystem(system.Ctx, 0, &fsopts, &fuseDev.vfsfd)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	return fs.conn, &fuseDev.vfsfd, nil
+}
+
+type testPayload struct {
+	marshal.StubMarshallable
+	data uint32
+}
+
+// SizeBytes implements marshal.Marshallable.SizeBytes.
+func (t *testPayload) SizeBytes() int {
+	return 4
+}
+
+// MarshalBytes implements marshal.Marshallable.MarshalBytes.
+func (t *testPayload) MarshalBytes(dst []byte) {
+	usermem.ByteOrder.PutUint32(dst[:4], t.data)
+}
+
+// UnmarshalBytes implements marshal.Marshallable.UnmarshalBytes.
+func (t *testPayload) UnmarshalBytes(src []byte) {
+	*t = testPayload{data: usermem.ByteOrder.Uint32(src[:4])}
+}
+
+// Packed implements marshal.Marshallable.Packed.
+func (t *testPayload) Packed() bool {
+	return true
+}
+
+// MarshalUnsafe implements marshal.Marshallable.MarshalUnsafe.
+func (t *testPayload) MarshalUnsafe(dst []byte) {
+	t.MarshalBytes(dst)
+}
+
+// UnmarshalUnsafe implements marshal.Marshallable.UnmarshalUnsafe.
+func (t *testPayload) UnmarshalUnsafe(src []byte) {
+	t.UnmarshalBytes(src)
+}
+
+// CopyOutN implements marshal.Marshallable.CopyOutN.
+func (t *testPayload) CopyOutN(task marshal.CopyContext, addr usermem.Addr, limit int) (int, error) {
+	panic("not implemented")
+}
+
+// CopyOut implements marshal.Marshallable.CopyOut.
+func (t *testPayload) CopyOut(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// CopyIn implements marshal.Marshallable.CopyIn.
+func (t *testPayload) CopyIn(task marshal.CopyContext, addr usermem.Addr) (int, error) {
+	panic("not implemented")
+}
+
+// WriteTo implements io.WriterTo.WriteTo.
+func (t *testPayload) WriteTo(w io.Writer) (int64, error) {
+	panic("not implemented")
+}
diff --git a/pkg/sentry/fsimpl/gofer/BUILD b/pkg/sentry/fsimpl/gofer/BUILD
index 16787116f..4c3e9acf8 100644
--- a/pkg/sentry/fsimpl/gofer/BUILD
+++ b/pkg/sentry/fsimpl/gofer/BUILD
@@ -38,6 +38,7 @@ go_library(
         "host_named_pipe.go",
         "p9file.go",
         "regular_file.go",
+        "save_restore.go",
         "socket.go",
         "special_file.go",
         "symlink.go",
@@ -52,6 +53,8 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/p9",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/lock",
@@ -69,6 +72,7 @@ go_library(
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/usage",
         "//pkg/sentry/vfs",
+        "//pkg/sync",
         "//pkg/syserr",
         "//pkg/syserror",
         "//pkg/unet",
diff --git a/pkg/sentry/fsimpl/gofer/directory.go b/pkg/sentry/fsimpl/gofer/directory.go
index 2a8011eb4..e993c8e36 100644
--- a/pkg/sentry/fsimpl/gofer/directory.go
+++ b/pkg/sentry/fsimpl/gofer/directory.go
@@ -16,16 +16,17 @@ package gofer
 
 import (
 	"fmt"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -34,8 +35,11 @@ func (d *dentry) isDir() bool {
 	return d.fileType() == linux.S_IFDIR
 }
 
-// Preconditions: filesystem.renameMu must be locked. d.dirMu must be locked.
-// d.isDir(). child must be a newly-created dentry that has never had a parent.
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+// * child must be a newly-created dentry that has never had a parent.
 func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
 	d.IncRef() // reference held by child on its parent
 	child.parent = d
@@ -46,7 +50,9 @@ func (d *dentry) cacheNewChildLocked(child *dentry, name string) {
 	d.children[name] = child
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
 func (d *dentry) cacheNegativeLookupLocked(name string) {
 	// Don't cache negative lookups if InteropModeShared is in effect (since
 	// this makes remote lookup unavoidable), or if d.isSynthetic() (in which
@@ -79,13 +85,15 @@ type createSyntheticOpts struct {
 // createSyntheticChildLocked creates a synthetic file with the given name
 // in d.
 //
-// Preconditions: d.dirMu must be locked. d.isDir(). d does not already contain
-// a child with the given name.
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
+// * d does not already contain a child with the given name.
 func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
-	d2 := &dentry{
+	child := &dentry{
 		refs:      1, // held by d
 		fs:        d.fs,
-		ino:       d.fs.nextSyntheticIno(),
+		ino:       d.fs.nextIno(),
 		mode:      uint32(opts.mode),
 		uid:       uint32(opts.kuid),
 		gid:       uint32(opts.kgid),
@@ -93,28 +101,32 @@ func (d *dentry) createSyntheticChildLocked(opts *createSyntheticOpts) {
 		hostFD:    -1,
 		nlink:     uint32(2),
 	}
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(child, "gofer.dentry")
+	}
 	switch opts.mode.FileType() {
 	case linux.S_IFDIR:
 		// Nothing else needs to be done.
 	case linux.S_IFSOCK:
-		d2.endpoint = opts.endpoint
+		child.endpoint = opts.endpoint
 	case linux.S_IFIFO:
-		d2.pipe = opts.pipe
+		child.pipe = opts.pipe
 	default:
 		panic(fmt.Sprintf("failed to create synthetic file of unrecognized type: %v", opts.mode.FileType()))
 	}
-	d2.pf.dentry = d2
-	d2.vfsd.Init(d2)
+	child.pf.dentry = child
+	child.vfsd.Init(child)
 
-	d.cacheNewChildLocked(d2, opts.name)
+	d.cacheNewChildLocked(child, opts.name)
 	d.syntheticChildren++
 }
 
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 
-	mu      sync.Mutex
+	mu      sync.Mutex `state:"nosave"`
 	off     int64
 	dirents []vfs.Dirent
 }
@@ -151,7 +163,9 @@ func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallba
 	return nil
 }
 
-// Preconditions: d.isDir(). There exists at least one directoryFD representing d.
+// Preconditions:
+// * d.isDir().
+// * There exists at least one directoryFD representing d.
 func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	// NOTE(b/135560623): 9P2000.L's readdir does not specify behavior in the
 	// presence of concurrent mutation of an iterated directory, so
@@ -225,7 +239,7 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 				}
 				dirent := vfs.Dirent{
 					Name:    p9d.Name,
-					Ino:     uint64(inoFromPath(p9d.QID.Path)),
+					Ino:     d.fs.inoFromQIDPath(p9d.QID.Path),
 					NextOff: int64(len(dirents) + 1),
 				}
 				// p9 does not expose 9P2000.U's DMDEVICE, DMNAMEDPIPE, or
diff --git a/pkg/sentry/fsimpl/gofer/filesystem.go b/pkg/sentry/fsimpl/gofer/filesystem.go
index 9a90351e5..baecb88c4 100644
--- a/pkg/sentry/fsimpl/gofer/filesystem.go
+++ b/pkg/sentry/fsimpl/gofer/filesystem.go
@@ -35,7 +35,7 @@ import (
 
 // Sync implements vfs.FilesystemImpl.Sync.
 func (fs *filesystem) Sync(ctx context.Context) error {
-	// Snapshot current syncable dentries and special files.
+	// Snapshot current syncable dentries and special file FDs.
 	fs.syncMu.Lock()
 	ds := make([]*dentry, 0, len(fs.syncableDentries))
 	for d := range fs.syncableDentries {
@@ -53,22 +53,28 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 	// regardless.
 	var retErr error
 
-	// Sync regular files.
+	// Sync syncable dentries.
 	for _, d := range ds {
-		err := d.syncCachedFile(ctx)
+		err := d.syncCachedFile(ctx, true /* forFilesystemSync */)
 		d.DecRef(ctx)
-		if err != nil && retErr == nil {
-			retErr = err
+		if err != nil {
+			ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err)
+			if retErr == nil {
+				retErr = err
+			}
 		}
 	}
 
 	// Sync special files, which may be writable but do not use dentry shared
 	// handles (so they won't be synced by the above).
 	for _, sffd := range sffds {
-		err := sffd.Sync(ctx)
+		err := sffd.sync(ctx, true /* forFilesystemSync */)
 		sffd.vfsfd.DecRef(ctx)
-		if err != nil && retErr == nil {
-			retErr = err
+		if err != nil {
+			ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err)
+			if retErr == nil {
+				retErr = err
+			}
 		}
 	}
 
@@ -115,9 +121,12 @@ func putDentrySlice(ds *[]*dentry) {
 // Dentries which may become cached as a result of the traversal are appended
 // to *ds.
 //
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done(). If !d.cachedMetadataAuthoritative(), then d's cached metadata
-// must be up to date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+//   to date.
 //
 // Postconditions: The returned dentry's cached metadata is up to date.
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
@@ -185,8 +194,11 @@ afterSymlink:
 // getChildLocked returns a dentry representing the child of parent with the
 // given name. If no such child exists, getChildLocked returns (nil, nil).
 //
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
-// parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
 //
 // Postconditions: If getChildLocked returns a non-nil dentry, its cached
 // metadata is up to date.
@@ -206,7 +218,8 @@ func (fs *filesystem) getChildLocked(ctx context.Context, vfsObj *vfs.VirtualFil
 	return fs.revalidateChildLocked(ctx, vfsObj, parent, name, child, ds)
 }
 
-// Preconditions: As for getChildLocked. !parent.isSynthetic().
+// Preconditions: Same as getChildLocked, plus:
+// * !parent.isSynthetic().
 func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *dentry, name string, child *dentry, ds **[]*dentry) (*dentry, error) {
 	if child != nil {
 		// Need to lock child.metadataMu because we might be updating child
@@ -222,7 +235,7 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		return nil, err
 	}
 	if child != nil {
-		if !file.isNil() && inoFromPath(qid.Path) == child.ino {
+		if !file.isNil() && qid.Path == child.qidPath {
 			// The file at this path hasn't changed. Just update cached metadata.
 			file.close(ctx)
 			child.updateFromP9AttrsLocked(attrMask, &attr)
@@ -279,9 +292,11 @@ func (fs *filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done(). If
-// !d.cachedMetadataAuthoritative(), then d's cached metadata must be up to
-// date.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
+// * If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up
+//   to date.
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -328,9 +343,10 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 // createInRemoteDir (if the parent directory is a real remote directory) or
 // createInSyntheticDir (if the parent directory is synthetic) to do so.
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string) error, createInSyntheticDir func(parent *dentry, name string) error) error {
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) error, createInSyntheticDir func(parent *dentry, name string) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -399,7 +415,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 		// RPC will fail with EEXIST like we would have. If the RPC succeeds, and a
 		// stale dentry exists, the dentry will fail revalidation next time it's
 		// used.
-		if err := createInRemoteDir(parent, name); err != nil {
+		if err := createInRemoteDir(parent, name, &ds); err != nil {
 			return err
 		}
 		ev := linux.IN_CREATE
@@ -414,7 +430,7 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	}
 	// No cached dentry exists; however, there might still be an existing file
 	// at name. As above, we attempt the file creation RPC anyway.
-	if err := createInRemoteDir(parent, name); err != nil {
+	if err := createInRemoteDir(parent, name, &ds); err != nil {
 		return err
 	}
 	if child, ok := parent.children[name]; ok && child == nil {
@@ -721,7 +737,7 @@ func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, childName string, _ **[]*dentry) error {
 		if rp.Mount() != vd.Mount() {
 			return syserror.EXDEV
 		}
@@ -754,7 +770,7 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
 	creds := rp.Credentials()
-	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
 		if _, err := parent.file.mkdir(ctx, name, (p9.FileMode)(opts.Mode), (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID)); err != nil {
 			if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
 				return err
@@ -789,34 +805,49 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) error {
 		creds := rp.Credentials()
 		_, err := parent.file.mknod(ctx, name, (p9.FileMode)(opts.Mode), opts.DevMajor, opts.DevMinor, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
-		// If the gofer does not allow creating a socket or pipe, create a
-		// synthetic one, i.e. one that is kept entirely in memory.
-		if err == syserror.EPERM {
-			switch opts.Mode.FileType() {
-			case linux.S_IFSOCK:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name:     name,
-					mode:     opts.Mode,
-					kuid:     creds.EffectiveKUID,
-					kgid:     creds.EffectiveKGID,
-					endpoint: opts.Endpoint,
-				})
-				return nil
-			case linux.S_IFIFO:
-				parent.createSyntheticChildLocked(&createSyntheticOpts{
-					name: name,
-					mode: opts.Mode,
-					kuid: creds.EffectiveKUID,
-					kgid: creds.EffectiveKGID,
-					pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
-				})
-				return nil
-			}
+		if err != syserror.EPERM {
+			return err
 		}
-		return err
+
+		// EPERM means that gofer does not allow creating a socket or pipe. Fallback
+		// to creating a synthetic one, i.e. one that is kept entirely in memory.
+
+		// Check that we're not overriding an existing file with a synthetic one.
+		_, err = fs.stepLocked(ctx, rp, parent, true, ds)
+		switch {
+		case err == nil:
+			// Step succeeded, another file exists.
+			return syserror.EEXIST
+		case err != syserror.ENOENT:
+			// Unexpected error.
+			return err
+		}
+
+		switch opts.Mode.FileType() {
+		case linux.S_IFSOCK:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name:     name,
+				mode:     opts.Mode,
+				kuid:     creds.EffectiveKUID,
+				kgid:     creds.EffectiveKGID,
+				endpoint: opts.Endpoint,
+			})
+			return nil
+		case linux.S_IFIFO:
+			parent.createSyntheticChildLocked(&createSyntheticOpts{
+				name: name,
+				mode: opts.Mode,
+				kuid: creds.EffectiveKUID,
+				kgid: creds.EffectiveKGID,
+				pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize),
+			})
+			return nil
+		}
+		// Retain error from gofer if synthetic file cannot be created internally.
+		return syserror.EPERM
 	}, nil)
 }
 
@@ -1001,7 +1032,7 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.Open
 		// step is required even if !d.cachedMetadataAuthoritative() because
 		// d.mappings has to be updated.
 		// d.metadataMu has already been acquired if trunc == true.
-		d.updateFileSizeLocked(0)
+		d.updateSizeLocked(0)
 
 		if d.cachedMetadataAuthoritative() {
 			d.touchCMtimeLocked()
@@ -1072,8 +1103,10 @@ retry:
 	return &fd.vfsfd, nil
 }
 
-// Preconditions: d.fs.renameMu must be locked. d.dirMu must be locked.
-// !d.isSynthetic().
+// Preconditions:
+// * d.fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !d.isSynthetic().
 func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
@@ -1284,6 +1317,9 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 			if !renamed.isDir() {
 				return syserror.EISDIR
 			}
+			if genericIsAncestorDentry(replaced, renamed) {
+				return syserror.ENOTEMPTY
+			}
 		} else {
 			if rp.MustBeDir() || renamed.isDir() {
 				return syserror.ENOTDIR
@@ -1334,14 +1370,15 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	// with reference counts and queue oldParent for checkCachingLocked if the
 	// parent isn't actually changing.
 	if oldParent != newParent {
+		oldParent.decRefLocked()
 		ds = appendDentry(ds, oldParent)
 		newParent.IncRef()
 		if renamed.isSynthetic() {
 			oldParent.syntheticChildren--
 			newParent.syntheticChildren++
 		}
+		renamed.parent = newParent
 	}
-	renamed.parent = newParent
 	renamed.name = newName
 	if newParent.children == nil {
 		newParent.children = make(map[string]*dentry)
@@ -1385,11 +1422,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.setStat(ctx, rp.Credentials(), &opts, rp.Mount()); err != nil {
-		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount())
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	if err != nil {
 		return err
 	}
-	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 
 	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
 		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
@@ -1452,7 +1489,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string) error {
+	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, _ **[]*dentry) error {
 		creds := rp.Credentials()
 		_, err := parent.file.symlink(ctx, target, name, (p9.UID)(creds.EffectiveKUID), (p9.GID)(creds.EffectiveKGID))
 		return err
@@ -1464,7 +1501,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return fs.unlinkAt(ctx, rp, false /* dir */)
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -1481,17 +1518,18 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 			d.IncRef()
 			return &endpoint{
 				dentry: d,
-				file:   d.file.file,
 				path:   opts.Addr,
 			}, nil
 		}
-		return d.endpoint, nil
+		if d.endpoint != nil {
+			return d.endpoint, nil
+		}
 	}
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1499,11 +1537,11 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	if err != nil {
 		return nil, err
 	}
-	return d.listxattr(ctx, rp.Credentials(), size)
+	return d.listXattr(ctx, rp.Credentials(), size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
@@ -1511,11 +1549,11 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	if err != nil {
 		return "", err
 	}
-	return d.getxattr(ctx, rp.Credentials(), &opts)
+	return d.getXattr(ctx, rp.Credentials(), &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1523,18 +1561,18 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.setxattr(ctx, rp.Credentials(), &opts); err != nil {
-		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	err = d.setXattr(ctx, rp.Credentials(), &opts)
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	if err != nil {
 		return err
 	}
-	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	d, err := fs.resolveLocked(ctx, rp, &ds)
@@ -1542,11 +1580,11 @@ func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath,
 		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 		return err
 	}
-	if err := d.removexattr(ctx, rp.Credentials(), name); err != nil {
-		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	err = d.removeXattr(ctx, rp.Credentials(), name)
+	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
+	if err != nil {
 		return err
 	}
-	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
@@ -1558,7 +1596,3 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	defer fs.renameMu.RUnlock()
 	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
 }
-
-func (fs *filesystem) nextSyntheticIno() inodeNumber {
-	return inodeNumber(atomic.AddUint64(&fs.syntheticSeq, 1) | syntheticInoMask)
-}
diff --git a/pkg/sentry/fsimpl/gofer/gofer.go b/pkg/sentry/fsimpl/gofer/gofer.go
index 63e589859..80668ebc1 100644
--- a/pkg/sentry/fsimpl/gofer/gofer.go
+++ b/pkg/sentry/fsimpl/gofer/gofer.go
@@ -26,6 +26,9 @@
 //             *** "memmap.Mappable locks taken by Translate" below this point
 //             dentry.handleMu
 //               dentry.dataMu
+//           filesystem.inoMu
+//   specialFileFD.mu
+//     specialFileFD.bufMu
 //
 // Locking dentry.dirMu in multiple dentries requires that either ancestor
 // dentries are locked before descendant dentries, or that filesystem.renameMu
@@ -36,7 +39,6 @@ import (
 	"fmt"
 	"strconv"
 	"strings"
-	"sync"
 	"sync/atomic"
 	"syscall"
 
@@ -44,6 +46,8 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/p9"
+	refs_vfs1 "gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -53,6 +57,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/unet"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -62,9 +67,13 @@ import (
 const Name = "9p"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
@@ -77,7 +86,7 @@ type filesystem struct {
 	iopts InternalFilesystemOptions
 
 	// client is the client used by this filesystem. client is immutable.
-	client *p9.Client
+	client *p9.Client `state:"nosave"`
 
 	// clock is a realtime clock used to set timestamps in file operations.
 	clock ktime.Clock
@@ -85,6 +94,9 @@ type filesystem struct {
 	// devMinor is the filesystem's minor device number. devMinor is immutable.
 	devMinor uint32
 
+	// root is the root dentry. root is immutable.
+	root *dentry
+
 	// renameMu serves two purposes:
 	//
 	// - It synchronizes path resolution with renaming initiated by this
@@ -95,43 +107,42 @@ type filesystem struct {
 	// reference count (such that it is usable as vfs.ResolvingPath.Start() or
 	// is reachable from its children), or if it is a child dentry (such that
 	// it is reachable from its parent).
-	renameMu sync.RWMutex
+	renameMu sync.RWMutex `state:"nosave"`
 
 	// cachedDentries contains all dentries with 0 references. (Due to race
 	// conditions, it may also contain dentries with non-zero references.)
-	// cachedDentriesLen is the number of dentries in cachedDentries. These
-	// fields are protected by renameMu.
+	// cachedDentriesLen is the number of dentries in cachedDentries. These fields
+	// are protected by renameMu.
 	cachedDentries    dentryList
 	cachedDentriesLen uint64
 
-	// syncableDentries contains all dentries in this filesystem for which
-	// !dentry.file.isNil(). specialFileFDs contains all open specialFileFDs.
-	// These fields are protected by syncMu.
-	syncMu           sync.Mutex
+	// syncableDentries contains all non-synthetic dentries. specialFileFDs
+	// contains all open specialFileFDs. These fields are protected by syncMu.
+	syncMu           sync.Mutex `state:"nosave"`
 	syncableDentries map[*dentry]struct{}
 	specialFileFDs   map[*specialFileFD]struct{}
 
-	// syntheticSeq stores a counter to used to generate unique inodeNumber for
-	// synthetic dentries.
-	syntheticSeq uint64
-}
+	// inoByQIDPath maps previously-observed QID.Paths to inode numbers
+	// assigned to those paths. inoByQIDPath is not preserved across
+	// checkpoint/restore because QIDs may be reused between different gofer
+	// processes, so QIDs may be repeated for different files across
+	// checkpoint/restore. inoByQIDPath is protected by inoMu.
+	inoMu        sync.Mutex        `state:"nosave"`
+	inoByQIDPath map[uint64]uint64 `state:"nosave"`
 
-// inodeNumber represents inode number reported in Dirent.Ino. For regular
-// dentries, it comes from QID.Path from the 9P server. Synthetic dentries
-// have have their inodeNumber generated sequentially, with the MSB reserved to
-// prevent conflicts with regular dentries.
-type inodeNumber uint64
+	// lastIno is the last inode number assigned to a file. lastIno is accessed
+	// using atomic memory operations.
+	lastIno uint64
 
-// Reserve MSB for synthetic mounts.
-const syntheticInoMask = uint64(1) << 63
+	// savedDentryRW records open read/write handles during save/restore.
+	savedDentryRW map[*dentry]savedDentryRW
 
-func inoFromPath(path uint64) inodeNumber {
-	if path&syntheticInoMask != 0 {
-		log.Warningf("Dropping MSB from ino, collision is possible. Original: %d, new: %d", path, path&^syntheticInoMask)
-	}
-	return inodeNumber(path &^ syntheticInoMask)
+	// released is nonzero once filesystem.Release has been called. It is accessed
+	// with atomic memory operations.
+	released int32
 }
 
+// +stateify savable
 type filesystemOptions struct {
 	// "Standard" 9P options.
 	fd      int
@@ -142,8 +153,7 @@ type filesystemOptions struct {
 	msize   uint32
 	version string
 
-	// maxCachedDentries is the maximum number of dentries with 0 references
-	// retained by the client.
+	// maxCachedDentries is the maximum size of filesystem.cachedDentries.
 	maxCachedDentries uint64
 
 	// If forcePageCache is true, host FDs may not be used for application
@@ -177,6 +187,8 @@ type filesystemOptions struct {
 
 // InteropMode controls the client's interaction with other remote filesystem
 // users.
+//
+// +stateify savable
 type InteropMode uint32
 
 const (
@@ -195,11 +207,7 @@ const (
 	// and consistent with Linux's semantics (in particular, it is not always
 	// possible for clients to set arbitrary atimes and mtimes depending on the
 	// remote filesystem implementation, and never possible for clients to set
-	// arbitrary ctimes.) If a dentry containing a client-defined atime or
-	// mtime is evicted from cache, client timestamps will be sent to the
-	// remote filesystem on a best-effort basis to attempt to ensure that
-	// timestamps will be preserved when another dentry representing the same
-	// file is instantiated.
+	// arbitrary ctimes.)
 	InteropModeExclusive InteropMode = iota
 
 	// InteropModeWritethrough is appropriate when there are read-only users of
@@ -239,7 +247,13 @@ const (
 
 // InternalFilesystemOptions may be passed as
 // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
 type InternalFilesystemOptions struct {
+	// If UniqueID is non-empty, it is an opaque string used to reassociate the
+	// filesystem with a new server FD during restoration from checkpoint.
+	UniqueID string
+
 	// If LeakConnection is true, do not close the connection to the server
 	// when the Filesystem is released. This is necessary for deployments in
 	// which servers can handle only a single client and report failure if that
@@ -265,6 +279,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
@@ -276,46 +293,11 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	mopts := vfs.GenericParseMountOptions(opts.Data)
 	var fsopts filesystemOptions
 
-	// Check that the transport is "fd".
-	trans, ok := mopts["trans"]
-	if !ok {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: transport must be specified as 'trans=fd'")
-		return nil, nil, syserror.EINVAL
-	}
-	delete(mopts, "trans")
-	if trans != "fd" {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unsupported transport: trans=%s", trans)
-		return nil, nil, syserror.EINVAL
-	}
-
-	// Check that read and write FDs are provided and identical.
-	rfdstr, ok := mopts["rfdno"]
-	if !ok {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD must be specified as 'rfdno=<file descriptor>")
-		return nil, nil, syserror.EINVAL
-	}
-	delete(mopts, "rfdno")
-	rfd, err := strconv.Atoi(rfdstr)
+	fd, err := getFDFromMountOptionsMap(ctx, mopts)
 	if err != nil {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid read FD: rfdno=%s", rfdstr)
-		return nil, nil, syserror.EINVAL
-	}
-	wfdstr, ok := mopts["wfdno"]
-	if !ok {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: write FD must be specified as 'wfdno=<file descriptor>")
-		return nil, nil, syserror.EINVAL
-	}
-	delete(mopts, "wfdno")
-	wfd, err := strconv.Atoi(wfdstr)
-	if err != nil {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid write FD: wfdno=%s", wfdstr)
-		return nil, nil, syserror.EINVAL
-	}
-	if rfd != wfd {
-		ctx.Warningf("gofer.FilesystemType.GetFilesystem: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
-		return nil, nil, syserror.EINVAL
+		return nil, nil, err
 	}
-	fsopts.fd = rfd
+	fsopts.fd = fd
 
 	// Get the attach name.
 	fsopts.aname = "/"
@@ -431,56 +413,43 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	}
 	// If !ok, iopts being the zero value is correct.
 
-	// Establish a connection with the server.
-	conn, err := unet.NewSocket(fsopts.fd)
+	// Construct the filesystem object.
+	devMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
 	}
+	fs := &filesystem{
+		mfp:              mfp,
+		opts:             fsopts,
+		iopts:            iopts,
+		clock:            ktime.RealtimeClockFromContext(ctx),
+		devMinor:         devMinor,
+		syncableDentries: make(map[*dentry]struct{}),
+		specialFileFDs:   make(map[*specialFileFD]struct{}),
+		inoByQIDPath:     make(map[uint64]uint64),
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
-	// Perform version negotiation with the server.
-	ctx.UninterruptibleSleepStart(false)
-	client, err := p9.NewClient(conn, fsopts.msize, fsopts.version)
-	ctx.UninterruptibleSleepFinish(false)
-	if err != nil {
-		conn.Close()
+	// Connect to the server.
+	if err := fs.dial(ctx); err != nil {
 		return nil, nil, err
 	}
-	// Ownership of conn has been transferred to client.
 
 	// Perform attach to obtain the filesystem root.
 	ctx.UninterruptibleSleepStart(false)
-	attached, err := client.Attach(fsopts.aname)
+	attached, err := fs.client.Attach(fsopts.aname)
 	ctx.UninterruptibleSleepFinish(false)
 	if err != nil {
-		client.Close()
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, err
 	}
 	attachFile := p9file{attached}
 	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
 	if err != nil {
 		attachFile.close(ctx)
-		client.Close()
-		return nil, nil, err
-	}
-
-	// Construct the filesystem object.
-	devMinor, err := vfsObj.GetAnonBlockDevMinor()
-	if err != nil {
-		attachFile.close(ctx)
-		client.Close()
+		fs.vfsfs.DecRef(ctx)
 		return nil, nil, err
 	}
-	fs := &filesystem{
-		mfp:              mfp,
-		opts:             fsopts,
-		iopts:            iopts,
-		client:           client,
-		clock:            ktime.RealtimeClockFromContext(ctx),
-		devMinor:         devMinor,
-		syncableDentries: make(map[*dentry]struct{}),
-		specialFileFDs:   make(map[*specialFileFD]struct{}),
-	}
-	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
 	// Construct the root dentry.
 	root, err := fs.newDentry(ctx, attachFile, qid, attrMask, &attr)
@@ -490,25 +459,87 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, err
 	}
 	// Set the root's reference count to 2. One reference is returned to the
-	// caller, and the other is deliberately leaked to prevent the root from
-	// being "cached" and subsequently evicted. Its resources will still be
-	// cleaned up by fs.Release().
+	// caller, and the other is held by fs to prevent the root from being "cached"
+	// and subsequently evicted.
 	root.refs = 2
+	fs.root = root
 
 	return &fs.vfsfs, &root.vfsd, nil
 }
 
+func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
+	// Check that the transport is "fd".
+	trans, ok := mopts["trans"]
+	if !ok || trans != "fd" {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as 'trans=fd'")
+		return -1, syserror.EINVAL
+	}
+	delete(mopts, "trans")
+
+	// Check that read and write FDs are provided and identical.
+	rfdstr, ok := mopts["rfdno"]
+	if !ok {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as 'rfdno=<file descriptor>'")
+		return -1, syserror.EINVAL
+	}
+	delete(mopts, "rfdno")
+	rfd, err := strconv.Atoi(rfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: rfdno=%s", rfdstr)
+		return -1, syserror.EINVAL
+	}
+	wfdstr, ok := mopts["wfdno"]
+	if !ok {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as 'wfdno=<file descriptor>'")
+		return -1, syserror.EINVAL
+	}
+	delete(mopts, "wfdno")
+	wfd, err := strconv.Atoi(wfdstr)
+	if err != nil {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: wfdno=%s", wfdstr)
+		return -1, syserror.EINVAL
+	}
+	if rfd != wfd {
+		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
+		return -1, syserror.EINVAL
+	}
+	return rfd, nil
+}
+
+// Preconditions: fs.client == nil.
+func (fs *filesystem) dial(ctx context.Context) error {
+	// Establish a connection with the server.
+	conn, err := unet.NewSocket(fs.opts.fd)
+	if err != nil {
+		return err
+	}
+
+	// Perform version negotiation with the server.
+	ctx.UninterruptibleSleepStart(false)
+	client, err := p9.NewClient(conn, fs.opts.msize, fs.opts.version)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		conn.Close()
+		return err
+	}
+	// Ownership of conn has been transferred to client.
+
+	fs.client = client
+	return nil
+}
+
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release(ctx context.Context) {
-	mf := fs.mfp.MemoryFile()
+	atomic.StoreInt32(&fs.released, 1)
 
+	mf := fs.mfp.MemoryFile()
 	fs.syncMu.Lock()
 	for d := range fs.syncableDentries {
 		d.handleMu.Lock()
 		d.dataMu.Lock()
 		if h := d.writeHandleLocked(); h.isOpen() {
 			// Write dirty cached data to the remote file.
-			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, fs.mfp.MemoryFile(), h.writeFromBlocksAt); err != nil {
+			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, mf, h.writeFromBlocksAt); err != nil {
 				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
 			}
 			// TODO(jamieliu): Do we need to flushf/fsync d?
@@ -529,6 +560,21 @@ func (fs *filesystem) Release(ctx context.Context) {
 	// fs.
 	fs.syncMu.Unlock()
 
+	// If leak checking is enabled, release all outstanding references in the
+	// filesystem. We deliberately avoid doing this outside of leak checking; we
+	// have released all external resources above rather than relying on dentry
+	// destructors.
+	if refs_vfs1.GetLeakMode() != refs_vfs1.NoLeakChecking {
+		fs.renameMu.Lock()
+		fs.root.releaseSyntheticRecursiveLocked(ctx)
+		fs.evictAllCachedDentriesLocked(ctx)
+		fs.renameMu.Unlock()
+
+		// An extra reference was held by the filesystem on the root to prevent it from
+		// being cached/evicted.
+		fs.root.DecRef(ctx)
+	}
+
 	if !fs.iopts.LeakConnection {
 		// Close the connection to the server. This implicitly clunks all fids.
 		fs.client.Close()
@@ -537,7 +583,34 @@ func (fs *filesystem) Release(ctx context.Context) {
 	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
 }
 
+// releaseSyntheticRecursiveLocked traverses the tree with root d and decrements
+// the reference count on every synthetic dentry. Synthetic dentries have one
+// reference for existence that should be dropped during filesystem.Release.
+//
+// Precondition: d.fs.renameMu is locked.
+func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) {
+	if d.isSynthetic() {
+		d.decRefLocked()
+		d.checkCachingLocked(ctx)
+	}
+	if d.isDir() {
+		var children []*dentry
+		d.dirMu.Lock()
+		for _, child := range d.children {
+			children = append(children, child)
+		}
+		d.dirMu.Unlock()
+		for _, child := range children {
+			if child != nil {
+				child.releaseSyntheticRecursiveLocked(ctx)
+			}
+		}
+	}
+}
+
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -562,12 +635,15 @@ type dentry struct {
 	// filesystem.renameMu.
 	name string
 
+	// qidPath is the p9.QID.Path for this file. qidPath is immutable.
+	qidPath uint64
+
 	// file is the unopened p9.File that backs this dentry. file is immutable.
 	//
 	// If file.isNil(), this dentry represents a synthetic file, i.e. a file
 	// that does not exist on the remote filesystem. As of this writing, the
 	// only files that can be synthetic are sockets, pipes, and directories.
-	file p9file
+	file p9file `state:"nosave"`
 
 	// If deleted is non-zero, the file represented by this dentry has been
 	// deleted. deleted is accessed using atomic memory operations.
@@ -579,7 +655,7 @@ type dentry struct {
 	cached bool
 	dentryEntry
 
-	dirMu sync.Mutex
+	dirMu sync.Mutex `state:"nosave"`
 
 	// If this dentry represents a directory, children contains:
 	//
@@ -611,12 +687,12 @@ type dentry struct {
 	// To mutate:
 	//   - Lock metadataMu and use atomic operations to update because we might
 	//     have atomic readers that don't hold the lock.
-	metadataMu sync.Mutex
-	ino        inodeNumber // immutable
-	mode       uint32      // type is immutable, perms are mutable
-	uid        uint32      // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid        uint32      // auth.KGID, but ...
-	blockSize  uint32      // 0 if unknown
+	metadataMu sync.Mutex `state:"nosave"`
+	ino        uint64     // immutable
+	mode       uint32     // type is immutable, perms are mutable
+	uid        uint32     // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid        uint32     // auth.KGID, but ...
+	blockSize  uint32     // 0 if unknown
 	// Timestamps, all nsecs from the Unix epoch.
 	atime int64
 	mtime int64
@@ -642,7 +718,7 @@ type dentry struct {
 	// other metadata fields.
 	nlink uint32
 
-	mapsMu sync.Mutex
+	mapsMu sync.Mutex `state:"nosave"`
 
 	// If this dentry represents a regular file, mappings tracks mappings of
 	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
@@ -666,12 +742,12 @@ type dentry struct {
 	// either p9.File transitions from closed (isNil() == true) to open
 	// (isNil() == false), it may be mutated with handleMu locked, but cannot
 	// be closed until the dentry is destroyed.
-	handleMu  sync.RWMutex
-	readFile  p9file
-	writeFile p9file
-	hostFD    int32
+	handleMu  sync.RWMutex `state:"nosave"`
+	readFile  p9file       `state:"nosave"`
+	writeFile p9file       `state:"nosave"`
+	hostFD    int32        `state:"nosave"`
 
-	dataMu sync.RWMutex
+	dataMu sync.RWMutex `state:"nosave"`
 
 	// If this dentry represents a regular file that is client-cached, cache
 	// maps offsets into the cached file to offsets into
@@ -703,6 +779,13 @@ type dentry struct {
 	locks vfs.FileLocks
 
 	// Inotify watches for this dentry.
+	//
+	// Note that inotify may behave unexpectedly in the presence of hard links,
+	// because dentries corresponding to the same file have separate inotify
+	// watches when they should share the same set. This is the case because it is
+	// impossible for us to know for sure whether two dentries correspond to the
+	// same underlying file (see the gofer filesystem section fo vfs/inotify.md for
+	// a more in-depth discussion on this matter).
 	watches vfs.Watches
 }
 
@@ -739,8 +822,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 
 	d := &dentry{
 		fs:        fs,
+		qidPath:   qid.Path,
 		file:      file,
-		ino:       inoFromPath(qid.Path),
+		ino:       fs.inoFromQIDPath(qid.Path),
 		mode:      uint32(attr.Mode),
 		uid:       uint32(fs.opts.dfltuid),
 		gid:       uint32(fs.opts.dfltgid),
@@ -776,6 +860,9 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 		d.nlink = uint32(attr.NLink)
 	}
 	d.vfsd.Init(d)
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(d, "gofer.dentry")
+	}
 
 	fs.syncMu.Lock()
 	fs.syncableDentries[d] = struct{}{}
@@ -783,6 +870,21 @@ func (fs *filesystem) newDentry(ctx context.Context, file p9file, qid p9.QID, ma
 	return d, nil
 }
 
+func (fs *filesystem) inoFromQIDPath(qidPath uint64) uint64 {
+	fs.inoMu.Lock()
+	defer fs.inoMu.Unlock()
+	if ino, ok := fs.inoByQIDPath[qidPath]; ok {
+		return ino
+	}
+	ino := fs.nextIno()
+	fs.inoByQIDPath[qidPath] = ino
+	return ino
+}
+
+func (fs *filesystem) nextIno() uint64 {
+	return atomic.AddUint64(&fs.lastIno, 1)
+}
+
 func (d *dentry) isSynthetic() bool {
 	return d.file.isNil()
 }
@@ -830,11 +932,11 @@ func (d *dentry) updateFromP9AttrsLocked(mask p9.AttrMask, attr *p9.Attr) {
 		atomic.StoreUint32(&d.nlink, uint32(attr.NLink))
 	}
 	if mask.Size {
-		d.updateFileSizeLocked(attr.Size)
+		d.updateSizeLocked(attr.Size)
 	}
 }
 
-// Preconditions: !d.isSynthetic()
+// Preconditions: !d.isSynthetic().
 func (d *dentry) updateFromGetattr(ctx context.Context) error {
 	// Use d.readFile or d.writeFile, which represent 9P fids that have been
 	// opened, in preference to d.file, which represents a 9P fid that has not.
@@ -897,10 +999,10 @@ func (d *dentry) statTo(stat *linux.Statx) {
 	// This is consistent with regularFileFD.Seek(), which treats regular files
 	// as having no holes.
 	stat.Blocks = (stat.Size + 511) / 512
-	stat.Atime = statxTimestampFromDentry(atomic.LoadInt64(&d.atime))
-	stat.Btime = statxTimestampFromDentry(atomic.LoadInt64(&d.btime))
-	stat.Ctime = statxTimestampFromDentry(atomic.LoadInt64(&d.ctime))
-	stat.Mtime = statxTimestampFromDentry(atomic.LoadInt64(&d.mtime))
+	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.atime))
+	stat.Btime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.btime))
+	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.ctime))
+	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&d.mtime))
 	stat.DevMajor = linux.UNNAMED_MAJOR
 	stat.DevMinor = d.fs.devMinor
 }
@@ -948,10 +1050,10 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
 		// Use client clocks for timestamps.
 		now = d.fs.clock.Now().Nanoseconds()
 		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
-			stat.Atime = statxTimestampFromDentry(now)
+			stat.Atime = linux.NsecToStatxTimestamp(now)
 		}
 		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
-			stat.Mtime = statxTimestampFromDentry(now)
+			stat.Mtime = linux.NsecToStatxTimestamp(now)
 		}
 	}
 
@@ -984,7 +1086,7 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
 				// d.size should be kept up to date, and privatized
 				// copy-on-write mappings of truncated pages need to be
 				// invalidated, even if InteropModeShared is in effect.
-				d.updateFileSizeLocked(stat.Size)
+				d.updateSizeLocked(stat.Size)
 			}
 		}
 		if d.fs.opts.interop == InteropModeShared {
@@ -1010,19 +1112,42 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs
 	// !d.cachedMetadataAuthoritative() then we returned after calling
 	// d.file.setAttr(). For the same reason, now must have been initialized.
 	if stat.Mask&linux.STATX_ATIME != 0 {
-		atomic.StoreInt64(&d.atime, dentryTimestampFromStatx(stat.Atime))
+		atomic.StoreInt64(&d.atime, stat.Atime.ToNsec())
 		atomic.StoreUint32(&d.atimeDirty, 0)
 	}
 	if stat.Mask&linux.STATX_MTIME != 0 {
-		atomic.StoreInt64(&d.mtime, dentryTimestampFromStatx(stat.Mtime))
+		atomic.StoreInt64(&d.mtime, stat.Mtime.ToNsec())
 		atomic.StoreUint32(&d.mtimeDirty, 0)
 	}
 	atomic.StoreInt64(&d.ctime, now)
 	return nil
 }
 
+// doAllocate performs an allocate operation on d. Note that d.metadataMu will
+// be held when allocate is called.
+func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+
+	// Allocating a smaller size is a noop.
+	size := offset + length
+	if d.cachedMetadataAuthoritative() && size <= d.size {
+		return nil
+	}
+
+	err := allocate()
+	if err != nil {
+		return err
+	}
+	d.updateSizeLocked(size)
+	if d.cachedMetadataAuthoritative() {
+		d.touchCMtimeLocked()
+	}
+	return nil
+}
+
 // Preconditions: d.metadataMu must be locked.
-func (d *dentry) updateFileSizeLocked(newSize uint64) {
+func (d *dentry) updateSizeLocked(newSize uint64) {
 	d.dataMu.Lock()
 	oldSize := d.size
 	atomic.StoreUint64(&d.size, newSize)
@@ -1060,6 +1185,21 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	// We only support xattrs prefixed with "user." (see b/148380782). Currently,
+	// there is no need to expose any other xattrs through a gofer.
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+		return syserror.EOPNOTSUPP
+	}
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
+	}
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
 func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
 	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&child.uid)))
 }
@@ -1118,6 +1258,11 @@ func (d *dentry) decRefLocked() {
 	}
 }
 
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+	return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
 	if d.isDir() {
@@ -1166,6 +1311,10 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 	// resolution, which requires renameMu, so if d.refs is zero then it will
 	// remain zero while we hold renameMu for writing.)
 	refs := atomic.LoadInt64(&d.refs)
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		return
+	}
 	if refs > 0 {
 		if d.cached {
 			d.fs.cachedDentries.Remove(d)
@@ -1174,10 +1323,6 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 		}
 		return
 	}
-	if refs == -1 {
-		// Dentry has already been destroyed.
-		return
-	}
 	// Deleted and invalidated dentries with zero references are no longer
 	// reachable by path resolution and should be dropped immediately.
 	if d.vfsd.IsDead() {
@@ -1200,6 +1345,16 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 	if d.watches.Size() > 0 {
 		return
 	}
+
+	if atomic.LoadInt32(&d.fs.released) != 0 {
+		if d.parent != nil {
+			d.parent.dirMu.Lock()
+			delete(d.parent.children, d.name)
+			d.parent.dirMu.Unlock()
+		}
+		d.destroyLocked(ctx)
+	}
+
 	// If d is already cached, just move it to the front of the LRU.
 	if d.cached {
 		d.fs.cachedDentries.Remove(d)
@@ -1212,33 +1367,48 @@ func (d *dentry) checkCachingLocked(ctx context.Context) {
 	d.fs.cachedDentriesLen++
 	d.cached = true
 	if d.fs.cachedDentriesLen > d.fs.opts.maxCachedDentries {
-		victim := d.fs.cachedDentries.Back()
-		d.fs.cachedDentries.Remove(victim)
-		d.fs.cachedDentriesLen--
-		victim.cached = false
-		// victim.refs may have become non-zero from an earlier path resolution
-		// since it was inserted into fs.cachedDentries.
-		if atomic.LoadInt64(&victim.refs) == 0 {
-			if victim.parent != nil {
-				victim.parent.dirMu.Lock()
-				if !victim.vfsd.IsDead() {
-					// Note that victim can't be a mount point (in any mount
-					// namespace), since VFS holds references on mount points.
-					d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
-					delete(victim.parent.children, victim.name)
-					// We're only deleting the dentry, not the file it
-					// represents, so we don't need to update
-					// victimParent.dirents etc.
-				}
-				victim.parent.dirMu.Unlock()
-			}
-			victim.destroyLocked(ctx)
-		}
+		d.fs.evictCachedDentryLocked(ctx)
 		// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
 		// back down to fs.opts.maxCachedDentries, so we don't loop.
 	}
 }
 
+// Precondition: fs.renameMu must be locked for writing; it may be temporarily
+// unlocked.
+func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
+	for fs.cachedDentriesLen != 0 {
+		fs.evictCachedDentryLocked(ctx)
+	}
+}
+
+// Preconditions:
+// * fs.renameMu must be locked for writing; it may be temporarily unlocked.
+// * fs.cachedDentriesLen != 0.
+func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
+	victim := fs.cachedDentries.Back()
+	fs.cachedDentries.Remove(victim)
+	fs.cachedDentriesLen--
+	victim.cached = false
+	// victim.refs may have become non-zero from an earlier path resolution
+	// since it was inserted into fs.cachedDentries.
+	if atomic.LoadInt64(&victim.refs) == 0 {
+		if victim.parent != nil {
+			victim.parent.dirMu.Lock()
+			if !victim.vfsd.IsDead() {
+				// Note that victim can't be a mount point (in any mount
+				// namespace), since VFS holds references on mount points.
+				fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &victim.vfsd)
+				delete(victim.parent.children, victim.name)
+				// We're only deleting the dentry, not the file it
+				// represents, so we don't need to update
+				// victimParent.dirents etc.
+			}
+			victim.parent.dirMu.Unlock()
+		}
+		victim.destroyLocked(ctx)
+	}
+}
+
 // destroyLocked destroys the dentry.
 //
 // Preconditions:
@@ -1293,30 +1463,19 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 	d.handleMu.Unlock()
 
 	if !d.file.isNil() {
-		if !d.isDeleted() {
-			// Write dirty timestamps back to the remote filesystem.
-			atimeDirty := atomic.LoadUint32(&d.atimeDirty) != 0
-			mtimeDirty := atomic.LoadUint32(&d.mtimeDirty) != 0
-			if atimeDirty || mtimeDirty {
-				atime := atomic.LoadInt64(&d.atime)
-				mtime := atomic.LoadInt64(&d.mtime)
-				if err := d.file.setAttr(ctx, p9.SetAttrMask{
-					ATime:              atimeDirty,
-					ATimeNotSystemTime: atimeDirty,
-					MTime:              mtimeDirty,
-					MTimeNotSystemTime: mtimeDirty,
-				}, p9.SetAttr{
-					ATimeSeconds:     uint64(atime / 1e9),
-					ATimeNanoSeconds: uint64(atime % 1e9),
-					MTimeSeconds:     uint64(mtime / 1e9),
-					MTimeNanoSeconds: uint64(mtime % 1e9),
-				}); err != nil {
-					log.Warningf("gofer.dentry.destroyLocked: failed to write dirty timestamps back: %v", err)
-				}
-			}
+		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
+		// i.e. client and server timestamps may differ (because e.g. a client
+		// write was serviced by the page cache, and only written back to the
+		// remote file later). Ideally, we'd write client timestamps back to
+		// the remote filesystem so that timestamps for a new dentry
+		// instantiated for the same file would remain coherent. Unfortunately,
+		// this turns out to be too expensive in many cases, so for now we
+		// don't do this.
+		if err := d.file.close(ctx); err != nil {
+			log.Warningf("gofer.dentry.destroyLocked: failed to close file: %v", err)
 		}
-		d.file.close(ctx)
 		d.file = p9file{}
+
 		// Remove d from the set of syncable dentries.
 		d.fs.syncMu.Lock()
 		delete(d.fs.syncableDentries, d)
@@ -1334,6 +1493,10 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 			panic("gofer.dentry.DecRef() called without holding a reference")
 		}
 	}
+
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Unregister(d, "gofer.dentry")
+	}
 }
 
 func (d *dentry) isDeleted() bool {
@@ -1344,9 +1507,7 @@ func (d *dentry) setDeleted() {
 	atomic.StoreUint32(&d.deleted, 1)
 }
 
-// We only support xattrs prefixed with "user." (see b/148380782). Currently,
-// there is no need to expose any other xattrs through a gofer.
-func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
+func (d *dentry) listXattr(ctx context.Context, creds *auth.Credentials, size uint64) ([]string, error) {
 	if d.file.isNil() || !d.userXattrSupported() {
 		return nil, nil
 	}
@@ -1356,6 +1517,7 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
 	}
 	xattrs := make([]string, 0, len(xattrMap))
 	for x := range xattrMap {
+		// We only support xattrs in the user.* namespace.
 		if strings.HasPrefix(x, linux.XATTR_USER_PREFIX) {
 			xattrs = append(xattrs, x)
 		}
@@ -1363,51 +1525,33 @@ func (d *dentry) listxattr(ctx context.Context, creds *auth.Credentials, size ui
 	return xattrs, nil
 }
 
-func (d *dentry) getxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
+func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
 	if d.file.isNil() {
 		return "", syserror.ENODATA
 	}
-	if err := d.checkPermissions(creds, vfs.MayRead); err != nil {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return "", syserror.ENODATA
-	}
 	return d.file.getXattr(ctx, opts.Name, opts.Size)
 }
 
-func (d *dentry) setxattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
+func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
-	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return d.file.setXattr(ctx, opts.Name, opts.Value, opts.Flags)
 }
 
-func (d *dentry) removexattr(ctx context.Context, creds *auth.Credentials, name string) error {
+func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
 	if d.file.isNil() {
 		return syserror.EPERM
 	}
-	if err := d.checkPermissions(creds, vfs.MayWrite); err != nil {
+	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !d.userXattrSupported() {
-		return syserror.EPERM
-	}
 	return d.file.removeXattr(ctx, name)
 }
 
@@ -1418,7 +1562,9 @@ func (d *dentry) userXattrSupported() bool {
 	return filetype == linux.ModeRegular || filetype == linux.ModeDirectory
 }
 
-// Preconditions: !d.isSynthetic(). d.isRegularFile() || d.isDir().
+// Preconditions:
+// * !d.isSynthetic().
+// * d.isRegularFile() || d.isDir().
 func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
 	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
 	// O_TRUNC).
@@ -1463,8 +1609,9 @@ func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool
 			return err
 		}
 
-		if d.hostFD < 0 && openReadable && h.fd >= 0 {
-			// We have no existing FD; use the new FD for at least reading.
+		if d.hostFD < 0 && h.fd >= 0 && openReadable && (d.writeFile.isNil() || openWritable) {
+			// We have no existing FD, and the new FD meets the requirements
+			// for d.hostFD, so start using it.
 			d.hostFD = h.fd
 		} else if d.hostFD >= 0 && d.writeFile.isNil() && openWritable {
 			// We have an existing read-only FD, but the file has just been
@@ -1593,6 +1740,33 @@ func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
 	return nil
 }
 
+func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
+	d.handleMu.RLock()
+	defer d.handleMu.RUnlock()
+	h := d.writeHandleLocked()
+	if h.isOpen() {
+		// Write back dirty pages to the remote file.
+		d.dataMu.Lock()
+		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
+		d.dataMu.Unlock()
+		if err != nil {
+			return err
+		}
+	}
+	if err := d.syncRemoteFileLocked(ctx); err != nil {
+		if !forFilesystemSync {
+			return err
+		}
+		// Only return err if we can reasonably have expected sync to succeed
+		// (d is a regular file and was opened for writing).
+		if d.isRegularFile() && h.isOpen() {
+			return err
+		}
+		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
+	}
+	return nil
+}
+
 // incLinks increments link count.
 func (d *dentry) incLinks() {
 	if atomic.LoadUint32(&d.nlink) == 0 {
@@ -1613,12 +1787,14 @@ func (d *dentry) decLinks() {
 
 // fileDescription is embedded by gofer implementations of
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
 	vfs.LockFD
 
-	lockLogging sync.Once
+	lockLogging sync.Once `state:"nosave"`
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -1656,30 +1832,30 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return nil
 }
 
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	return fd.dentry().listxattr(ctx, auth.CredentialsFromContext(ctx), size)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.dentry().listXattr(ctx, auth.CredentialsFromContext(ctx), size)
 }
 
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
-	return fd.dentry().getxattr(ctx, auth.CredentialsFromContext(ctx), &opts)
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
 }
 
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
 	d := fd.dentry()
-	if err := d.setxattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
+	if err := d.setXattr(ctx, auth.CredentialsFromContext(ctx), &opts); err != nil {
 		return err
 	}
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
 }
 
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
 	d := fd.dentry()
-	if err := d.removexattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
+	if err := d.removeXattr(ctx, auth.CredentialsFromContext(ctx), name); err != nil {
 		return err
 	}
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
diff --git a/pkg/sentry/fsimpl/gofer/gofer_test.go b/pkg/sentry/fsimpl/gofer/gofer_test.go
index bfe75dfe4..76f08e252 100644
--- a/pkg/sentry/fsimpl/gofer/gofer_test.go
+++ b/pkg/sentry/fsimpl/gofer/gofer_test.go
@@ -26,12 +26,13 @@ import (
 func TestDestroyIdempotent(t *testing.T) {
 	ctx := contexttest.Context(t)
 	fs := filesystem{
-		mfp:              pgalloc.MemoryFileProviderFromContext(ctx),
-		syncableDentries: make(map[*dentry]struct{}),
+		mfp: pgalloc.MemoryFileProviderFromContext(ctx),
 		opts: filesystemOptions{
 			// Test relies on no dentry being held in the cache.
 			maxCachedDentries: 0,
 		},
+		syncableDentries: make(map[*dentry]struct{}),
+		inoByQIDPath:     make(map[uint64]uint64),
 	}
 
 	attr := &p9.Attr{
diff --git a/pkg/sentry/fsimpl/gofer/handle.go b/pkg/sentry/fsimpl/gofer/handle.go
index 104157512..a9ebe1206 100644
--- a/pkg/sentry/fsimpl/gofer/handle.go
+++ b/pkg/sentry/fsimpl/gofer/handle.go
@@ -25,6 +25,8 @@ import (
 
 // handle represents a remote "open file descriptor", consisting of an opened
 // fid (p9.File) and optionally a host file descriptor.
+//
+// These are explicitly not savable.
 type handle struct {
 	file p9file
 	fd   int32 // -1 if unavailable
diff --git a/pkg/sentry/fsimpl/gofer/host_named_pipe.go b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
index 7294de7d6..c7bf10007 100644
--- a/pkg/sentry/fsimpl/gofer/host_named_pipe.go
+++ b/pkg/sentry/fsimpl/gofer/host_named_pipe.go
@@ -51,8 +51,24 @@ func blockUntilNonblockingPipeHasWriter(ctx context.Context, fd int32) error {
 		if ok {
 			return nil
 		}
-		if err := sleepBetweenNamedPipeOpenChecks(ctx); err != nil {
-			return err
+		if sleepErr := sleepBetweenNamedPipeOpenChecks(ctx); sleepErr != nil {
+			// Another application thread may have opened this pipe for
+			// writing, succeeded because we previously opened the pipe for
+			// reading, and subsequently interrupted us for checkpointing (e.g.
+			// this occurs in mknod tests under cooperative save/restore). In
+			// this case, our open has to succeed for the checkpoint to include
+			// a readable FD for the pipe, which is in turn necessary to
+			// restore the other thread's writable FD for the same pipe
+			// (otherwise it will get ENXIO). So we have to check
+			// nonblockingPipeHasWriter() once last time.
+			ok, err := nonblockingPipeHasWriter(fd)
+			if err != nil {
+				return err
+			}
+			if ok {
+				return nil
+			}
+			return sleepErr
 		}
 	}
 }
diff --git a/pkg/sentry/fsimpl/gofer/p9file.go b/pkg/sentry/fsimpl/gofer/p9file.go
index 87f0b877f..21b4a96fe 100644
--- a/pkg/sentry/fsimpl/gofer/p9file.go
+++ b/pkg/sentry/fsimpl/gofer/p9file.go
@@ -127,6 +127,13 @@ func (f p9file) close(ctx context.Context) error {
 	return err
 }
 
+func (f p9file) setAttrClose(ctx context.Context, valid p9.SetAttrMask, attr p9.SetAttr) error {
+	ctx.UninterruptibleSleepStart(false)
+	err := f.file.SetAttrClose(valid, attr)
+	ctx.UninterruptibleSleepFinish(false)
+	return err
+}
+
 func (f p9file) open(ctx context.Context, flags p9.OpenFlags) (*fd.FD, p9.QID, uint32, error) {
 	ctx.UninterruptibleSleepStart(false)
 	fdobj, qid, iounit, err := f.file.Open(flags)
diff --git a/pkg/sentry/fsimpl/gofer/regular_file.go b/pkg/sentry/fsimpl/gofer/regular_file.go
index 7e1cbf065..dc8a890cb 100644
--- a/pkg/sentry/fsimpl/gofer/regular_file.go
+++ b/pkg/sentry/fsimpl/gofer/regular_file.go
@@ -18,7 +18,6 @@ import (
 	"fmt"
 	"io"
 	"math"
-	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -31,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
 	"gvisor.dev/gvisor/pkg/sentry/usage"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
@@ -39,11 +39,12 @@ func (d *dentry) isRegularFile() bool {
 	return d.fileType() == linux.S_IFREG
 }
 
+// +stateify savable
 type regularFileFD struct {
 	fileDescription
 
 	// off is the file offset. off is protected by mu.
-	mu  sync.Mutex
+	mu  sync.Mutex `state:"nosave"`
 	off int64
 }
 
@@ -56,10 +57,16 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
 	if !fd.vfsfd.IsWritable() {
 		return nil
 	}
-	// Skip flushing if writes may be buffered by the client, since (as with
-	// the VFS1 client) we don't flush buffered writes on close anyway.
+	// Skip flushing if there are client-buffered writes, since (as with the
+	// VFS1 client) we don't flush buffered writes on close anyway.
 	d := fd.dentry()
-	if d.fs.opts.interop == InteropModeExclusive {
+	if d.fs.opts.interop != InteropModeExclusive {
+		return nil
+	}
+	d.dataMu.RLock()
+	haveDirtyPages := !d.dirty.IsEmpty()
+	d.dataMu.RUnlock()
+	if haveDirtyPages {
 		return nil
 	}
 	d.handleMu.RLock()
@@ -73,28 +80,11 @@ func (fd *regularFileFD) OnClose(ctx context.Context) error {
 // Allocate implements vfs.FileDescriptionImpl.Allocate.
 func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
 	d := fd.dentry()
-	d.metadataMu.Lock()
-	defer d.metadataMu.Unlock()
-
-	// Allocating a smaller size is a noop.
-	size := offset + length
-	if d.cachedMetadataAuthoritative() && size <= d.size {
-		return nil
-	}
-
-	d.handleMu.RLock()
-	err := d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
-	d.handleMu.RUnlock()
-	if err != nil {
-		return err
-	}
-	d.dataMu.Lock()
-	atomic.StoreUint64(&d.size, size)
-	d.dataMu.Unlock()
-	if d.cachedMetadataAuthoritative() {
-		d.touchCMtimeLocked()
-	}
-	return nil
+	return d.doAllocate(ctx, offset, length, func() error {
+		d.handleMu.RLock()
+		defer d.handleMu.RUnlock()
+		return d.writeFile.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+	})
 }
 
 // PRead implements vfs.FileDescriptionImpl.PRead.
@@ -117,6 +107,10 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		return 0, io.EOF
 	}
 
+	var (
+		n       int64
+		readErr error
+	)
 	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
 		// Lock d.metadataMu for the rest of the read to prevent d.size from
 		// changing.
@@ -127,20 +121,25 @@ func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		if err := d.writeback(ctx, offset, dst.NumBytes()); err != nil {
 			return 0, err
 		}
-	}
-
-	rw := getDentryReadWriter(ctx, d, offset)
-	if fd.vfsfd.StatusFlags()&linux.O_DIRECT != 0 {
+		rw := getDentryReadWriter(ctx, d, offset)
 		// Require the read to go to the remote file.
 		rw.direct = true
+		n, readErr = dst.CopyOutFrom(ctx, rw)
+		putDentryReadWriter(rw)
+		if d.fs.opts.interop != InteropModeShared {
+			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+			d.touchAtimeLocked(fd.vfsfd.Mount())
+		}
+	} else {
+		rw := getDentryReadWriter(ctx, d, offset)
+		n, readErr = dst.CopyOutFrom(ctx, rw)
+		putDentryReadWriter(rw)
+		if d.fs.opts.interop != InteropModeShared {
+			// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
+			d.touchAtime(fd.vfsfd.Mount())
+		}
 	}
-	n, err := dst.CopyOutFrom(ctx, rw)
-	putDentryReadWriter(rw)
-	if d.fs.opts.interop != InteropModeShared {
-		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
-		d.touchAtime(fd.vfsfd.Mount())
-	}
-	return n, err
+	return n, readErr
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
@@ -396,7 +395,7 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
 					End:   gapEnd,
 				}
 				optMR := gap.Range()
-				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, h.readToBlocksAt)
+				err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), rw.d.size, mf, usage.PageCache, h.readToBlocksAt)
 				mf.MarkEvictable(rw.d, pgalloc.EvictableRange{optMR.Start, optMR.End})
 				seg, gap = rw.d.cache.Find(rw.off)
 				if !seg.Ok() {
@@ -404,10 +403,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
 					rw.d.handleMu.RUnlock()
 					return done, err
 				}
-				// err might have occurred in part of gap.Range() outside
-				// gapMR. Forget about it for now; if the error matters and
-				// persists, we'll run into it again in a later iteration of
-				// this loop.
+				// err might have occurred in part of gap.Range() outside gapMR
+				// (in particular, gap.End() might be beyond EOF). Forget about
+				// it for now; if the error matters and persists, we'll run
+				// into it again in a later iteration of this loop.
 			} else {
 				// Read directly from the file.
 				gapDsts := dsts.TakeFirst64(gapMR.Length())
@@ -625,23 +624,7 @@ func regularFileSeekLocked(ctx context.Context, d *dentry, fdOffset, offset int6
 
 // Sync implements vfs.FileDescriptionImpl.Sync.
 func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return fd.dentry().syncCachedFile(ctx)
-}
-
-func (d *dentry) syncCachedFile(ctx context.Context) error {
-	d.handleMu.RLock()
-	defer d.handleMu.RUnlock()
-
-	if h := d.writeHandleLocked(); h.isOpen() {
-		d.dataMu.Lock()
-		// Write dirty cached data to the remote file.
-		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size, d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
-		d.dataMu.Unlock()
-		if err != nil {
-			return err
-		}
-	}
-	return d.syncRemoteFileLocked(ctx)
+	return fd.dentry().syncCachedFile(ctx, false /* lowSyncExpectations */)
 }
 
 // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
@@ -781,7 +764,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
 
 	mf := d.fs.mfp.MemoryFile()
 	h := d.readHandleLocked()
-	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, h.readToBlocksAt)
+	cerr := d.cache.Fill(ctx, required, maxFillRange(required, optional), d.size, mf, usage.PageCache, h.readToBlocksAt)
 
 	var ts []memmap.Translation
 	var translatedEnd uint64
@@ -900,6 +883,8 @@ func (d *dentry) Evict(ctx context.Context, er pgalloc.EvictableRange) {
 // dentryPlatformFile is only used when a host FD representing the remote file
 // is available (i.e. dentry.hostFD >= 0), and that FD is used for application
 // memory mappings (i.e. !filesystem.opts.forcePageCache).
+//
+// +stateify savable
 type dentryPlatformFile struct {
 	*dentry
 
@@ -912,7 +897,7 @@ type dentryPlatformFile struct {
 	hostFileMapper fsutil.HostFileMapper
 
 	// hostFileMapperInitOnce is used to lazily initialize hostFileMapper.
-	hostFileMapperInitOnce sync.Once
+	hostFileMapperInitOnce sync.Once `state:"nosave"`
 }
 
 // IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/gofer/save_restore.go b/pkg/sentry/fsimpl/gofer/save_restore.go
new file mode 100644
index 000000000..2ea224c43
--- /dev/null
+++ b/pkg/sentry/fsimpl/gofer/save_restore.go
@@ -0,0 +1,329 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gofer
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/p9"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+type saveRestoreContextID int
+
+const (
+	// CtxRestoreServerFDMap is a Context.Value key for a map[string]int
+	// mapping filesystem unique IDs (cf. InternalFilesystemOptions.UniqueID)
+	// to host FDs.
+	CtxRestoreServerFDMap saveRestoreContextID = iota
+)
+
+// +stateify savable
+type savedDentryRW struct {
+	read  bool
+	write bool
+}
+
+// PreprareSave implements vfs.FilesystemImplSaveRestoreExtension.PrepareSave.
+func (fs *filesystem) PrepareSave(ctx context.Context) error {
+	if len(fs.iopts.UniqueID) == 0 {
+		return fmt.Errorf("gofer.filesystem with no UniqueID cannot be saved")
+	}
+
+	// Purge cached dentries, which may not be reopenable after restore due to
+	// permission changes.
+	fs.renameMu.Lock()
+	fs.evictAllCachedDentriesLocked(ctx)
+	fs.renameMu.Unlock()
+
+	// Buffer pipe data so that it's available for reading after restore. (This
+	// is a legacy VFS1 feature.)
+	fs.syncMu.Lock()
+	for sffd := range fs.specialFileFDs {
+		if sffd.dentry().fileType() == linux.S_IFIFO && sffd.vfsfd.IsReadable() {
+			if err := sffd.savePipeData(ctx); err != nil {
+				fs.syncMu.Unlock()
+				return err
+			}
+		}
+	}
+	fs.syncMu.Unlock()
+
+	// Flush local state to the remote filesystem.
+	if err := fs.Sync(ctx); err != nil {
+		return err
+	}
+
+	fs.savedDentryRW = make(map[*dentry]savedDentryRW)
+	return fs.root.prepareSaveRecursive(ctx)
+}
+
+// Preconditions:
+// * fd represents a pipe.
+// * fd is readable.
+func (fd *specialFileFD) savePipeData(ctx context.Context) error {
+	fd.bufMu.Lock()
+	defer fd.bufMu.Unlock()
+	var buf [usermem.PageSize]byte
+	for {
+		n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), ^uint64(0))
+		if n != 0 {
+			fd.buf = append(fd.buf, buf[:n]...)
+		}
+		if err != nil {
+			if err == io.EOF || err == syserror.EAGAIN {
+				break
+			}
+			return err
+		}
+	}
+	if len(fd.buf) != 0 {
+		atomic.StoreUint32(&fd.haveBuf, 1)
+	}
+	return nil
+}
+
+func (d *dentry) prepareSaveRecursive(ctx context.Context) error {
+	if d.isRegularFile() && !d.cachedMetadataAuthoritative() {
+		// Get updated metadata for d in case we need to perform metadata
+		// validation during restore.
+		if err := d.updateFromGetattr(ctx); err != nil {
+			return err
+		}
+	}
+	if !d.readFile.isNil() || !d.writeFile.isNil() {
+		d.fs.savedDentryRW[d] = savedDentryRW{
+			read:  !d.readFile.isNil(),
+			write: !d.writeFile.isNil(),
+		}
+	}
+	d.dirMu.Lock()
+	defer d.dirMu.Unlock()
+	for _, child := range d.children {
+		if child != nil {
+			if err := child.prepareSaveRecursive(ctx); err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
+
+// beforeSave is invoked by stateify.
+func (d *dentry) beforeSave() {
+	if d.vfsd.IsDead() {
+		panic(fmt.Sprintf("gofer.dentry(%q).beforeSave: deleted and invalidated dentries can't be restored", genericDebugPathname(d)))
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (d *dentry) afterLoad() {
+	d.hostFD = -1
+	if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+		refsvfs2.Register(d, "gofer.dentry")
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (d *dentryPlatformFile) afterLoad() {
+	if d.hostFileMapper.IsInited() {
+		// Ensure that we don't call d.hostFileMapper.Init() again.
+		d.hostFileMapperInitOnce.Do(func() {})
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (fd *specialFileFD) afterLoad() {
+	fd.handle.fd = -1
+}
+
+// CompleteRestore implements
+// vfs.FilesystemImplSaveRestoreExtension.CompleteRestore.
+func (fs *filesystem) CompleteRestore(ctx context.Context, opts vfs.CompleteRestoreOptions) error {
+	fdmapv := ctx.Value(CtxRestoreServerFDMap)
+	if fdmapv == nil {
+		return fmt.Errorf("no server FD map available")
+	}
+	fdmap := fdmapv.(map[string]int)
+	fd, ok := fdmap[fs.iopts.UniqueID]
+	if !ok {
+		return fmt.Errorf("no server FD available for filesystem with unique ID %q", fs.iopts.UniqueID)
+	}
+	fs.opts.fd = fd
+	if err := fs.dial(ctx); err != nil {
+		return err
+	}
+	fs.inoByQIDPath = make(map[uint64]uint64)
+
+	// Restore the filesystem root.
+	ctx.UninterruptibleSleepStart(false)
+	attached, err := fs.client.Attach(fs.opts.aname)
+	ctx.UninterruptibleSleepFinish(false)
+	if err != nil {
+		return err
+	}
+	attachFile := p9file{attached}
+	qid, attrMask, attr, err := attachFile.getAttr(ctx, dentryAttrMask())
+	if err != nil {
+		return err
+	}
+	if err := fs.root.restoreFile(ctx, attachFile, qid, attrMask, &attr, &opts); err != nil {
+		return err
+	}
+
+	// Restore remaining dentries.
+	if err := fs.root.restoreDescendantsRecursive(ctx, &opts); err != nil {
+		return err
+	}
+
+	// Re-open handles for specialFileFDs. Unlike the initial open
+	// (dentry.openSpecialFile()), pipes are always opened without blocking;
+	// non-readable pipe FDs are opened last to ensure that they don't get
+	// ENXIO if another specialFileFD represents the read end of the same pipe.
+	// This is consistent with VFS1.
+	haveWriteOnlyPipes := false
+	for fd := range fs.specialFileFDs {
+		if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() {
+			haveWriteOnlyPipes = true
+			continue
+		}
+		if err := fd.completeRestore(ctx); err != nil {
+			return err
+		}
+	}
+	if haveWriteOnlyPipes {
+		for fd := range fs.specialFileFDs {
+			if fd.dentry().fileType() == linux.S_IFIFO && !fd.vfsfd.IsReadable() {
+				if err := fd.completeRestore(ctx); err != nil {
+					return err
+				}
+			}
+		}
+	}
+
+	// Discard state only required during restore.
+	fs.savedDentryRW = nil
+
+	return nil
+}
+
+func (d *dentry) restoreFile(ctx context.Context, file p9file, qid p9.QID, attrMask p9.AttrMask, attr *p9.Attr, opts *vfs.CompleteRestoreOptions) error {
+	d.file = file
+
+	// Gofers do not preserve QID across checkpoint/restore, so:
+	//
+	// - We must assume that the remote filesystem did not change in a way that
+	// would invalidate dentries, since we can't revalidate dentries by
+	// checking QIDs.
+	//
+	// - We need to associate the new QID.Path with the existing d.ino.
+	d.qidPath = qid.Path
+	d.fs.inoMu.Lock()
+	d.fs.inoByQIDPath[qid.Path] = d.ino
+	d.fs.inoMu.Unlock()
+
+	// Check metadata stability before updating metadata.
+	d.metadataMu.Lock()
+	defer d.metadataMu.Unlock()
+	if d.isRegularFile() {
+		if opts.ValidateFileSizes {
+			if !attrMask.Size {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: file size not available", genericDebugPathname(d))
+			}
+			if d.size != attr.Size {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(d), d.size, attr.Size)
+			}
+		}
+		if opts.ValidateFileModificationTimestamps {
+			if !attrMask.MTime {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime not available", genericDebugPathname(d))
+			}
+			if want := dentryTimestampFromP9(attr.MTimeSeconds, attr.MTimeNanoSeconds); d.mtime != want {
+				return fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(d), linux.NsecToStatxTimestamp(d.mtime), linux.NsecToStatxTimestamp(want))
+			}
+		}
+	}
+	if !d.cachedMetadataAuthoritative() {
+		d.updateFromP9AttrsLocked(attrMask, attr)
+	}
+
+	if rw, ok := d.fs.savedDentryRW[d]; ok {
+		if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Preconditions: d is not synthetic.
+func (d *dentry) restoreDescendantsRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
+	for _, child := range d.children {
+		if child == nil {
+			continue
+		}
+		if _, ok := d.fs.syncableDentries[child]; !ok {
+			// child is synthetic.
+			continue
+		}
+		if err := child.restoreRecursive(ctx, opts); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// Preconditions: d is not synthetic (but note that since this function
+// restores d.file, d.file.isNil() is always true at this point, so this can
+// only be detected by checking filesystem.syncableDentries). d.parent has been
+// restored.
+func (d *dentry) restoreRecursive(ctx context.Context, opts *vfs.CompleteRestoreOptions) error {
+	qid, file, attrMask, attr, err := d.parent.file.walkGetAttrOne(ctx, d.name)
+	if err != nil {
+		return err
+	}
+	if err := d.restoreFile(ctx, file, qid, attrMask, &attr, opts); err != nil {
+		return err
+	}
+	return d.restoreDescendantsRecursive(ctx, opts)
+}
+
+func (fd *specialFileFD) completeRestore(ctx context.Context) error {
+	d := fd.dentry()
+	h, err := openHandle(ctx, d.file, fd.vfsfd.IsReadable(), fd.vfsfd.IsWritable(), false /* trunc */)
+	if err != nil {
+		return err
+	}
+	fd.handle = h
+
+	ftype := d.fileType()
+	fd.haveQueue = (ftype == linux.S_IFIFO || ftype == linux.S_IFSOCK) && fd.handle.fd >= 0
+	if fd.haveQueue {
+		if err := fdnotifier.AddFD(fd.handle.fd, &fd.queue); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/gofer/socket.go b/pkg/sentry/fsimpl/gofer/socket.go
index 85d2bee72..a21199eac 100644
--- a/pkg/sentry/fsimpl/gofer/socket.go
+++ b/pkg/sentry/fsimpl/gofer/socket.go
@@ -36,13 +36,12 @@ func (d *dentry) isSocket() bool {
 // An endpoint's lifetime is the time between when filesystem.BoundEndpointAt()
 // is called and either BoundEndpoint.BidirectionalConnect or
 // BoundEndpoint.UnidirectionalConnect is called.
+//
+// +stateify savable
 type endpoint struct {
 	// dentry is the filesystem dentry which produced this endpoint.
 	dentry *dentry
 
-	// file is the p9 file that contains a single unopened fid.
-	file p9.File
-
 	// path is the sentry path where this endpoint is bound.
 	path string
 }
@@ -114,7 +113,7 @@ func (e *endpoint) UnidirectionalConnect(ctx context.Context) (transport.Connect
 }
 
 func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFlags, queue *waiter.Queue) (*host.SCMConnectedEndpoint, *syserr.Error) {
-	hostFile, err := e.file.Connect(flags)
+	hostFile, err := e.dentry.file.connect(ctx, flags)
 	if err != nil {
 		return nil, syserr.ErrConnectionRefused
 	}
@@ -129,7 +128,7 @@ func (e *endpoint) newConnectedEndpoint(ctx context.Context, flags p9.ConnectFla
 
 	c, serr := host.NewSCMEndpoint(ctx, hostFD, queue, e.path)
 	if serr != nil {
-		log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.file, flags, serr)
+		log.Warningf("Gofer returned invalid host socket for BidirectionalConnect; file %+v flags %+v: %v", e.dentry.file, flags, serr)
 		return nil, serr
 	}
 	return c, nil
diff --git a/pkg/sentry/fsimpl/gofer/special_file.go b/pkg/sentry/fsimpl/gofer/special_file.go
index 3c39aa9b7..625400c0b 100644
--- a/pkg/sentry/fsimpl/gofer/special_file.go
+++ b/pkg/sentry/fsimpl/gofer/special_file.go
@@ -15,15 +15,16 @@
 package gofer
 
 import (
-	"sync"
 	"sync/atomic"
 	"syscall"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/p9"
 	"gvisor.dev/gvisor/pkg/safemem"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
 	"gvisor.dev/gvisor/pkg/waiter"
@@ -33,11 +34,13 @@ import (
 // special files, and (when filesystemOptions.regularFilesUseSpecialFileFD is
 // in effect) regular files. specialFileFD differs from regularFileFD by using
 // per-FD handles instead of shared per-dentry handles, and never buffering I/O.
+//
+// +stateify savable
 type specialFileFD struct {
 	fileDescription
 
 	// handle is used for file I/O. handle is immutable.
-	handle handle
+	handle handle `state:"nosave"`
 
 	// isRegularFile is true if this FD represents a regular file which is only
 	// possible when filesystemOptions.regularFilesUseSpecialFileFD is in
@@ -51,12 +54,20 @@ type specialFileFD struct {
 
 	// haveQueue is true if this file description represents a file for which
 	// queue may send I/O readiness events. haveQueue is immutable.
-	haveQueue bool
+	haveQueue bool `state:"nosave"`
 	queue     waiter.Queue
 
 	// If seekable is true, off is the file offset. off is protected by mu.
-	mu  sync.Mutex
+	mu  sync.Mutex `state:"nosave"`
 	off int64
+
+	// If haveBuf is non-zero, this FD represents a pipe, and buf contains data
+	// read from the pipe from previous calls to specialFileFD.savePipeData().
+	// haveBuf and buf are protected by bufMu. haveBuf is accessed using atomic
+	// memory operations.
+	bufMu   sync.Mutex `state:"nosave"`
+	haveBuf uint32
+	buf     []byte
 }
 
 func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks, flags uint32) (*specialFileFD, error) {
@@ -84,6 +95,9 @@ func newSpecialFileFD(h handle, mnt *vfs.Mount, d *dentry, locks *vfs.FileLocks,
 		}
 		return nil, err
 	}
+	d.fs.syncMu.Lock()
+	d.fs.specialFileFDs[fd] = struct{}{}
+	d.fs.syncMu.Unlock()
 	return fd, nil
 }
 
@@ -135,6 +149,16 @@ func (fd *specialFileFD) EventUnregister(e *waiter.Entry) {
 	fd.fileDescription.EventUnregister(e)
 }
 
+func (fd *specialFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	if fd.isRegularFile {
+		d := fd.dentry()
+		return d.doAllocate(ctx, offset, length, func() error {
+			return fd.handle.file.allocate(ctx, p9.ToAllocateMode(mode), offset, length)
+		})
+	}
+	return fd.FileDescriptionDefaultImpl.Allocate(ctx, mode, offset, length)
+}
+
 // PRead implements vfs.FileDescriptionImpl.PRead.
 func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
 	if fd.seekable && offset < 0 {
@@ -148,26 +172,51 @@ func (fd *specialFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offs
 		return 0, syserror.EOPNOTSUPP
 	}
 
-	// Going through dst.CopyOutFrom() holds MM locks around file operations of
-	// unknown duration. For regularFileFD, doing so is necessary to support
-	// mmap due to lock ordering; MM locks precede dentry.dataMu. That doesn't
-	// hold here since specialFileFD doesn't client-cache data. Just buffer the
-	// read instead.
 	if d := fd.dentry(); d.cachedMetadataAuthoritative() {
 		d.touchAtime(fd.vfsfd.Mount())
 	}
+
+	bufN := int64(0)
+	if atomic.LoadUint32(&fd.haveBuf) != 0 {
+		var err error
+		fd.bufMu.Lock()
+		if len(fd.buf) != 0 {
+			var n int
+			n, err = dst.CopyOut(ctx, fd.buf)
+			dst = dst.DropFirst(n)
+			fd.buf = fd.buf[n:]
+			if len(fd.buf) == 0 {
+				atomic.StoreUint32(&fd.haveBuf, 0)
+				fd.buf = nil
+			}
+			bufN = int64(n)
+			if offset >= 0 {
+				offset += bufN
+			}
+		}
+		fd.bufMu.Unlock()
+		if err != nil {
+			return bufN, err
+		}
+	}
+
+	// Going through dst.CopyOutFrom() would hold MM locks around file
+	// operations of unknown duration. For regularFileFD, doing so is necessary
+	// to support mmap due to lock ordering; MM locks precede dentry.dataMu.
+	// That doesn't hold here since specialFileFD doesn't client-cache data.
+	// Just buffer the read instead.
 	buf := make([]byte, dst.NumBytes())
 	n, err := fd.handle.readToBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
 	if err == syserror.EAGAIN {
 		err = syserror.ErrWouldBlock
 	}
 	if n == 0 {
-		return 0, err
+		return bufN, err
 	}
 	if cp, cperr := dst.CopyOut(ctx, buf[:n]); cperr != nil {
-		return int64(cp), cperr
+		return bufN + int64(cp), cperr
 	}
-	return int64(n), err
+	return bufN + int64(n), err
 }
 
 // Read implements vfs.FileDescriptionImpl.Read.
@@ -204,16 +253,16 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 	}
 
 	d := fd.dentry()
-	// If the regular file fd was opened with O_APPEND, make sure the file size
-	// is updated. There is a possible race here if size is modified externally
-	// after metadata cache is updated.
-	if fd.isRegularFile && fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
-		if err := d.updateFromGetattr(ctx); err != nil {
-			return 0, offset, err
+	if fd.isRegularFile {
+		// If the regular file fd was opened with O_APPEND, make sure the file
+		// size is updated. There is a possible race here if size is modified
+		// externally after metadata cache is updated.
+		if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 && !d.cachedMetadataAuthoritative() {
+			if err := d.updateFromGetattr(ctx); err != nil {
+				return 0, offset, err
+			}
 		}
-	}
 
-	if fd.isRegularFile {
 		// We need to hold the metadataMu *while* writing to a regular file.
 		d.metadataMu.Lock()
 		defer d.metadataMu.Unlock()
@@ -235,11 +284,12 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 		d.touchCMtime()
 	}
 	buf := make([]byte, src.NumBytes())
-	// Don't do partial writes if we get a partial read from src.
-	if _, err := src.CopyIn(ctx, buf); err != nil {
-		return 0, offset, err
+	copied, copyErr := src.CopyIn(ctx, buf)
+	if copied == 0 && copyErr != nil {
+		// Only return the error if we didn't get any data.
+		return 0, offset, copyErr
 	}
-	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf)), uint64(offset))
+	n, err := fd.handle.writeFromBlocksAt(ctx, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:copied])), uint64(offset))
 	if err == syserror.EAGAIN {
 		err = syserror.ErrWouldBlock
 	}
@@ -256,7 +306,10 @@ func (fd *specialFileFD) pwrite(ctx context.Context, src usermem.IOSequence, off
 			atomic.StoreUint64(&d.size, uint64(offset))
 		}
 	}
-	return int64(n), offset, err
+	if err != nil {
+		return int64(n), offset, err
+	}
+	return int64(n), offset, copyErr
 }
 
 // Write implements vfs.FileDescriptionImpl.Write.
@@ -289,13 +342,31 @@ func (fd *specialFileFD) Seek(ctx context.Context, offset int64, whence int32) (
 
 // Sync implements vfs.FileDescriptionImpl.Sync.
 func (fd *specialFileFD) Sync(ctx context.Context) error {
-	// If we have a host FD, fsyncing it is likely to be faster than an fsync
-	// RPC.
-	if fd.handle.fd >= 0 {
-		ctx.UninterruptibleSleepStart(false)
-		err := syscall.Fsync(int(fd.handle.fd))
-		ctx.UninterruptibleSleepFinish(false)
-		return err
+	return fd.sync(ctx, false /* forFilesystemSync */)
+}
+
+func (fd *specialFileFD) sync(ctx context.Context, forFilesystemSync bool) error {
+	err := func() error {
+		// If we have a host FD, fsyncing it is likely to be faster than an fsync
+		// RPC.
+		if fd.handle.fd >= 0 {
+			ctx.UninterruptibleSleepStart(false)
+			err := syscall.Fsync(int(fd.handle.fd))
+			ctx.UninterruptibleSleepFinish(false)
+			return err
+		}
+		return fd.handle.file.fsync(ctx)
+	}()
+	if err != nil {
+		if !forFilesystemSync {
+			return err
+		}
+		// Only return err if we can reasonably have expected sync to succeed
+		// (fd represents a regular file that was opened for writing).
+		if fd.isRegularFile && fd.vfsfd.IsWritable() {
+			return err
+		}
+		ctx.Debugf("gofer.specialFileFD.sync: syncing non-writable or non-regular-file FD failed: %v", err)
 	}
-	return fd.handle.file.fsync(ctx)
+	return nil
 }
diff --git a/pkg/sentry/fsimpl/gofer/time.go b/pkg/sentry/fsimpl/gofer/time.go
index e59d07e90..9cbe805b9 100644
--- a/pkg/sentry/fsimpl/gofer/time.go
+++ b/pkg/sentry/fsimpl/gofer/time.go
@@ -17,7 +17,6 @@ package gofer
 import (
 	"sync/atomic"
 
-	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
@@ -25,17 +24,6 @@ func dentryTimestampFromP9(s, ns uint64) int64 {
 	return int64(s*1e9 + ns)
 }
 
-func dentryTimestampFromStatx(ts linux.StatxTimestamp) int64 {
-	return ts.Sec*1e9 + int64(ts.Nsec)
-}
-
-func statxTimestampFromDentry(ns int64) linux.StatxTimestamp {
-	return linux.StatxTimestamp{
-		Sec:  ns / 1e9,
-		Nsec: uint32(ns % 1e9),
-	}
-}
-
 // Preconditions: d.cachedMetadataAuthoritative() == true.
 func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	if mnt.Flags.NoATime || mnt.ReadOnly() {
@@ -52,8 +40,23 @@ func (d *dentry) touchAtime(mnt *vfs.Mount) {
 	mnt.EndWrite()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions: d.metadataMu is locked. d.cachedMetadataAuthoritative() == true.
+func (d *dentry) touchAtimeLocked(mnt *vfs.Mount) {
+	if mnt.Flags.NoATime || mnt.ReadOnly() {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	now := d.fs.clock.Now().Nanoseconds()
+	atomic.StoreInt64(&d.atime, now)
+	atomic.StoreUint32(&d.atimeDirty, 1)
+	mnt.EndWrite()
+}
+
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -61,8 +64,9 @@ func (d *dentry) touchCtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// successfully called vfs.Mount.CheckBeginWrite().
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has successfully called vfs.Mount.CheckBeginWrite().
 func (d *dentry) touchCMtime() {
 	now := d.fs.clock.Now().Nanoseconds()
 	d.metadataMu.Lock()
@@ -72,8 +76,9 @@ func (d *dentry) touchCMtime() {
 	d.metadataMu.Unlock()
 }
 
-// Preconditions: d.cachedMetadataAuthoritative() == true. The caller has
-// locked d.metadataMu.
+// Preconditions:
+// * d.cachedMetadataAuthoritative() == true.
+// * The caller has locked d.metadataMu.
 func (d *dentry) touchCMtimeLocked() {
 	now := d.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&d.mtime, now)
diff --git a/pkg/sentry/fsimpl/host/BUILD b/pkg/sentry/fsimpl/host/BUILD
index bd701bbc7..dc0f86061 100644
--- a/pkg/sentry/fsimpl/host/BUILD
+++ b/pkg/sentry/fsimpl/host/BUILD
@@ -1,14 +1,40 @@
 load("//tools:defs.bzl", "go_library")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "inode_refs",
+    out = "inode_refs.go",
+    package = "host",
+    prefix = "inode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "inode",
+    },
+)
+
+go_template_instance(
+    name = "connected_endpoint_refs",
+    out = "connected_endpoint_refs.go",
+    package = "host",
+    prefix = "ConnectedEndpoint",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "ConnectedEndpoint",
+    },
+)
+
 go_library(
     name = "host",
     srcs = [
+        "connected_endpoint_refs.go",
         "control.go",
         "host.go",
+        "inode_refs.go",
         "ioctl_unsafe.go",
         "mmap.go",
+        "save_restore.go",
         "socket.go",
         "socket_iovec.go",
         "socket_unsafe.go",
@@ -24,7 +50,9 @@ go_library(
         "//pkg/fspath",
         "//pkg/iovec",
         "//pkg/log",
+        "//pkg/marshal/primitive",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs/fsutil",
diff --git a/pkg/sentry/fsimpl/host/control.go b/pkg/sentry/fsimpl/host/control.go
index 0135e4428..13ef48cb5 100644
--- a/pkg/sentry/fsimpl/host/control.go
+++ b/pkg/sentry/fsimpl/host/control.go
@@ -79,7 +79,7 @@ func fdsToFiles(ctx context.Context, fds []int) []*vfs.FileDescription {
 		}
 
 		// Create the file backed by hostFD.
-		file, err := ImportFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, false /* isTTY */)
+		file, err := NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, &NewFDOptions{})
 		if err != nil {
 			ctx.Warningf("Error creating file from host FD: %v", err)
 			break
diff --git a/pkg/sentry/fsimpl/host/host.go b/pkg/sentry/fsimpl/host/host.go
index 56869f59a..eeed0f97d 100644
--- a/pkg/sentry/fsimpl/host/host.go
+++ b/pkg/sentry/fsimpl/host/host.go
@@ -19,6 +19,7 @@ package host
 import (
 	"fmt"
 	"math"
+	"sync/atomic"
 	"syscall"
 
 	"golang.org/x/sys/unix"
@@ -27,7 +28,6 @@ import (
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/hostfd"
@@ -41,8 +41,123 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
+// inode implements kernfs.Inode.
+//
+// +stateify savable
+type inode struct {
+	kernfs.InodeNoStatFS
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
+
+	locks vfs.FileLocks
+
+	// When the reference count reaches zero, the host fd is closed.
+	inodeRefs
+
+	// hostFD contains the host fd that this file was originally created from,
+	// which must be available at time of restore.
+	//
+	// This field is initialized at creation time and is immutable.
+	hostFD int
+
+	// ino is an inode number unique within this filesystem.
+	//
+	// This field is initialized at creation time and is immutable.
+	ino uint64
+
+	// ftype is the file's type (a linux.S_IFMT mask).
+	//
+	// This field is initialized at creation time and is immutable.
+	ftype uint16
+
+	// mayBlock is true if hostFD is non-blocking, and operations on it may
+	// return EAGAIN or EWOULDBLOCK instead of blocking.
+	//
+	// This field is initialized at creation time and is immutable.
+	mayBlock bool
+
+	// seekable is false if lseek(hostFD) returns ESPIPE. We assume that file
+	// offsets are meaningful iff seekable is true.
+	//
+	// This field is initialized at creation time and is immutable.
+	seekable bool
+
+	// isTTY is true if this file represents a TTY.
+	//
+	// This field is initialized at creation time and is immutable.
+	isTTY bool
+
+	// savable is true if hostFD may be saved/restored by its numeric value.
+	//
+	// This field is initialized at creation time and is immutable.
+	savable bool
+
+	// Event queue for blocking operations.
+	queue waiter.Queue
+
+	// mapsMu protects mappings.
+	mapsMu sync.Mutex `state:"nosave"`
+
+	// If this file is mmappable, mappings tracks mappings of hostFD into
+	// memmap.MappingSpaces.
+	mappings memmap.MappingSet
+
+	// pf implements platform.File for mappings of hostFD.
+	pf inodePlatformFile
+
+	// If haveBuf is non-zero, hostFD represents a pipe, and buf contains data
+	// read from the pipe from previous calls to inode.beforeSave(). haveBuf
+	// and buf are protected by bufMu. haveBuf is accessed using atomic memory
+	// operations.
+	bufMu   sync.Mutex `state:"nosave"`
+	haveBuf uint32
+	buf     []byte
+}
+
+func newInode(ctx context.Context, fs *filesystem, hostFD int, savable bool, fileType linux.FileMode, isTTY bool) (*inode, error) {
+	// Determine if hostFD is seekable.
+	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
+	seekable := err != syserror.ESPIPE
+	// We expect regular files to be seekable, as this is required for them to
+	// be memory-mappable.
+	if !seekable && fileType == syscall.S_IFREG {
+		ctx.Infof("host.newInode: host FD %d is a non-seekable regular file", hostFD)
+		return nil, syserror.ESPIPE
+	}
+
+	i := &inode{
+		hostFD:   hostFD,
+		ino:      fs.NextIno(),
+		ftype:    uint16(fileType),
+		mayBlock: fileType != syscall.S_IFREG && fileType != syscall.S_IFDIR,
+		seekable: seekable,
+		isTTY:    isTTY,
+		savable:  savable,
+	}
+	i.pf.inode = i
+	i.EnableLeakCheck()
+
+	// If the hostFD can return EWOULDBLOCK when set to non-blocking, do so and
+	// handle blocking behavior in the sentry.
+	if i.mayBlock {
+		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+			return nil, err
+		}
+		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+			return nil, err
+		}
+	}
+	return i, nil
+}
+
 // NewFDOptions contains options to NewFD.
 type NewFDOptions struct {
+	// If Savable is true, the host file descriptor may be saved/restored by
+	// numeric value; the sandbox API requires a corresponding host FD with the
+	// same numeric value to be provieded at time of restore.
+	Savable bool
+
 	// If IsTTY is true, the file descriptor is a TTY.
 	IsTTY bool
 
@@ -76,45 +191,12 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 		flags = uint32(flagsInt)
 	}
 
-	fileMode := linux.FileMode(s.Mode)
-	fileType := fileMode.FileType()
-
-	// Determine if hostFD is seekable. If not, this syscall will return ESPIPE
-	// (see fs/read_write.c:llseek), e.g. for pipes, sockets, and some character
-	// devices.
-	_, err := unix.Seek(hostFD, 0, linux.SEEK_CUR)
-	seekable := err != syserror.ESPIPE
-
-	i := &inode{
-		hostFD:     hostFD,
-		ino:        fs.NextIno(),
-		isTTY:      opts.IsTTY,
-		wouldBlock: wouldBlock(uint32(fileType)),
-		seekable:   seekable,
-		// NOTE(b/38213152): Technically, some obscure char devices can be memory
-		// mapped, but we only allow regular files.
-		canMap: fileType == linux.S_IFREG,
-	}
-	i.pf.inode = i
-
-	// Non-seekable files can't be memory mapped, assert this.
-	if !i.seekable && i.canMap {
-		panic("files that can return EWOULDBLOCK (sockets, pipes, etc.) cannot be memory mapped")
-	}
-
-	// If the hostFD would block, we must set it to non-blocking and handle
-	// blocking behavior in the sentry.
-	if i.wouldBlock {
-		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
-			return nil, err
-		}
-		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
-			return nil, err
-		}
-	}
-
 	d := &kernfs.Dentry{}
-	d.Init(i)
+	i, err := newInode(ctx, fs, hostFD, opts.Savable, linux.FileMode(s.Mode).FileType(), opts.IsTTY)
+	if err != nil {
+		return nil, err
+	}
+	d.Init(&fs.Filesystem, i)
 
 	// i.open will take a reference on d.
 	defer d.DecRef(ctx)
@@ -122,29 +204,35 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
 	// For simplicity, fileDescription.offset is set to 0. Technically, we
 	// should only set to 0 on files that are not seekable (sockets, pipes,
 	// etc.), and use the offset from the host fd otherwise when importing.
-	return i.open(ctx, d.VFSDentry(), mnt, flags)
+	return i.open(ctx, d, mnt, flags)
 }
 
 // ImportFD sets up and returns a vfs.FileDescription from a donated fd.
 func ImportFD(ctx context.Context, mnt *vfs.Mount, hostFD int, isTTY bool) (*vfs.FileDescription, error) {
 	return NewFD(ctx, mnt, hostFD, &NewFDOptions{
-		IsTTY: isTTY,
+		Savable: true,
+		IsTTY:   isTTY,
 	})
 }
 
 // filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type filesystemType struct{}
 
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (filesystemType) GetFilesystem(context.Context, *vfs.VirtualFilesystem, *auth.Credentials, string, vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	panic("host.filesystemType.GetFilesystem should never be called")
 }
 
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
 func (filesystemType) Name() string {
 	return "none"
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
 // NewFilesystem sets up and returns a new hostfs filesystem.
 //
 // Note that there should only ever be one instance of host.filesystem,
@@ -162,6 +250,8 @@ func NewFilesystem(vfsObj *vfs.VirtualFilesystem) (*vfs.Filesystem, error) {
 }
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -180,65 +270,7 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	return vfs.PrependPathSyntheticError{}
 }
 
-// inode implements kernfs.Inode.
-type inode struct {
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
-
-	locks vfs.FileLocks
-
-	// When the reference count reaches zero, the host fd is closed.
-	refs.AtomicRefCount
-
-	// hostFD contains the host fd that this file was originally created from,
-	// which must be available at time of restore.
-	//
-	// This field is initialized at creation time and is immutable.
-	hostFD int
-
-	// ino is an inode number unique within this filesystem.
-	//
-	// This field is initialized at creation time and is immutable.
-	ino uint64
-
-	// isTTY is true if this file represents a TTY.
-	//
-	// This field is initialized at creation time and is immutable.
-	isTTY bool
-
-	// seekable is false if the host fd points to a file representing a stream,
-	// e.g. a socket or a pipe. Such files are not seekable and can return
-	// EWOULDBLOCK for I/O operations.
-	//
-	// This field is initialized at creation time and is immutable.
-	seekable bool
-
-	// wouldBlock is true if the host FD would return EWOULDBLOCK for
-	// operations that would block.
-	//
-	// This field is initialized at creation time and is immutable.
-	wouldBlock bool
-
-	// Event queue for blocking operations.
-	queue waiter.Queue
-
-	// canMap specifies whether we allow the file to be memory mapped.
-	//
-	// This field is initialized at creation time and is immutable.
-	canMap bool
-
-	// mapsMu protects mappings.
-	mapsMu sync.Mutex
-
-	// If canMap is true, mappings tracks mappings of hostFD into
-	// memmap.MappingSpaces.
-	mappings memmap.MappingSet
-
-	// pf implements platform.File for mappings of hostFD.
-	pf inodePlatformFile
-}
-
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
 func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -247,7 +279,7 @@ func (i *inode) CheckPermissions(ctx context.Context, creds *auth.Credentials, a
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(s.Mode), auth.KUID(s.Uid), auth.KGID(s.Gid))
 }
 
-// Mode implements kernfs.Inode.
+// Mode implements kernfs.Inode.Mode.
 func (i *inode) Mode() linux.FileMode {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
@@ -258,7 +290,7 @@ func (i *inode) Mode() linux.FileMode {
 	return linux.FileMode(s.Mode)
 }
 
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
 func (i *inode) Stat(ctx context.Context, vfsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	if opts.Mask&linux.STATX__RESERVED != 0 {
 		return linux.Statx{}, syserror.EINVAL
@@ -371,7 +403,7 @@ func (i *inode) fstat(fs *filesystem) (linux.Statx, error) {
 	}, nil
 }
 
-// SetStat implements kernfs.Inode.
+// SetStat implements kernfs.Inode.SetStat.
 func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
 	s := &opts.Stat
 
@@ -430,10 +462,10 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
 	return nil
 }
 
-// DecRef implements kernfs.Inode.
+// DecRef implements kernfs.Inode.DecRef.
 func (i *inode) DecRef(ctx context.Context) {
-	i.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
-		if i.wouldBlock {
+	i.inodeRefs.DecRef(func() {
+		if i.mayBlock {
 			fdnotifier.RemoveFD(int32(i.hostFD))
 		}
 		if err := unix.Close(i.hostFD); err != nil {
@@ -442,16 +474,16 @@ func (i *inode) DecRef(ctx context.Context) {
 	})
 }
 
-// Open implements kernfs.Inode.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	// Once created, we cannot re-open a socket fd through /proc/[pid]/fd/.
 	if i.Mode().FileType() == linux.S_IFSOCK {
 		return nil, syserror.ENXIO
 	}
-	return i.open(ctx, vfsd, rp.Mount(), opts.Flags)
+	return i.open(ctx, d, rp.Mount(), opts.Flags)
 }
 
-func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
+func (i *inode) open(ctx context.Context, d *kernfs.Dentry, mnt *vfs.Mount, flags uint32) (*vfs.FileDescription, error) {
 	var s syscall.Stat_t
 	if err := syscall.Fstat(i.hostFD, &s); err != nil {
 		return nil, err
@@ -475,17 +507,17 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 			return nil, err
 		}
 		// Currently, we only allow Unix sockets to be imported.
-		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d, &i.locks)
+		return unixsocket.NewFileDescription(ep, ep.Type(), flags, mnt, d.VFSDentry(), &i.locks)
 
 	case syscall.S_IFREG, syscall.S_IFIFO, syscall.S_IFCHR:
 		if i.isTTY {
 			fd := &TTYFileDescription{
 				fileDescription: fileDescription{inode: i},
-				termios:         linux.DefaultSlaveTermios,
+				termios:         linux.DefaultReplicaTermios,
 			}
 			fd.LockFD.Init(&i.locks)
 			vfsfd := &fd.vfsfd
-			if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+			if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 				return nil, err
 			}
 			return vfsfd, nil
@@ -494,7 +526,7 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 		fd := &fileDescription{inode: i}
 		fd.LockFD.Init(&i.locks)
 		vfsfd := &fd.vfsfd
-		if err := vfsfd.Init(fd, flags, mnt, d, &vfs.FileDescriptionOptions{}); err != nil {
+		if err := vfsfd.Init(fd, flags, mnt, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 			return nil, err
 		}
 		return vfsfd, nil
@@ -506,6 +538,8 @@ func (i *inode) open(ctx context.Context, d *vfs.Dentry, mnt *vfs.Mount, flags u
 }
 
 // fileDescription is embedded by host fd implementations of FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -520,41 +554,43 @@ type fileDescription struct {
 	inode *inode
 
 	// offsetMu protects offset.
-	offsetMu sync.Mutex
+	offsetMu sync.Mutex `state:"nosave"`
 
 	// offset specifies the current file offset. It is only meaningful when
 	// inode.seekable is true.
 	offset int64
 }
 
-// SetStat implements vfs.FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (f *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
 	return f.inode.SetStat(ctx, f.vfsfd.Mount().Filesystem(), creds, opts)
 }
 
-// Stat implements vfs.FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
 func (f *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	return f.inode.Stat(ctx, f.vfsfd.Mount().Filesystem(), opts)
 }
 
-// Release implements vfs.FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
 func (f *fileDescription) Release(context.Context) {
 	// noop
 }
 
-// Allocate implements vfs.FileDescriptionImpl.
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
 func (f *fileDescription) Allocate(ctx context.Context, mode, offset, length uint64) error {
-	if !f.inode.seekable {
-		return syserror.ESPIPE
-	}
-
-	// TODO(gvisor.dev/issue/3589): Implement Allocate for non-pipe hostfds.
-	return syserror.EOPNOTSUPP
+	return unix.Fallocate(f.inode.hostFD, uint32(mode), int64(offset), int64(length))
 }
 
-// PRead implements FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
 func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
 	i := f.inode
 	if !i.seekable {
 		return 0, syserror.ESPIPE
@@ -563,21 +599,33 @@ func (f *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, off
 	return readFromHostFD(ctx, i.hostFD, dst, offset, opts.Flags)
 }
 
-// Read implements FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Check that flags are supported.
+	//
+	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
+	if opts.Flags&^linux.RWF_HIPRI != 0 {
+		return 0, syserror.EOPNOTSUPP
+	}
+
 	i := f.inode
 	if !i.seekable {
+		bufN, err := i.readFromBuf(ctx, &dst)
+		if err != nil {
+			return bufN, err
+		}
 		n, err := readFromHostFD(ctx, i.hostFD, dst, -1, opts.Flags)
+		total := bufN + n
 		if isBlockError(err) {
 			// If we got any data at all, return it as a "completed" partial read
 			// rather than retrying until complete.
-			if n != 0 {
+			if total != 0 {
 				err = nil
 			} else {
 				err = syserror.ErrWouldBlock
 			}
 		}
-		return n, err
+		return total, err
 	}
 
 	f.offsetMu.Lock()
@@ -587,20 +635,33 @@ func (f *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts
 	return n, err
 }
 
-func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
-	// Check that flags are supported.
-	//
-	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
-	if flags&^linux.RWF_HIPRI != 0 {
-		return 0, syserror.EOPNOTSUPP
+func (i *inode) readFromBuf(ctx context.Context, dst *usermem.IOSequence) (int64, error) {
+	if atomic.LoadUint32(&i.haveBuf) == 0 {
+		return 0, nil
+	}
+	i.bufMu.Lock()
+	defer i.bufMu.Unlock()
+	if len(i.buf) == 0 {
+		return 0, nil
 	}
+	n, err := dst.CopyOut(ctx, i.buf)
+	*dst = dst.DropFirst(n)
+	i.buf = i.buf[n:]
+	if len(i.buf) == 0 {
+		atomic.StoreUint32(&i.haveBuf, 0)
+		i.buf = nil
+	}
+	return int64(n), err
+}
+
+func readFromHostFD(ctx context.Context, hostFD int, dst usermem.IOSequence, offset int64, flags uint32) (int64, error) {
 	reader := hostfd.GetReadWriterAt(int32(hostFD), offset, flags)
 	n, err := dst.CopyOutFrom(ctx, reader)
 	hostfd.PutReadWriterAt(reader)
 	return int64(n), err
 }
 
-// PWrite implements FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	if !f.inode.seekable {
 		return 0, syserror.ESPIPE
@@ -609,7 +670,7 @@ func (f *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, of
 	return f.writeToHostFD(ctx, src, offset, opts.Flags)
 }
 
-// Write implements FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
 func (f *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	i := f.inode
 	if !i.seekable {
@@ -657,7 +718,7 @@ func (f *fileDescription) writeToHostFD(ctx context.Context, src usermem.IOSeque
 	return int64(n), err
 }
 
-// Seek implements FileDescriptionImpl.
+// Seek implements vfs.FileDescriptionImpl.Seek.
 //
 // Note that we do not support seeking on directories, since we do not even
 // allow directory fds to be imported at all.
@@ -722,15 +783,17 @@ func (f *fileDescription) Seek(_ context.Context, offset int64, whence int32) (i
 	return f.offset, nil
 }
 
-// Sync implements FileDescriptionImpl.
-func (f *fileDescription) Sync(context.Context) error {
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (f *fileDescription) Sync(ctx context.Context) error {
 	// TODO(gvisor.dev/issue/1897): Currently, we always sync everything.
 	return unix.Fsync(f.inode.hostFD)
 }
 
-// ConfigureMMap implements FileDescriptionImpl.
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
 func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts) error {
-	if !f.inode.canMap {
+	// NOTE(b/38213152): Technically, some obscure char devices can be memory
+	// mapped, but we only allow regular files.
+	if f.inode.ftype != syscall.S_IFREG {
 		return syserror.ENODEV
 	}
 	i := f.inode
@@ -741,13 +804,17 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts
 // EventRegister implements waiter.Waitable.EventRegister.
 func (f *fileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
 	f.inode.queue.EventRegister(e, mask)
-	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	if f.inode.mayBlock {
+		fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	}
 }
 
 // EventUnregister implements waiter.Waitable.EventUnregister.
 func (f *fileDescription) EventUnregister(e *waiter.Entry) {
 	f.inode.queue.EventUnregister(e)
-	fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	if f.inode.mayBlock {
+		fdnotifier.UpdateFD(int32(f.inode.hostFD))
+	}
 }
 
 // Readiness uses the poll() syscall to check the status of the underlying FD.
diff --git a/pkg/sentry/fsimpl/host/mmap.go b/pkg/sentry/fsimpl/host/mmap.go
index 65d3af38c..3d7eb2f96 100644
--- a/pkg/sentry/fsimpl/host/mmap.go
+++ b/pkg/sentry/fsimpl/host/mmap.go
@@ -27,11 +27,13 @@ import (
 // cannot implement both kernfs.Inode.IncRef and memmap.File.IncRef.
 //
 // inodePlatformFile should only be used if inode.canMap is true.
+//
+// +stateify savable
 type inodePlatformFile struct {
 	*inode
 
 	// fdRefsMu protects fdRefs.
-	fdRefsMu sync.Mutex
+	fdRefsMu sync.Mutex `state:"nosave"`
 
 	// fdRefs counts references on memmap.File offsets. It is used solely for
 	// memory accounting.
@@ -41,7 +43,7 @@ type inodePlatformFile struct {
 	fileMapper fsutil.HostFileMapper
 
 	// fileMapperInitOnce is used to lazily initialize fileMapper.
-	fileMapperInitOnce sync.Once
+	fileMapperInitOnce sync.Once `state:"nosave"`
 }
 
 // IncRef implements memmap.File.IncRef.
diff --git a/pkg/sentry/fsimpl/host/save_restore.go b/pkg/sentry/fsimpl/host/save_restore.go
new file mode 100644
index 000000000..7e32a8863
--- /dev/null
+++ b/pkg/sentry/fsimpl/host/save_restore.go
@@ -0,0 +1,78 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package host
+
+import (
+	"fmt"
+	"io"
+	"sync/atomic"
+	"syscall"
+
+	"gvisor.dev/gvisor/pkg/fdnotifier"
+	"gvisor.dev/gvisor/pkg/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/hostfd"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// beforeSave is invoked by stateify.
+func (i *inode) beforeSave() {
+	if !i.savable {
+		panic("host.inode is not savable")
+	}
+	if i.ftype == syscall.S_IFIFO {
+		// If this pipe FD is readable, drain it so that bytes in the pipe can
+		// be read after restore. (This is a legacy VFS1 feature.) We don't
+		// know if the pipe FD is readable, so just try reading and tolerate
+		// EBADF from the read.
+		i.bufMu.Lock()
+		defer i.bufMu.Unlock()
+		var buf [usermem.PageSize]byte
+		for {
+			n, err := hostfd.Preadv2(int32(i.hostFD), safemem.BlockSeqOf(safemem.BlockFromSafeSlice(buf[:])), -1 /* offset */, 0 /* flags */)
+			if n != 0 {
+				i.buf = append(i.buf, buf[:n]...)
+			}
+			if err != nil {
+				if err == io.EOF || err == syscall.EAGAIN || err == syscall.EBADF {
+					break
+				}
+				panic(fmt.Errorf("host.inode.beforeSave: buffering from pipe failed: %v", err))
+			}
+		}
+		if len(i.buf) != 0 {
+			atomic.StoreUint32(&i.haveBuf, 1)
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (i *inode) afterLoad() {
+	if i.mayBlock {
+		if err := syscall.SetNonblock(i.hostFD, true); err != nil {
+			panic(fmt.Sprintf("host.inode.afterLoad: failed to set host FD %d non-blocking: %v", i.hostFD, err))
+		}
+		if err := fdnotifier.AddFD(int32(i.hostFD), &i.queue); err != nil {
+			panic(fmt.Sprintf("host.inode.afterLoad: fdnotifier.AddFD(%d) failed: %v", i.hostFD, err))
+		}
+	}
+}
+
+// afterLoad is invoked by stateify.
+func (i *inodePlatformFile) afterLoad() {
+	if i.fileMapper.IsInited() {
+		// Ensure that we don't call i.fileMapper.Init() again.
+		i.fileMapperInitOnce.Do(func() {})
+	}
+}
diff --git a/pkg/sentry/fsimpl/host/socket.go b/pkg/sentry/fsimpl/host/socket.go
index 4979dd0a9..8a447e29f 100644
--- a/pkg/sentry/fsimpl/host/socket.go
+++ b/pkg/sentry/fsimpl/host/socket.go
@@ -22,7 +22,6 @@ import (
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fdnotifier"
 	"gvisor.dev/gvisor/pkg/log"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/socket/control"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
@@ -59,8 +58,7 @@ func newEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue) (transpor
 //
 // +stateify savable
 type ConnectedEndpoint struct {
-	// ref keeps track of references to a ConnectedEndpoint.
-	ref refs.AtomicRefCount
+	ConnectedEndpointRefs
 
 	// mu protects fd below.
 	mu sync.RWMutex `state:"nosave"`
@@ -132,9 +130,9 @@ func NewConnectedEndpoint(ctx context.Context, hostFD int, addr string, saveable
 		return nil, err
 	}
 
-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.ConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()
 	return &e, nil
 }
 
@@ -318,7 +316,7 @@ func (c *ConnectedEndpoint) destroyLocked() {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (c *ConnectedEndpoint) Release(ctx context.Context) {
-	c.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	c.DecRef(func() {
 		c.mu.Lock()
 		c.destroyLocked()
 		c.mu.Unlock()
@@ -348,12 +346,12 @@ func (e *SCMConnectedEndpoint) Init() error {
 // Release implements transport.ConnectedEndpoint.Release and
 // transport.Receiver.Release.
 func (e *SCMConnectedEndpoint) Release(ctx context.Context) {
-	e.ref.DecRefWithDestructor(ctx, func(context.Context) {
+	e.DecRef(func() {
 		e.mu.Lock()
+		fdnotifier.RemoveFD(int32(e.fd))
 		if err := syscall.Close(e.fd); err != nil {
 			log.Warningf("Failed to close host fd %d: %v", err)
 		}
-		fdnotifier.RemoveFD(int32(e.fd))
 		e.destroyLocked()
 		e.mu.Unlock()
 	})
@@ -378,8 +376,8 @@ func NewSCMEndpoint(ctx context.Context, hostFD int, queue *waiter.Queue, addr s
 		return nil, err
 	}
 
-	// AtomicRefCounters start off with a single reference. We need two.
-	e.ref.IncRef()
-	e.ref.EnableLeakCheck("host.SCMConnectedEndpoint")
+	// ConnectedEndpointRefs start off with a single reference. We need two.
+	e.IncRef()
+	e.EnableLeakCheck()
 	return &e, nil
 }
diff --git a/pkg/sentry/fsimpl/host/socket_unsafe.go b/pkg/sentry/fsimpl/host/socket_unsafe.go
index 35ded24bc..c0bf45f08 100644
--- a/pkg/sentry/fsimpl/host/socket_unsafe.go
+++ b/pkg/sentry/fsimpl/host/socket_unsafe.go
@@ -63,10 +63,10 @@ func fdReadVec(fd int, bufs [][]byte, control []byte, peek bool, maxlen int64) (
 	controlTrunc = msg.Flags&syscall.MSG_CTRUNC == syscall.MSG_CTRUNC
 
 	if n > length {
-		return length, n, msg.Controllen, controlTrunc, err
+		return length, n, msg.Controllen, controlTrunc, nil
 	}
 
-	return n, n, msg.Controllen, controlTrunc, err
+	return n, n, msg.Controllen, controlTrunc, nil
 }
 
 // fdWriteVec sends from bufs to fd.
diff --git a/pkg/sentry/fsimpl/host/tty.go b/pkg/sentry/fsimpl/host/tty.go
index 27cbd3059..f5c596fec 100644
--- a/pkg/sentry/fsimpl/host/tty.go
+++ b/pkg/sentry/fsimpl/host/tty.go
@@ -17,6 +17,7 @@ package host
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@@ -29,6 +30,8 @@ import (
 
 // TTYFileDescription implements vfs.FileDescriptionImpl for a host file
 // descriptor that wraps a TTY FD.
+//
+// +stateify savable
 type TTYFileDescription struct {
 	fileDescription
 
@@ -75,7 +78,7 @@ func (t *TTYFileDescription) Release(ctx context.Context) {
 	t.fileDescription.Release(ctx)
 }
 
-// PRead implements vfs.FileDescriptionImpl.
+// PRead implements vfs.FileDescriptionImpl.PRead.
 //
 // Reading from a TTY is only allowed for foreground process groups. Background
 // process groups will either get EIO or a SIGTTIN.
@@ -93,7 +96,7 @@ func (t *TTYFileDescription) PRead(ctx context.Context, dst usermem.IOSequence,
 	return t.fileDescription.PRead(ctx, dst, offset, opts)
 }
 
-// Read implements vfs.FileDescriptionImpl.
+// Read implements vfs.FileDescriptionImpl.Read.
 //
 // Reading from a TTY is only allowed for foreground process groups. Background
 // process groups will either get EIO or a SIGTTIN.
@@ -111,7 +114,7 @@ func (t *TTYFileDescription) Read(ctx context.Context, dst usermem.IOSequence, o
 	return t.fileDescription.Read(ctx, dst, opts)
 }
 
-// PWrite implements vfs.FileDescriptionImpl.
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
 func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
@@ -126,7 +129,7 @@ func (t *TTYFileDescription) PWrite(ctx context.Context, src usermem.IOSequence,
 	return t.fileDescription.PWrite(ctx, src, offset, opts)
 }
 
-// Write implements vfs.FileDescriptionImpl.
+// Write implements vfs.FileDescriptionImpl.Write.
 func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
 	t.mu.Lock()
 	defer t.mu.Unlock()
@@ -141,8 +144,13 @@ func (t *TTYFileDescription) Write(ctx context.Context, src usermem.IOSequence,
 	return t.fileDescription.Write(ctx, src, opts)
 }
 
-// Ioctl implements vfs.FileDescriptionImpl.
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
 func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	task := kernel.TaskFromContext(ctx)
+	if task == nil {
+		return 0, syserror.ENOTTY
+	}
+
 	// Ignore arg[0]. This is the real FD:
 	fd := t.inode.hostFD
 	ioctl := args[1].Uint64()
@@ -152,9 +160,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = termios.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TCSETS, linux.TCSETSW, linux.TCSETSF:
@@ -166,9 +172,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		}
 
 		var termios linux.Termios
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &termios, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := termios.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetTermios(fd, ioctl, &termios)
@@ -192,10 +196,8 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		defer t.mu.Unlock()
 
 		// Map the ProcessGroup into a ProcessGroupID in the task's PID namespace.
-		pgID := pidns.IDOfProcessGroup(t.fgProcessGroup)
-		_, err := usermem.CopyObjectOut(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		pgID := primitive.Int32(pidns.IDOfProcessGroup(t.fgProcessGroup))
+		_, err := pgID.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSPGRP:
@@ -203,11 +205,6 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		// Equivalent to tcsetpgrp(fd, *argp).
 		// Set the foreground process group ID of this terminal.
 
-		task := kernel.TaskFromContext(ctx)
-		if task == nil {
-			return 0, syserror.ENOTTY
-		}
-
 		t.mu.Lock()
 		defer t.mu.Unlock()
 
@@ -226,12 +223,11 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 			return 0, syserror.ENOTTY
 		}
 
-		var pgID kernel.ProcessGroupID
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &pgID, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		var pgIDP primitive.Int32
+		if _, err := pgIDP.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
+		pgID := kernel.ProcessGroupID(pgIDP)
 
 		// pgID must be non-negative.
 		if pgID < 0 {
@@ -260,9 +256,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		if err != nil {
 			return 0, err
 		}
-		_, err = usermem.CopyObjectOut(ctx, io, args[2].Pointer(), winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		})
+		_, err = winsize.CopyOut(task, args[2].Pointer())
 		return 0, err
 
 	case linux.TIOCSWINSZ:
@@ -273,9 +267,7 @@ func (t *TTYFileDescription) Ioctl(ctx context.Context, io usermem.IO, args arch
 		// set the winsize.
 
 		var winsize linux.Winsize
-		if _, err := usermem.CopyObjectIn(ctx, io, args[2].Pointer(), &winsize, usermem.IOOpts{
-			AddressSpaceActive: true,
-		}); err != nil {
+		if _, err := winsize.CopyIn(task, args[2].Pointer()); err != nil {
 			return 0, err
 		}
 		err := ioctlSetWinsize(fd, &winsize)
diff --git a/pkg/sentry/fsimpl/host/util.go b/pkg/sentry/fsimpl/host/util.go
index 412bdb2eb..b2f43a119 100644
--- a/pkg/sentry/fsimpl/host/util.go
+++ b/pkg/sentry/fsimpl/host/util.go
@@ -43,12 +43,6 @@ func timespecToStatxTimestamp(ts unix.Timespec) linux.StatxTimestamp {
 	return linux.StatxTimestamp{Sec: int64(ts.Sec), Nsec: uint32(ts.Nsec)}
 }
 
-// wouldBlock returns true for file types that can return EWOULDBLOCK
-// for blocking operations, e.g. pipes, character devices, and sockets.
-func wouldBlock(fileType uint32) bool {
-	return fileType == syscall.S_IFIFO || fileType == syscall.S_IFCHR || fileType == syscall.S_IFSOCK
-}
-
 // isBlockError checks if an error is EAGAIN or EWOULDBLOCK.
 // If so, they can be transformed into syserror.ErrWouldBlock.
 func isBlockError(err error) bool {
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 3835557fe..aaad67ab8 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -4,6 +4,18 @@ load("//tools/go_generics:defs.bzl", "go_template_instance")
 licenses(["notice"])
 
 go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "kernfs",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*Dentry",
+        "Linker": "*Dentry",
+    },
+)
+
+go_template_instance(
     name = "fstree",
     out = "fstree.go",
     package = "kernfs",
@@ -26,9 +38,54 @@ go_template_instance(
     },
 )
 
+go_template_instance(
+    name = "static_directory_refs",
+    out = "static_directory_refs.go",
+    package = "kernfs",
+    prefix = "StaticDirectory",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "StaticDirectory",
+    },
+)
+
+go_template_instance(
+    name = "dir_refs",
+    out = "dir_refs.go",
+    package = "kernfs_test",
+    prefix = "dir",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "dir",
+    },
+)
+
+go_template_instance(
+    name = "readonly_dir_refs",
+    out = "readonly_dir_refs.go",
+    package = "kernfs_test",
+    prefix = "readonlyDir",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "readonlyDir",
+    },
+)
+
+go_template_instance(
+    name = "synthetic_directory_refs",
+    out = "synthetic_directory_refs.go",
+    package = "kernfs",
+    prefix = "syntheticDirectory",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "syntheticDirectory",
+    },
+)
+
 go_library(
     name = "kernfs",
     srcs = [
+        "dentry_list.go",
         "dynamic_bytes_file.go",
         "fd_impl_util.go",
         "filesystem.go",
@@ -36,7 +93,10 @@ go_library(
         "inode_impl_util.go",
         "kernfs.go",
         "slot_list.go",
+        "static_directory_refs.go",
         "symlink.go",
+        "synthetic_directory.go",
+        "synthetic_directory_refs.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
@@ -45,8 +105,11 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/fs/fsutil",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/time",
         "//pkg/sentry/memmap",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
@@ -59,11 +122,18 @@ go_library(
 go_test(
     name = "kernfs_test",
     size = "small",
-    srcs = ["kernfs_test.go"],
+    srcs = [
+        "dir_refs.go",
+        "kernfs_test.go",
+        "readonly_dir_refs.go",
+    ],
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/sentry/contexttest",
         "//pkg/sentry/fsimpl/testutil",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 12adf727a..485504995 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -35,6 +35,7 @@ import (
 // +stateify savable
 type DynamicBytesFile struct {
 	InodeAttrs
+	InodeNoStatFS
 	InodeNoopRefCount
 	InodeNotDirectory
 	InodeNotSymlink
@@ -46,18 +47,18 @@ type DynamicBytesFile struct {
 var _ Inode = (*DynamicBytesFile)(nil)
 
 // Init initializes a dynamic bytes file.
-func (f *DynamicBytesFile) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
+func (f *DynamicBytesFile) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	f.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+	f.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
 	f.data = data
 }
 
 // Open implements Inode.Open.
-func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (f *DynamicBytesFile) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &DynamicBytesFD{}
-	if err := fd.Init(rp.Mount(), vfsd, f.data, &f.locks, opts.Flags); err != nil {
+	if err := fd.Init(rp.Mount(), d, f.data, &f.locks, opts.Flags); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -86,12 +87,12 @@ type DynamicBytesFD struct {
 }
 
 // Init initializes a DynamicBytesFD.
-func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
+func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *Dentry, data vfs.DynamicBytesSource, locks *vfs.FileLocks, flags uint32) error {
 	fd.LockFD.Init(locks)
-	if err := fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return err
 	}
-	fd.inode = d.Impl().(*Dentry).inode
+	fd.inode = d.inode
 	fd.SetDataSource(data)
 	return nil
 }
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index fcee6200a..f8dae22f8 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -15,7 +15,7 @@
 package kernfs
 
 import (
-	"math"
+	"fmt"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -28,9 +28,29 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// SeekEndConfig describes the SEEK_END behaviour for FDs.
+//
+// +stateify savable
+type SeekEndConfig int
+
+// Constants related to SEEK_END behaviour for FDs.
+const (
+	// Consider the end of the file to be after the final static entry. This is
+	// the default option.
+	SeekEndStaticEntries = iota
+	// Consider the end of the file to be at offset 0.
+	SeekEndZero
+)
+
+// GenericDirectoryFDOptions contains configuration for a GenericDirectoryFD.
+//
+// +stateify savable
+type GenericDirectoryFDOptions struct {
+	SeekEnd SeekEndConfig
+}
+
 // GenericDirectoryFD implements vfs.FileDescriptionImpl for a generic directory
-// inode that uses OrderChildren to track child nodes. GenericDirectoryFD is not
-// compatible with dynamic directories.
+// inode that uses OrderChildren to track child nodes.
 //
 // Note that GenericDirectoryFD holds a lock over OrderedChildren while calling
 // IterDirents callback. The IterDirents callback therefore cannot hash or
@@ -40,16 +60,21 @@ import (
 // Must be initialize with Init before first use.
 //
 // Lock ordering: mu => children.mu.
+//
+// +stateify savable
 type GenericDirectoryFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.DirectoryFileDescriptionDefaultImpl
 	vfs.LockFD
 
+	// Immutable.
+	seekEnd SeekEndConfig
+
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
 
 	// mu protects the fields below.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// off is the current directory offset. Protected by "mu".
 	off int64
@@ -57,12 +82,12 @@ type GenericDirectoryFD struct {
 
 // NewGenericDirectoryFD creates a new GenericDirectoryFD and returns its
 // dentry.
-func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) (*GenericDirectoryFD, error) {
+func NewGenericDirectoryFD(m *vfs.Mount, d *Dentry, children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) (*GenericDirectoryFD, error) {
 	fd := &GenericDirectoryFD{}
-	if err := fd.Init(children, locks, opts); err != nil {
+	if err := fd.Init(children, locks, opts, fdOpts); err != nil {
 		return nil, err
 	}
-	if err := fd.vfsfd.Init(fd, opts.Flags, m, d, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, opts.Flags, m, d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return fd, nil
@@ -71,12 +96,13 @@ func NewGenericDirectoryFD(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildre
 // Init initializes a GenericDirectoryFD. Use it when overriding
 // GenericDirectoryFD. Caller must call fd.VFSFileDescription.Init() with the
 // correct implementation.
-func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions) error {
+func (fd *GenericDirectoryFD) Init(children *OrderedChildren, locks *vfs.FileLocks, opts *vfs.OpenOptions, fdOpts GenericDirectoryFDOptions) error {
 	if vfs.AccessTypesForOpenFlags(opts)&vfs.MayWrite != 0 {
 		// Can't open directories for writing.
 		return syserror.EISDIR
 	}
 	fd.LockFD.Init(locks)
+	fd.seekEnd = fdOpts.SeekEnd
 	fd.children = children
 	return nil
 }
@@ -119,8 +145,12 @@ func (fd *GenericDirectoryFD) filesystem() *vfs.Filesystem {
 	return fd.vfsfd.VirtualDentry().Mount().Filesystem()
 }
 
+func (fd *GenericDirectoryFD) dentry() *Dentry {
+	return fd.vfsfd.Dentry().Impl().(*Dentry)
+}
+
 func (fd *GenericDirectoryFD) inode() Inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
+	return fd.dentry().inode
 }
 
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents. IterDirents holds
@@ -150,8 +180,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 
 	// Handle "..".
 	if fd.off == 1 {
-		vfsd := fd.vfsfd.VirtualDentry().Dentry()
-		parentInode := genericParentOrSelf(vfsd.Impl().(*Dentry)).inode
+		parentInode := genericParentOrSelf(fd.dentry()).inode
 		stat, err := parentInode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
@@ -175,13 +204,12 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 	// these.
 	childIdx := fd.off - 2
 	for it := fd.children.nthLocked(childIdx); it != nil; it = it.Next() {
-		inode := it.Dentry.Impl().(*Dentry).inode
-		stat, err := inode.Stat(ctx, fd.filesystem(), opts)
+		stat, err := it.inode.Stat(ctx, fd.filesystem(), opts)
 		if err != nil {
 			return err
 		}
 		dirent := vfs.Dirent{
-			Name:    it.Name,
+			Name:    it.name,
 			Type:    linux.FileMode(stat.Mode).DirentType(),
 			Ino:     stat.Ino,
 			NextOff: fd.off + 1,
@@ -194,7 +222,7 @@ func (fd *GenericDirectoryFD) IterDirents(ctx context.Context, cb vfs.IterDirent
 
 	var err error
 	relOffset := fd.off - int64(len(fd.children.set)) - 2
-	fd.off, err = fd.inode().IterDirents(ctx, cb, fd.off, relOffset)
+	fd.off, err = fd.inode().IterDirents(ctx, fd.vfsfd.Mount(), cb, fd.off, relOffset)
 	return err
 }
 
@@ -209,9 +237,17 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 	case linux.SEEK_CUR:
 		offset += fd.off
 	case linux.SEEK_END:
-		// TODO(gvisor.dev/issue/1193): This can prevent new files from showing up
-		// if they are added after SEEK_END.
-		offset = math.MaxInt64
+		switch fd.seekEnd {
+		case SeekEndStaticEntries:
+			fd.children.mu.RLock()
+			offset += int64(len(fd.children.set))
+			offset += 2 // '.' and '..' aren't tracked in children.
+			fd.children.mu.RUnlock()
+		case SeekEndZero:
+			// No-op: offset += 0.
+		default:
+			panic(fmt.Sprintf("Invalid GenericDirectoryFD.seekEnd = %v", fd.seekEnd))
+		}
 	default:
 		return 0, syserror.EINVAL
 	}
@@ -232,8 +268,7 @@ func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (l
 // SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *GenericDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	creds := auth.CredentialsFromContext(ctx)
-	inode := fd.vfsfd.VirtualDentry().Dentry().Impl().(*Dentry).inode
-	return inode.SetStat(ctx, fd.filesystem(), creds, opts)
+	return fd.inode().SetStat(ctx, fd.filesystem(), creds, opts)
 }
 
 // Allocate implements vfs.FileDescriptionImpl.Allocate.
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3e5192edd..399895f3e 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -32,11 +32,12 @@ import (
 //
 // stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
 //
 // Postcondition: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, mayFollowSymlinks bool) (*vfs.Dentry, error) {
-	d := vfsd.Impl().(*Dentry)
+func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, mayFollowSymlinks bool) (*Dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
 	}
@@ -53,20 +54,20 @@ afterSymlink:
 	// calls d_revalidate(), but walk_component() => handle_dots() does not.
 	if name == "." {
 		rp.Advance()
-		return vfsd, nil
+		return d, nil
 	}
 	if name == ".." {
-		if isRoot, err := rp.CheckRoot(ctx, vfsd); err != nil {
+		if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil {
 			return nil, err
 		} else if isRoot || d.parent == nil {
 			rp.Advance()
-			return vfsd, nil
+			return d, nil
 		}
-		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+		if err := rp.CheckMount(ctx, d.parent.VFSDentry()); err != nil {
 			return nil, err
 		}
 		rp.Advance()
-		return &d.parent.vfsd, nil
+		return d.parent, nil
 	}
 	if len(name) > linux.NAME_MAX {
 		return nil, syserror.ENAMETOOLONG
@@ -77,7 +78,7 @@ afterSymlink:
 	if err != nil {
 		return nil, err
 	}
-	if err := rp.CheckMount(ctx, &next.vfsd); err != nil {
+	if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil {
 		return nil, err
 	}
 	// Resolve any symlink at current path component.
@@ -88,7 +89,7 @@ afterSymlink:
 		}
 		if targetVD.Ok() {
 			err := rp.HandleJump(targetVD)
-			targetVD.DecRef(ctx)
+			fs.deferDecRefVD(ctx, targetVD)
 			if err != nil {
 				return nil, err
 			}
@@ -100,15 +101,18 @@ afterSymlink:
 		goto afterSymlink
 	}
 	rp.Advance()
-	return &next.vfsd, nil
+	return next, nil
 }
 
 // revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
 // or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
 // nil) to verify that the returned child (or lack thereof) is correct.
 //
-// Preconditions: Filesystem.mu must be locked for at least reading.
-// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * parent.dirMu must be locked.
+// * parent.isDir().
+// * name is not "." or "..".
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
 func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, child *Dentry) (*Dentry, error) {
@@ -116,26 +120,33 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 		// Cached dentry exists, revalidate.
 		if !child.inode.Valid(ctx) {
 			delete(parent.children, name)
-			vfsObj.InvalidateDentry(ctx, &child.vfsd)
-			fs.deferDecRef(&child.vfsd) // Reference from Lookup.
+			if child.inode.Keep() {
+				// Drop the ref owned by kernfs.
+				fs.deferDecRef(child)
+			}
+			vfsObj.InvalidateDentry(ctx, child.VFSDentry())
 			child = nil
 		}
 	}
 	if child == nil {
-		// Dentry isn't cached; it either doesn't exist or failed
-		// revalidation. Attempt to resolve it via Lookup.
-		//
-		// FIXME(gvisor.dev/issue/1193): Inode.Lookup() should return
-		// *(kernfs.)Dentry, not *vfs.Dentry, since (kernfs.)Filesystem assumes
-		// that all dentries in the filesystem are (kernfs.)Dentry and performs
-		// vfs.DentryImpl casts accordingly.
-		childVFSD, err := parent.inode.Lookup(ctx, name)
+		// Dentry isn't cached; it either doesn't exist or failed revalidation.
+		// Attempt to resolve it via Lookup.
+		childInode, err := parent.inode.Lookup(ctx, name)
 		if err != nil {
 			return nil, err
 		}
-		// Reference on childVFSD dropped by a corresponding Valid.
-		child = childVFSD.Impl().(*Dentry)
-		parent.insertChildLocked(name, child)
+		var newChild Dentry
+		newChild.Init(fs, childInode) // childInode's ref is transferred to newChild.
+		parent.insertChildLocked(name, &newChild)
+		child = &newChild
+
+		// Drop the ref on newChild. This will cause the dentry to get pruned
+		// from the dentry tree by the end of current filesystem operation
+		// (before returning to the VFS layer) if another ref is not picked on
+		// this dentry.
+		if !childInode.Keep() {
+			fs.deferDecRef(&newChild)
+		}
 	}
 	return child, nil
 }
@@ -148,20 +159,19 @@ func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.Vir
 // Preconditions: Filesystem.mu must be locked for at least reading.
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
-	vfsd := rp.Start()
+func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+	d := rp.Start().Impl().(*Dentry)
 	for !rp.Done() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
 	}
-	d := vfsd.Impl().(*Dentry)
 	if rp.MustBeDir() && !d.isDir() {
-		return nil, nil, syserror.ENOTDIR
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, d.inode, nil
+	return d, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -171,32 +181,34 @@ func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 // walkParentDirLocked is loosely analogous to Linux's
 // fs/namei.c:path_parentat().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * !rp.Done().
 //
 // Postconditions: Caller must call fs.processDeferredDecRefs*.
-func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
-	vfsd := rp.Start()
+func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) {
+	d := rp.Start().Impl().(*Dentry)
 	for !rp.Final() {
 		var err error
-		vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd, true /* mayFollowSymlinks */)
+		d, err = fs.stepExistingLocked(ctx, rp, d, true /* mayFollowSymlinks */)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
 	}
-	d := vfsd.Impl().(*Dentry)
 	if !d.isDir() {
-		return nil, nil, syserror.ENOTDIR
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, d.inode, nil
+	return d, nil
 }
 
 // checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
+// directory parent, then returns rp.Component().
 //
-// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
-// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
-func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
-	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
+// Preconditions:
+// * Filesystem.mu must be locked for at least reading.
+// * isDir(parentInode) == true.
+func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *Dentry) (string, error) {
+	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
 		return "", err
 	}
 	pc := rp.Component()
@@ -206,11 +218,10 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 	if len(pc) > linux.NAME_MAX {
 		return "", syserror.ENAMETOOLONG
 	}
-	// FIXME(gvisor.dev/issue/1193): Data race due to not holding dirMu.
-	if _, ok := parentVFSD.Impl().(*Dentry).children[pc]; ok {
+	if _, ok := parent.children[pc]; ok {
 		return "", syserror.EEXIST
 	}
-	if parentVFSD.IsDead() {
+	if parent.VFSDentry().IsDead() {
 		return "", syserror.ENOENT
 	}
 	return pc, nil
@@ -219,8 +230,8 @@ func checkCreateLocked(ctx context.Context, rp *vfs.ResolvingPath, parentVFSD *v
 // checkDeleteLocked checks that the file represented by vfsd may be deleted.
 //
 // Preconditions: Filesystem.mu must be locked for at least reading.
-func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
-	parent := vfsd.Impl().(*Dentry).parent
+func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error {
+	parent := d.parent
 	if parent == nil {
 		return syserror.EBUSY
 	}
@@ -249,11 +260,11 @@ func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds
 	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.RUnlock()
 
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	return inode.CheckPermissions(ctx, creds, ats)
+	return d.inode.CheckPermissions(ctx, creds, ats)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
@@ -261,20 +272,20 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	fs.mu.RLock()
 	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.RUnlock()
-	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
 
 	if opts.CheckSearchable {
-		d := vfsd.Impl().(*Dentry)
 		if !d.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
 			return nil, err
 		}
 	}
+	vfsd := d.VFSDentry()
 	vfsd.IncRef() // Ownership transferred to caller.
 	return vfsd, nil
 }
@@ -284,12 +295,12 @@ func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPa
 	fs.mu.RLock()
 	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.RUnlock()
-	vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+	d, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
-	vfsd.IncRef() // Ownership transferred to caller.
-	return vfsd, nil
+	d.IncRef() // Ownership transferred to caller.
+	return d.VFSDentry(), nil
 }
 
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
@@ -298,13 +309,16 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -321,11 +335,13 @@ func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 		return syserror.EPERM
 	}
 
-	childVFSD, err := parentInode.NewLink(ctx, pc, d.inode)
+	childI, err := parent.inode.NewLink(ctx, pc, d.inode)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	var child Dentry
+	child.Init(fs, childI)
+	parent.insertChildLocked(pc, &child)
 	return nil
 }
 
@@ -335,13 +351,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -349,11 +368,16 @@ func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	childVFSD, err := parentInode.NewDir(ctx, pc, opts)
+	childI, err := parent.inode.NewDir(ctx, pc, opts)
 	if err != nil {
-		return err
+		if !opts.ForSyntheticMountpoint || err == syserror.EEXIST {
+			return err
+		}
+		childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode)
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	var child Dentry
+	child.Init(fs, childI)
+	parent.insertChildLocked(pc, &child)
 	return nil
 }
 
@@ -363,13 +387,16 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -377,11 +404,13 @@ func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	newVFSD, err := parentInode.NewNode(ctx, pc, opts)
+	newI, err := parent.inode.NewNode(ctx, pc, opts)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, newVFSD.Impl().(*Dentry))
+	var newD Dentry
+	newD.Init(fs, newI)
+	parent.insertChildLocked(pc, &newD)
 	return nil
 }
 
@@ -397,28 +426,28 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	// Do not create new file.
 	if opts.Flags&linux.O_CREAT == 0 {
 		fs.mu.RLock()
-		vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
+		defer fs.processDeferredDecRefs(ctx)
+		d, err := fs.walkExistingLocked(ctx, rp)
 		if err != nil {
 			fs.mu.RUnlock()
-			fs.processDeferredDecRefs(ctx)
 			return nil, err
 		}
-		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			fs.mu.RUnlock()
-			fs.processDeferredDecRefs(ctx)
 			return nil, err
 		}
-		inode.IncRef()
-		defer inode.DecRef(ctx)
+		// Open may block so we need to unlock fs.mu. IncRef d to prevent
+		// its destruction while fs.mu is unlocked.
+		d.IncRef()
 		fs.mu.RUnlock()
-		fs.processDeferredDecRefs(ctx)
-		return inode.Open(ctx, rp, vfsd, opts)
+		fd, err := d.inode.Open(ctx, rp, d, opts)
+		d.DecRef(ctx)
+		return fd, err
 	}
 
 	// May create new file.
 	mustCreate := opts.Flags&linux.O_EXCL != 0
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*Dentry).inode
+	d := rp.Start().Impl().(*Dentry)
 	fs.mu.Lock()
 	unlocked := false
 	unlock := func() {
@@ -427,6 +456,10 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 			unlocked = true
 		}
 	}
+	// Process all to-be-decref'd dentries at the end at once.
+	// Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked
+	// when this is executed.
+	defer fs.processDeferredDecRefs(ctx)
 	defer unlock()
 	if rp.Done() {
 		if rp.MustBeDir() {
@@ -435,22 +468,24 @@ func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		if err := inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
+		if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 			return nil, err
 		}
-		inode.IncRef()
-		defer inode.DecRef(ctx)
+		// Open may block so we need to unlock fs.mu. IncRef d to prevent
+		// its destruction while fs.mu is unlocked.
+		d.IncRef()
 		unlock()
-		return inode.Open(ctx, rp, vfsd, opts)
+		fd, err := d.inode.Open(ctx, rp, d, opts)
+		d.DecRef(ctx)
+		return fd, err
 	}
 afterTrailingSymlink:
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
+	if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
@@ -465,10 +500,10 @@ afterTrailingSymlink:
 		return nil, syserror.ENAMETOOLONG
 	}
 	// Determine whether or not we need to create a file.
-	childVFSD, err := fs.stepExistingLocked(ctx, rp, parentVFSD, false /* mayFollowSymlinks */)
+	child, err := fs.stepExistingLocked(ctx, rp, parent, false /* mayFollowSymlinks */)
 	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
-		if err := parentInode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+		if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -476,16 +511,20 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		childVFSD, err = parentInode.NewFile(ctx, pc, opts)
+		childI, err := parent.inode.NewFile(ctx, pc, opts)
 		if err != nil {
 			return nil, err
 		}
-		child := childVFSD.Impl().(*Dentry)
-		parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
-		child.inode.IncRef()
-		defer child.inode.DecRef(ctx)
+		var child Dentry
+		child.Init(fs, childI)
+		parent.insertChild(pc, &child)
+		// Open may block so we need to unlock fs.mu. IncRef child to prevent
+		// its destruction while fs.mu is unlocked.
+		child.IncRef()
 		unlock()
-		return child.inode.Open(ctx, rp, childVFSD, opts)
+		fd, err := child.inode.Open(ctx, rp, &child, opts)
+		child.DecRef(ctx)
+		return fd, err
 	}
 	if err != nil {
 		return nil, err
@@ -494,7 +533,6 @@ afterTrailingSymlink:
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
-	child := childVFSD.Impl().(*Dentry)
 	if rp.ShouldFollowSymlink() && child.isSymlink() {
 		targetVD, targetPathname, err := child.inode.Getlink(ctx, rp.Mount())
 		if err != nil {
@@ -502,7 +540,7 @@ afterTrailingSymlink:
 		}
 		if targetVD.Ok() {
 			err := rp.HandleJump(targetVD)
-			targetVD.DecRef(ctx)
+			fs.deferDecRefVD(ctx, targetVD)
 			if err != nil {
 				return nil, err
 			}
@@ -518,25 +556,28 @@ afterTrailingSymlink:
 	if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
-	child.inode.IncRef()
-	defer child.inode.DecRef(ctx)
+	// Open may block so we need to unlock fs.mu. IncRef child to prevent
+	// its destruction while fs.mu is unlocked.
+	child.IncRef()
 	unlock()
-	return child.inode.Open(ctx, rp, &child.vfsd, opts)
+	fd, err := child.inode.Open(ctx, rp, child, opts)
+	child.DecRef(ctx)
+	return fd, err
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
 func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
 	fs.mu.RLock()
-	d, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return "", err
 	}
-	if !d.Impl().(*Dentry).isSymlink() {
+	if !d.isSymlink() {
 		return "", syserror.EINVAL
 	}
-	return inode.Readlink(ctx)
+	return d.inode.Readlink(ctx, rp.Mount())
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
@@ -548,16 +589,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
 
 	fs.mu.Lock()
-	defer fs.processDeferredDecRefsLocked(ctx)
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
 
 	// Resolve the destination directory first to verify that it's on this
 	// Mount.
-	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	dstDir, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	dstDir := dstDirVFSD.Impl().(*Dentry)
 	mnt := rp.Mount()
 	if mnt != oldParentVD.Mount() {
 		return syserror.EXDEV
@@ -575,16 +615,15 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	if err != nil {
 		return err
 	}
-	srcVFSD := &src.vfsd
 
 	// Can we remove the src dentry?
-	if err := checkDeleteLocked(ctx, rp, srcVFSD); err != nil {
+	if err := checkDeleteLocked(ctx, rp, src); err != nil {
 		return err
 	}
 
 	// Can we create the dst dentry?
 	var dst *Dentry
-	pc, err := checkCreateLocked(ctx, rp, dstDirVFSD, dstDirInode)
+	pc, err := checkCreateLocked(ctx, rp, dstDir)
 	switch err {
 	case nil:
 		// Ok, continue with rename as replacement.
@@ -595,14 +634,14 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		}
 		dst = dstDir.children[pc]
 		if dst == nil {
-			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
+			panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDir))
 		}
 	default:
 		return err
 	}
 	var dstVFSD *vfs.Dentry
 	if dst != nil {
-		dstVFSD = &dst.vfsd
+		dstVFSD = dst.VFSDentry()
 	}
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
@@ -618,35 +657,44 @@ func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 		defer dstDir.dirMu.Unlock()
 	}
 
+	srcVFSD := src.VFSDentry()
 	if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
 		return err
 	}
-	replaced, err := srcDir.inode.Rename(ctx, src.name, pc, srcVFSD, dstDirVFSD)
+	err = srcDir.inode.Rename(ctx, src.name, pc, src.inode, dstDir.inode)
 	if err != nil {
 		virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
 		return err
 	}
 	delete(srcDir.children, src.name)
 	if srcDir != dstDir {
-		fs.deferDecRef(srcDirVFSD)
-		dstDir.IncRef()
+		fs.deferDecRef(srcDir) // child (src) drops ref on old parent.
+		dstDir.IncRef()        // child (src) takes a ref on the new parent.
 	}
 	src.parent = dstDir
 	src.name = pc
 	if dstDir.children == nil {
 		dstDir.children = make(map[string]*Dentry)
 	}
+	replaced := dstDir.children[pc]
 	dstDir.children[pc] = src
-	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaced)
+	var replaceVFSD *vfs.Dentry
+	if replaced != nil {
+		// deferDecRef so that fs.mu and dstDir.mu are unlocked by then.
+		fs.deferDecRef(replaced)
+		replaceVFSD = replaced.VFSDentry()
+	}
+	virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD)
 	return nil
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
 func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -654,14 +702,13 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+	if err := checkDeleteLocked(ctx, rp, d); err != nil {
 		return err
 	}
-	d := vfsd.Impl().(*Dentry)
 	if !d.isDir() {
 		return syserror.ENOTDIR
 	}
-	if inode.HasChildren() {
+	if d.inode.HasChildren() {
 		return syserror.ENOTEMPTY
 	}
 	virtfs := rp.VirtualFilesystem()
@@ -671,13 +718,18 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef(ctx)
+	vfsd := d.VFSDentry()
 	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
-	if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
+
+	if err := parentDentry.inode.RmDir(ctx, d.name, d.inode); err != nil {
 		virtfs.AbortDeleteDentry(vfsd)
 		return err
 	}
+	delete(parentDentry.children, d.name)
+	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+	fs.deferDecRef(d)
 	virtfs.CommitDeleteDentry(ctx, vfsd)
 	return nil
 }
@@ -685,41 +737,40 @@ func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	return inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
+	return d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts)
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
 func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
 	fs.mu.RLock()
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return linux.Statx{}, err
 	}
-	return inode.Stat(ctx, fs.VFSFilesystem(), opts)
+	return d.inode.Stat(ctx, fs.VFSFilesystem(), opts)
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
 func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return linux.Statfs{}, err
 	}
-	// TODO(gvisor.dev/issue/1193): actually implement statfs.
-	return linux.Statfs{}, syserror.ENOSYS
+	return d.inode.StatFS(ctx, fs.VFSFilesystem())
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -728,13 +779,16 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		return syserror.EEXIST
 	}
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+	parent, err := fs.walkParentDirLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
-	pc, err := checkCreateLocked(ctx, rp, parentVFSD, parentInode)
+	parent.dirMu.Lock()
+	defer parent.dirMu.Unlock()
+
+	pc, err := checkCreateLocked(ctx, rp, parent)
 	if err != nil {
 		return err
 	}
@@ -742,20 +796,23 @@ func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	childVFSD, err := parentInode.NewSymlink(ctx, pc, target)
+	childI, err := parent.inode.NewSymlink(ctx, pc, target)
 	if err != nil {
 		return err
 	}
-	parentVFSD.Impl().(*Dentry).InsertChild(pc, childVFSD.Impl().(*Dentry))
+	var child Dentry
+	child.Init(fs, childI)
+	parent.insertChildLocked(pc, &child)
 	return nil
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
 func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
+	defer fs.processDeferredDecRefs(ctx)
 	defer fs.mu.Unlock()
-	vfsd, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.processDeferredDecRefsLocked(ctx)
+
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -763,10 +820,9 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(ctx, rp, vfsd); err != nil {
+	if err := checkDeleteLocked(ctx, rp, d); err != nil {
 		return err
 	}
-	d := vfsd.Impl().(*Dentry)
 	if d.isDir() {
 		return syserror.EISDIR
 	}
@@ -776,38 +832,42 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	defer parentDentry.dirMu.Unlock()
 	mntns := vfs.MountNamespaceFromContext(ctx)
 	defer mntns.DecRef(ctx)
+	vfsd := d.VFSDentry()
 	if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil {
 		return err
 	}
-	if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
+	if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil {
 		virtfs.AbortDeleteDentry(vfsd)
 		return err
 	}
+	delete(parentDentry.children, d.name)
+	// Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then.
+	fs.deferDecRef(d)
 	virtfs.CommitDeleteDentry(ctx, vfsd)
 	return nil
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	fs.mu.RLock()
-	_, inode, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	d, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
-	if err := inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
+	if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil {
 		return nil, err
 	}
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
@@ -815,12 +875,12 @@ func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 	return nil, syserror.ENOTSUP
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return "", err
 	}
@@ -828,12 +888,12 @@ func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return "", syserror.ENOTSUP
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -841,12 +901,12 @@ func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 	return syserror.ENOTSUP
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
-	_, _, err := fs.walkExistingLocked(ctx, rp)
-	fs.mu.RUnlock()
-	fs.processDeferredDecRefs(ctx)
+	defer fs.processDeferredDecRefs(ctx)
+	defer fs.mu.RUnlock()
+	_, err := fs.walkExistingLocked(ctx, rp)
 	if err != nil {
 		return err
 	}
@@ -860,3 +920,16 @@ func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 	defer fs.mu.RUnlock()
 	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b)
 }
+
+func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) {
+	if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
+		// The following is equivalent to vd.DecRef(ctx). This is needed
+		// because if d belongs to this filesystem, we can not DecRef it right
+		// away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we
+		// defer the DecRef to when locks are dropped.
+		vd.Mount().DecRef(ctx)
+		fs.deferDecRef(d)
+	} else {
+		vd.DecRef(ctx)
+	}
+}
diff --git a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
index fe8a1e710..d9d76758a 100644
--- a/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/inode_impl_util.go
@@ -20,11 +20,12 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // InodeNoopRefCount partially implements the Inode interface, specifically the
@@ -32,7 +33,10 @@ import (
 // count for inodes, performing no extra actions when references are obtained or
 // released. This is suitable for simple file inodes that don't reference any
 // resources.
+//
+// +stateify savable
 type InodeNoopRefCount struct {
+	InodeTemporary
 }
 
 // IncRef implements Inode.IncRef.
@@ -51,30 +55,32 @@ func (InodeNoopRefCount) TryIncRef() bool {
 // InodeDirectoryNoNewChildren partially implements the Inode interface.
 // InodeDirectoryNoNewChildren represents a directory inode which does not
 // support creation of new children.
+//
+// +stateify savable
 type InodeDirectoryNoNewChildren struct{}
 
 // NewFile implements Inode.NewFile.
-func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewDir implements Inode.NewDir.
-func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewLink implements Inode.NewLink.
-func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
 // NewNode implements Inode.NewNode.
-func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
 	return nil, syserror.EPERM
 }
 
@@ -82,7 +88,10 @@ func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOpt
 // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
 // represent directories can embed this to provide no-op implementations for
 // directory-related functions.
+//
+// +stateify savable
 type InodeNotDirectory struct {
+	InodeAlwaysValid
 }
 
 // HasChildren implements Inode.HasChildren.
@@ -91,89 +100,64 @@ func (InodeNotDirectory) HasChildren() bool {
 }
 
 // NewFile implements Inode.NewFile.
-func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
 	panic("NewFile called on non-directory inode")
 }
 
 // NewDir implements Inode.NewDir.
-func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
 	panic("NewDir called on non-directory inode")
 }
 
 // NewLink implements Inode.NewLinkink.
-func (InodeNotDirectory) NewLink(context.Context, string, Inode) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) {
 	panic("NewLink called on non-directory inode")
 }
 
 // NewSymlink implements Inode.NewSymlink.
-func (InodeNotDirectory) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) {
 	panic("NewSymlink called on non-directory inode")
 }
 
 // NewNode implements Inode.NewNode.
-func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
 	panic("NewNode called on non-directory inode")
 }
 
 // Unlink implements Inode.Unlink.
-func (InodeNotDirectory) Unlink(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) Unlink(context.Context, string, Inode) error {
 	panic("Unlink called on non-directory inode")
 }
 
 // RmDir implements Inode.RmDir.
-func (InodeNotDirectory) RmDir(context.Context, string, *vfs.Dentry) error {
+func (InodeNotDirectory) RmDir(context.Context, string, Inode) error {
 	panic("RmDir called on non-directory inode")
 }
 
 // Rename implements Inode.Rename.
-func (InodeNotDirectory) Rename(context.Context, string, string, *vfs.Dentry, *vfs.Dentry) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error {
 	panic("Rename called on non-directory inode")
 }
 
 // Lookup implements Inode.Lookup.
-func (InodeNotDirectory) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) {
 	panic("Lookup called on non-directory inode")
 }
 
 // IterDirents implements Inode.IterDirents.
-func (InodeNotDirectory) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+func (InodeNotDirectory) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
 	panic("IterDirents called on non-directory inode")
 }
 
-// Valid implements Inode.Valid.
-func (InodeNotDirectory) Valid(context.Context) bool {
-	return true
-}
-
-// InodeNoDynamicLookup partially implements the Inode interface, specifically
-// the inodeDynamicLookup sub interface. Directory inodes that do not support
-// dymanic entries (i.e. entries that are not "hashed" into the
-// vfs.Dentry.children) can embed this to provide no-op implementations for
-// functions related to dynamic entries.
-type InodeNoDynamicLookup struct{}
-
-// Lookup implements Inode.Lookup.
-func (InodeNoDynamicLookup) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
-	return nil, syserror.ENOENT
-}
-
-// IterDirents implements Inode.IterDirents.
-func (InodeNoDynamicLookup) IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
-	return offset, nil
-}
-
-// Valid implements Inode.Valid.
-func (InodeNoDynamicLookup) Valid(ctx context.Context) bool {
-	return true
-}
-
 // InodeNotSymlink partially implements the Inode interface, specifically the
 // inodeSymlink sub interface. All inodes that are not symlinks may embed this
 // to return the appropriate errors from symlink-related functions.
+//
+// +stateify savable
 type InodeNotSymlink struct{}
 
 // Readlink implements Inode.Readlink.
-func (InodeNotSymlink) Readlink(context.Context) (string, error) {
+func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) {
 	return "", syserror.EINVAL
 }
 
@@ -187,18 +171,26 @@ func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry,
 // inode attributes.
 //
 // Must be initialized by Init prior to first use.
+//
+// +stateify savable
 type InodeAttrs struct {
-	devMajor uint32
-	devMinor uint32
-	ino      uint64
-	mode     uint32
-	uid      uint32
-	gid      uint32
-	nlink    uint32
+	devMajor  uint32
+	devMinor  uint32
+	ino       uint64
+	mode      uint32
+	uid       uint32
+	gid       uint32
+	nlink     uint32
+	blockSize uint32
+
+	// Timestamps, all nsecs from the Unix epoch.
+	atime int64
+	mtime int64
+	ctime int64
 }
 
 // Init initializes this InodeAttrs.
-func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
+func (a *InodeAttrs) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
 	if mode.FileType() == 0 {
 		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
 	}
@@ -214,6 +206,11 @@ func (a *InodeAttrs) Init(creds *auth.Credentials, devMajor, devMinor uint32, in
 	atomic.StoreUint32(&a.uid, uint32(creds.EffectiveKUID))
 	atomic.StoreUint32(&a.gid, uint32(creds.EffectiveKGID))
 	atomic.StoreUint32(&a.nlink, nlink)
+	atomic.StoreUint32(&a.blockSize, usermem.PageSize)
+	now := ktime.NowFromContext(ctx).Nanoseconds()
+	atomic.StoreInt64(&a.atime, now)
+	atomic.StoreInt64(&a.mtime, now)
+	atomic.StoreInt64(&a.ctime, now)
 }
 
 // DevMajor returns the device major number.
@@ -236,12 +233,33 @@ func (a *InodeAttrs) Mode() linux.FileMode {
 	return linux.FileMode(atomic.LoadUint32(&a.mode))
 }
 
+// TouchAtime updates a.atime to the current time.
+func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) {
+	if mnt.Flags.NoATime || mnt.ReadOnly() {
+		return
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return
+	}
+	atomic.StoreInt64(&a.atime, ktime.NowFromContext(ctx).Nanoseconds())
+	mnt.EndWrite()
+}
+
+// TouchCMtime updates a.{c/m}time to the current time. The caller should
+// synchronize calls to this so that ctime and mtime are updated to the same
+// value.
+func (a *InodeAttrs) TouchCMtime(ctx context.Context) {
+	now := ktime.NowFromContext(ctx).Nanoseconds()
+	atomic.StoreInt64(&a.mtime, now)
+	atomic.StoreInt64(&a.ctime, now)
+}
+
 // Stat partially implements Inode.Stat. Note that this function doesn't provide
 // all the stat fields, and the embedder should consider extending the result
 // with filesystem-specific fields.
 func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
-	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME
 	stat.DevMajor = a.devMajor
 	stat.DevMinor = a.devMinor
 	stat.Ino = atomic.LoadUint64(&a.ino)
@@ -249,9 +267,10 @@ func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (li
 	stat.UID = atomic.LoadUint32(&a.uid)
 	stat.GID = atomic.LoadUint32(&a.gid)
 	stat.Nlink = atomic.LoadUint32(&a.nlink)
-
-	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
-
+	stat.Blksize = atomic.LoadUint32(&a.blockSize)
+	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.atime))
+	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.mtime))
+	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&a.ctime))
 	return stat, nil
 }
 
@@ -260,9 +279,17 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 	if opts.Stat.Mask == 0 {
 		return nil
 	}
-	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID) != 0 {
+
+	// Note that not all fields are modifiable. For example, the file type and
+	// inode numbers are immutable after node creation. Setting the size is often
+	// allowed by kernfs files but does not do anything. If some other behavior is
+	// needed, the embedder should consider extending SetStat.
+	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
 		return syserror.EPERM
 	}
+	if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
+		return syserror.EISDIR
+	}
 	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(atomic.LoadUint32(&a.uid)), auth.KGID(atomic.LoadUint32(&a.gid))); err != nil {
 		return err
 	}
@@ -285,12 +312,19 @@ func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *aut
 		atomic.StoreUint32(&a.gid, stat.GID)
 	}
 
-	// Note that not all fields are modifiable. For example, the file type and
-	// inode numbers are immutable after node creation.
-
-	// TODO(gvisor.dev/issue/1193): Implement other stat fields like timestamps.
-	// Also, STATX_SIZE will need some special handling, because read-only static
-	// files should return EIO for truncate operations.
+	now := ktime.NowFromContext(ctx).Nanoseconds()
+	if stat.Mask&linux.STATX_ATIME != 0 {
+		if stat.Atime.Nsec == linux.UTIME_NOW {
+			stat.Atime = linux.NsecToStatxTimestamp(now)
+		}
+		atomic.StoreInt64(&a.atime, stat.Atime.ToNsec())
+	}
+	if stat.Mask&linux.STATX_MTIME != 0 {
+		if stat.Mtime.Nsec == linux.UTIME_NOW {
+			stat.Mtime = linux.NsecToStatxTimestamp(now)
+		}
+		atomic.StoreInt64(&a.mtime, stat.Mtime.ToNsec())
+	}
 
 	return nil
 }
@@ -321,13 +355,17 @@ func (a *InodeAttrs) DecLinks() {
 	}
 }
 
+// +stateify savable
 type slot struct {
-	Name   string
-	Dentry *vfs.Dentry
+	name   string
+	inode  Inode
+	static bool
 	slotEntry
 }
 
 // OrderedChildrenOptions contains initialization options for OrderedChildren.
+//
+// +stateify savable
 type OrderedChildrenOptions struct {
 	// Writable indicates whether vfs.FilesystemImpl methods implemented by
 	// OrderedChildren may modify the tracked children. This applies to
@@ -337,20 +375,28 @@ type OrderedChildrenOptions struct {
 }
 
 // OrderedChildren partially implements the Inode interface. OrderedChildren can
-// be embedded in directory inodes to keep track of the children in the
+// be embedded in directory inodes to keep track of children in the
 // directory, and can then be used to implement a generic directory FD -- see
-// GenericDirectoryFD. OrderedChildren is not compatible with dynamic
-// directories.
+// GenericDirectoryFD.
+//
+// OrderedChildren can represent a node in an Inode tree. The children inodes
+// might be directories themselves using OrderedChildren; hence extending the
+// tree. The parent inode (OrderedChildren user) holds a ref on all its static
+// children. This lets the static inodes outlive their associated dentry.
+// While the dentry might have to be regenerated via a Lookup() call, we can
+// keep reusing the same static inode. These static children inodes are finally
+// DecRef'd when this directory inode is being destroyed. This makes
+// OrderedChildren suitable for static directory entries as well.
 //
 // Must be initialize with Init before first use.
+//
+// +stateify savable
 type OrderedChildren struct {
-	refs.AtomicRefCount
-
 	// Can children be modified by user syscalls? It set to false, interface
 	// methods that would modify the children return EPERM. Immutable.
 	writable bool
 
-	mu    sync.RWMutex
+	mu    sync.RWMutex `state:"nosave"`
 	order slotList
 	set   map[string]*slot
 }
@@ -361,36 +407,66 @@ func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
 	o.set = make(map[string]*slot)
 }
 
-// DecRef implements Inode.DecRef.
-func (o *OrderedChildren) DecRef(ctx context.Context) {
-	o.AtomicRefCount.DecRefWithDestructor(ctx, func(context.Context) {
-		o.mu.Lock()
-		defer o.mu.Unlock()
-		o.order.Reset()
-		o.set = nil
-	})
+// Destroy clears the children stored in o. It should be called by structs
+// embedding OrderedChildren upon destruction, i.e. when their reference count
+// reaches zero.
+func (o *OrderedChildren) Destroy(ctx context.Context) {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	// Drop the ref that o owns on the static inodes it holds.
+	for _, s := range o.set {
+		if s.static {
+			s.inode.DecRef(ctx)
+		}
+	}
+	o.order.Reset()
+	o.set = nil
 }
 
-// Populate inserts children into this OrderedChildren, and d's dentry
-// cache. Populate returns the number of directories inserted, which the caller
+// Populate inserts static children into this OrderedChildren.
+// Populate returns the number of directories inserted, which the caller
 // may use to update the link count for the parent directory.
 //
-// Precondition: d must represent a directory inode. children must not contain
-// any conflicting entries already in o.
-func (o *OrderedChildren) Populate(d *Dentry, children map[string]*Dentry) uint32 {
+// Precondition:
+//   * d must represent a directory inode.
+//   * children must not contain any conflicting entries already in o.
+//   * Caller must hold a reference on all inodes passed.
+//
+// Postcondition: Caller's references on inodes are transferred to o.
+func (o *OrderedChildren) Populate(children map[string]Inode) uint32 {
 	var links uint32
 	for name, child := range children {
-		if child.isDir() {
+		if child.Mode().IsDir() {
 			links++
 		}
-		if err := o.Insert(name, child.VFSDentry()); err != nil {
-			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v) into %+v", name, child, d))
+		if err := o.insert(name, child, true); err != nil {
+			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child))
 		}
-		d.InsertChild(name, child)
 	}
 	return links
 }
 
+// Lookup implements Inode.Lookup.
+func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) {
+	o.mu.RLock()
+	defer o.mu.RUnlock()
+
+	s, ok := o.set[name]
+	if !ok {
+		return nil, syserror.ENOENT
+	}
+
+	s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
+	return s.inode, nil
+}
+
+// IterDirents implements Inode.IterDirents.
+func (o *OrderedChildren) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	// All entries from OrderedChildren have already been handled in
+	// GenericDirectoryFD.IterDirents.
+	return offset, nil
+}
+
 // HasChildren implements Inode.HasChildren.
 func (o *OrderedChildren) HasChildren() bool {
 	o.mu.RLock()
@@ -398,17 +474,27 @@ func (o *OrderedChildren) HasChildren() bool {
 	return len(o.set) > 0
 }
 
-// Insert inserts child into o. This ignores the writability of o, as this is
-// not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
-func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
+// Insert inserts a dynamic child into o. This ignores the writability of o, as
+// this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
+func (o *OrderedChildren) Insert(name string, child Inode) error {
+	return o.insert(name, child, false)
+}
+
+// insert inserts child into o.
+//
+// Precondition: Caller must be holding a ref on child if static is true.
+//
+// Postcondition: Caller's ref on child is transferred to o if static is true.
+func (o *OrderedChildren) insert(name string, child Inode, static bool) error {
 	o.mu.Lock()
 	defer o.mu.Unlock()
 	if _, ok := o.set[name]; ok {
 		return syserror.EEXIST
 	}
 	s := &slot{
-		Name:   name,
-		Dentry: child,
+		name:   name,
+		inode:  child,
+		static: static,
 	}
 	o.order.PushBack(s)
 	o.set[name] = s
@@ -418,44 +504,49 @@ func (o *OrderedChildren) Insert(name string, child *vfs.Dentry) error {
 // Precondition: caller must hold o.mu for writing.
 func (o *OrderedChildren) removeLocked(name string) {
 	if s, ok := o.set[name]; ok {
+		if s.static {
+			panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode))
+		}
 		delete(o.set, name)
 		o.order.Remove(s)
 	}
 }
 
 // Precondition: caller must hold o.mu for writing.
-func (o *OrderedChildren) replaceChildLocked(name string, new *vfs.Dentry) *vfs.Dentry {
+func (o *OrderedChildren) replaceChildLocked(ctx context.Context, name string, newI Inode) {
 	if s, ok := o.set[name]; ok {
+		if s.static {
+			panic(fmt.Sprintf("replacing a static inode: %v", s.inode))
+		}
+
 		// Existing slot with given name, simply replace the dentry.
-		var old *vfs.Dentry
-		old, s.Dentry = s.Dentry, new
-		return old
+		s.inode = newI
 	}
 
 	// No existing slot with given name, create and hash new slot.
 	s := &slot{
-		Name:   name,
-		Dentry: new,
+		name:   name,
+		inode:  newI,
+		static: false,
 	}
 	o.order.PushBack(s)
 	o.set[name] = s
-	return nil
 }
 
 // Precondition: caller must hold o.mu for reading or writing.
-func (o *OrderedChildren) checkExistingLocked(name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
 	s, ok := o.set[name]
 	if !ok {
 		return syserror.ENOENT
 	}
-	if s.Dentry != child {
-		panic(fmt.Sprintf("Dentry hashed into inode doesn't match what vfs thinks! OrderedChild: %+v, vfs: %+v", s.Dentry, child))
+	if s.inode != child {
+		panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! OrderedChild: %+v, kernfs: %+v", s.inode, child))
 	}
 	return nil
 }
 
 // Unlink implements Inode.Unlink.
-func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.Dentry) error {
+func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error {
 	if !o.writable {
 		return syserror.EPERM
 	}
@@ -470,13 +561,14 @@ func (o *OrderedChildren) Unlink(ctx context.Context, name string, child *vfs.De
 	return nil
 }
 
-// Rmdir implements Inode.Rmdir.
-func (o *OrderedChildren) RmDir(ctx context.Context, name string, child *vfs.Dentry) error {
+// RmDir implements Inode.RmDir.
+func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error {
 	// We're not responsible for checking that child is a directory, that it's
 	// empty, or updating any link counts; so this is the same as unlink.
 	return o.Unlink(ctx, name, child)
 }
 
+// +stateify savable
 type renameAcrossDifferentImplementationsError struct{}
 
 func (renameAcrossDifferentImplementationsError) Error() string {
@@ -492,13 +584,13 @@ func (renameAcrossDifferentImplementationsError) Error() string {
 // that will support Rename.
 //
 // Postcondition: reference on any replaced dentry transferred to caller.
-func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (*vfs.Dentry, error) {
-	dst, ok := dstDir.Impl().(*Dentry).inode.(interface{}).(*OrderedChildren)
+func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error {
+	dst, ok := dstDir.(interface{}).(*OrderedChildren)
 	if !ok {
-		return nil, renameAcrossDifferentImplementationsError{}
+		return renameAcrossDifferentImplementationsError{}
 	}
 	if !o.writable || !dst.writable {
-		return nil, syserror.EPERM
+		return syserror.EPERM
 	}
 	// Note: There's a potential deadlock below if concurrent calls to Rename
 	// refer to the same src and dst directories in reverse. We avoid any
@@ -511,12 +603,12 @@ func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, c
 		defer dst.mu.Unlock()
 	}
 	if err := o.checkExistingLocked(oldname, child); err != nil {
-		return nil, err
+		return err
 	}
 
 	// TODO(gvisor.dev/issue/3027): Check sticky bit before removing.
-	replaced := dst.replaceChildLocked(newname, child)
-	return replaced, nil
+	dst.replaceChildLocked(ctx, newname, child)
+	return nil
 }
 
 // nthLocked returns an iterator to the nth child tracked by this object. The
@@ -535,12 +627,14 @@ func (o *OrderedChildren) nthLocked(i int64) *slot {
 }
 
 // InodeSymlink partially implements Inode interface for symlinks.
+//
+// +stateify savable
 type InodeSymlink struct {
 	InodeNotDirectory
 }
 
 // Open implements Inode.Open.
-func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ELOOP
 }
 
@@ -549,43 +643,46 @@ func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.D
 //
 // +stateify savable
 type StaticDirectory struct {
-	InodeNotSymlink
-	InodeDirectoryNoNewChildren
+	InodeAlwaysValid
 	InodeAttrs
-	InodeNoDynamicLookup
+	InodeDirectoryNoNewChildren
+	InodeNoStatFS
+	InodeNotSymlink
+	InodeTemporary
 	OrderedChildren
+	StaticDirectoryRefs
 
-	locks vfs.FileLocks
+	locks  vfs.FileLocks
+	fdOpts GenericDirectoryFDOptions
 }
 
 var _ Inode = (*StaticDirectory)(nil)
 
 // NewStaticDir creates a new static directory and returns its dentry.
-func NewStaticDir(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]*Dentry) *Dentry {
+func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode {
 	inode := &StaticDirectory{}
-	inode.Init(creds, devMajor, devMinor, ino, perm)
-
-	dentry := &Dentry{}
-	dentry.Init(inode)
+	inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts)
+	inode.EnableLeakCheck()
 
 	inode.OrderedChildren.Init(OrderedChildrenOptions{})
-	links := inode.OrderedChildren.Populate(dentry, children)
+	links := inode.OrderedChildren.Populate(children)
 	inode.IncLinks(links)
 
-	return dentry
+	return inode
 }
 
 // Init initializes StaticDirectory.
-func (s *StaticDirectory) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (s *StaticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
+	s.fdOpts = fdOpts
+	s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
 }
 
-// Open implements kernfs.Inode.
-func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := NewGenericDirectoryFD(rp.Mount(), vfsd, &s.OrderedChildren, &s.locks, &opts)
+// Open implements Inode.Open.
+func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
 	if err != nil {
 		return nil, err
 	}
@@ -597,10 +694,38 @@ func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credenti
 	return syserror.EPERM
 }
 
-// AlwaysValid partially implements kernfs.inodeDynamicLookup.
-type AlwaysValid struct{}
+// DecRef implements Inode.DecRef.
+func (s *StaticDirectory) DecRef(ctx context.Context) {
+	s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) })
+}
 
-// Valid implements kernfs.inodeDynamicLookup.
-func (*AlwaysValid) Valid(context.Context) bool {
+// InodeAlwaysValid partially implements Inode.
+//
+// +stateify savable
+type InodeAlwaysValid struct{}
+
+// Valid implements Inode.Valid.
+func (*InodeAlwaysValid) Valid(context.Context) bool {
 	return true
 }
+
+// InodeTemporary partially implements Inode.
+//
+// +stateify savable
+type InodeTemporary struct{}
+
+// Keep implements Inode.Keep.
+func (*InodeTemporary) Keep() bool {
+	return false
+}
+
+// InodeNoStatFS partially implements the Inode interface, where the client
+// filesystem doesn't support statfs(2).
+//
+// +stateify savable
+type InodeNoStatFS struct{}
+
+// StatFS implements Inode.StatFS.
+func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return linux.Statfs{}, syserror.ENOSYS
+}
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs.go b/pkg/sentry/fsimpl/kernfs/kernfs.go
index 51dbc050c..5c5e09ac5 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs.go
@@ -29,12 +29,16 @@
 //
 // Reference Model:
 //
-// Kernfs dentries represents named pointers to inodes. Dentries and inode have
+// Kernfs dentries represents named pointers to inodes. Kernfs is solely
+// reponsible for maintaining and modifying its dentry tree; inode
+// implementations can not access the tree. Dentries and inodes have
 // independent lifetimes and reference counts. A child dentry unconditionally
 // holds a reference on its parent directory's dentry. A dentry also holds a
-// reference on the inode it points to. Multiple dentries can point to the same
-// inode (for example, in the case of hardlinks). File descriptors hold a
-// reference to the dentry they're opened on.
+// reference on the inode it points to (although that might not be the only
+// reference on the inode). Due to this inodes can outlive the dentries that
+// point to them. Multiple dentries can point to the same inode (for example,
+// in the case of hardlinks). File descriptors hold a reference to the dentry
+// they're opened on.
 //
 // Dentries are guaranteed to exist while holding Filesystem.mu for
 // reading. Dropping dentries require holding Filesystem.mu for writing. To
@@ -47,8 +51,8 @@
 //   kernfs.Dentry.dirMu
 //     vfs.VirtualFilesystem.mountMu
 //       vfs.Dentry.mu
-//   kernfs.Filesystem.droppedDentriesMu
 //   (inode implementation locks, if any)
+// kernfs.Filesystem.droppedDentriesMu
 package kernfs
 
 import (
@@ -57,7 +61,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
@@ -66,15 +69,17 @@ import (
 // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
 // filesystem. Concrete implementations are expected to embed this in their own
 // Filesystem type.
+//
+// +stateify savable
 type Filesystem struct {
 	vfsfs vfs.Filesystem
 
-	droppedDentriesMu sync.Mutex
+	droppedDentriesMu sync.Mutex `state:"nosave"`
 
 	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
 	// used to defer dentry destruction until mu can be acquired for
 	// writing. Protected by droppedDentriesMu.
-	droppedDentries []*vfs.Dentry
+	droppedDentries []*Dentry
 
 	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
 	// for reading guarantees continued existence of any resolved dentries, but
@@ -93,22 +98,32 @@ type Filesystem struct {
 	// example:
 	//
 	//   fs.mu.RLock()
-	//   fs.mu.processDeferredDecRefs()
+	//   defer fs.processDeferredDecRefs()
 	//   defer fs.mu.RUnlock()
 	//   ...
 	//   fs.deferDecRef(dentry)
-	mu sync.RWMutex
+	mu sync.RWMutex `state:"nosave"`
 
 	// nextInoMinusOne is used to to allocate inode numbers on this
 	// filesystem. Must be accessed by atomic operations.
 	nextInoMinusOne uint64
+
+	// cachedDentries contains all dentries with 0 references. (Due to race
+	// conditions, it may also contain dentries with non-zero references.)
+	// cachedDentriesLen is the number of dentries in cachedDentries. These
+	// fields are protected by mu.
+	cachedDentries    dentryList
+	cachedDentriesLen uint64
+
+	// MaxCachedDentries is the maximum size of cachedDentries. If not set,
+	// defaults to 0 and kernfs does not cache any dentries. This is immutable.
+	MaxCachedDentries uint64
 }
 
 // deferDecRef defers dropping a dentry ref until the next call to
 // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
-//
-// Precondition: d must not already be pending destruction.
-func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
+// This may be called while Filesystem.mu or Dentry.dirMu is locked.
+func (fs *Filesystem) deferDecRef(d *Dentry) {
 	fs.droppedDentriesMu.Lock()
 	fs.droppedDentries = append(fs.droppedDentries, d)
 	fs.droppedDentriesMu.Unlock()
@@ -116,17 +131,14 @@ func (fs *Filesystem) deferDecRef(d *vfs.Dentry) {
 
 // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
 // droppedDentries list. See comment on Filesystem.mu.
+//
+// Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
 func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
-	fs.mu.Lock()
-	fs.processDeferredDecRefsLocked(ctx)
-	fs.mu.Unlock()
-}
-
-// Precondition: fs.mu must be held for writing.
-func (fs *Filesystem) processDeferredDecRefsLocked(ctx context.Context) {
 	fs.droppedDentriesMu.Lock()
 	for _, d := range fs.droppedDentries {
-		d.DecRef(ctx)
+		// Defer the DecRef call so that we are not holding droppedDentriesMu
+		// when DecRef is called.
+		defer d.DecRef(ctx)
 	}
 	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
 	fs.droppedDentriesMu.Unlock()
@@ -155,15 +167,24 @@ const (
 //
 // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
 // named reference to an inode. A dentry generally lives as long as it's part of
-// a mounted filesystem tree. Kernfs doesn't cache dentries once all references
-// to them are removed. Dentries hold a single reference to the inode they point
+// a mounted filesystem tree. Kernfs drops dentries once all references to them
+// are dropped. Dentries hold a single reference to the inode they point
 // to, and child dentries hold a reference on their parent.
 //
 // Must be initialized by Init prior to first use.
+//
+// +stateify savable
 type Dentry struct {
 	vfsd vfs.Dentry
 
-	refs.AtomicRefCount
+	// refs is the reference count. When refs reaches 0, the dentry may be
+	// added to the cache or destroyed. If refs == -1, the dentry has already
+	// been destroyed. refs are allowed to go to 0 and increase again. refs is
+	// accessed using atomic memory operations.
+	refs int64
+
+	// fs is the owning filesystem. fs is immutable.
+	fs *Filesystem
 
 	// flags caches useful information about the dentry from the inode. See the
 	// dflags* consts above. Must be accessed by atomic ops.
@@ -172,21 +193,177 @@ type Dentry struct {
 	parent *Dentry
 	name   string
 
+	// If cached is true, dentryEntry links dentry into
+	// Filesystem.cachedDentries. cached and dentryEntry are protected by
+	// Filesystem.mu.
+	cached bool
+	dentryEntry
+
 	// dirMu protects children and the names of child Dentries.
-	dirMu    sync.Mutex
+	//
+	// Note that holding fs.mu for writing is not sufficient;
+	// revalidateChildLocked(), which is a very hot path, may modify children with
+	// fs.mu acquired for reading only.
+	dirMu    sync.Mutex `state:"nosave"`
 	children map[string]*Dentry
 
 	inode Inode
 }
 
+// IncRef implements vfs.DentryImpl.IncRef.
+func (d *Dentry) IncRef() {
+	// d.refs may be 0 if d.fs.mu is locked, which serializes against
+	// d.cacheLocked().
+	atomic.AddInt64(&d.refs, 1)
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef.
+func (d *Dentry) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&d.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&d.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// DecRef implements vfs.DentryImpl.DecRef.
+func (d *Dentry) DecRef(ctx context.Context) {
+	if refs := atomic.AddInt64(&d.refs, -1); refs == 0 {
+		d.fs.mu.Lock()
+		d.cacheLocked(ctx)
+		d.fs.mu.Unlock()
+	} else if refs < 0 {
+		panic("kernfs.Dentry.DecRef() called without holding a reference")
+	}
+}
+
+// cacheLocked should be called after d's reference count becomes 0. The ref
+// count check may happen before acquiring d.fs.mu so there might be a race
+// condition where the ref count is increased again by the time the caller
+// acquires d.fs.mu. This race is handled.
+// Only reachable dentries are added to the cache. However, a dentry might
+// become unreachable *while* it is in the cache due to invalidation.
+//
+// Preconditions: d.fs.mu must be locked for writing.
+func (d *Dentry) cacheLocked(ctx context.Context) {
+	// Dentries with a non-zero reference count must be retained. (The only way
+	// to obtain a reference on a dentry with zero references is via path
+	// resolution, which requires d.fs.mu, so if d.refs is zero then it will
+	// remain zero while we hold d.fs.mu for writing.)
+	refs := atomic.LoadInt64(&d.refs)
+	if refs == -1 {
+		// Dentry has already been destroyed.
+		panic(fmt.Sprintf("cacheLocked called on a dentry which has already been destroyed: %v", d))
+	}
+	if refs > 0 {
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		return
+	}
+	// If the dentry is deleted and invalidated or has no parent, then it is no
+	// longer reachable by path resolution and should be dropped immediately
+	// because it has zero references.
+	// Note that a dentry may not always have a parent; for example magic links
+	// as described in Inode.Getlink.
+	if isDead := d.VFSDentry().IsDead(); isDead || d.parent == nil {
+		if !isDead {
+			d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
+		}
+		if d.cached {
+			d.fs.cachedDentries.Remove(d)
+			d.fs.cachedDentriesLen--
+			d.cached = false
+		}
+		d.destroyLocked(ctx)
+		return
+	}
+	// If d is already cached, just move it to the front of the LRU.
+	if d.cached {
+		d.fs.cachedDentries.Remove(d)
+		d.fs.cachedDentries.PushFront(d)
+		return
+	}
+	// Cache the dentry, then evict the least recently used cached dentry if
+	// the cache becomes over-full.
+	d.fs.cachedDentries.PushFront(d)
+	d.fs.cachedDentriesLen++
+	d.cached = true
+	if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries {
+		return
+	}
+	// Evict the least recently used dentry because cache size is greater than
+	// max cache size (configured on mount).
+	victim := d.fs.cachedDentries.Back()
+	d.fs.cachedDentries.Remove(victim)
+	d.fs.cachedDentriesLen--
+	victim.cached = false
+	// victim.refs may have become non-zero from an earlier path resolution
+	// after it was inserted into fs.cachedDentries.
+	if atomic.LoadInt64(&victim.refs) == 0 {
+		if !victim.vfsd.IsDead() {
+			victim.parent.dirMu.Lock()
+			// Note that victim can't be a mount point (in any mount
+			// namespace), since VFS holds references on mount points.
+			d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, victim.VFSDentry())
+			delete(victim.parent.children, victim.name)
+			victim.parent.dirMu.Unlock()
+		}
+		victim.destroyLocked(ctx)
+	}
+	// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
+	// back down to fs.MaxCachedDentries, so we don't loop.
+}
+
+// destroyLocked destroys the dentry.
+//
+// Preconditions:
+// * d.fs.mu must be locked for writing.
+// * d.refs == 0.
+// * d should have been removed from d.parent.children, i.e. d is not reachable
+//   by path traversal.
+// * d.vfsd.IsDead() is true.
+func (d *Dentry) destroyLocked(ctx context.Context) {
+	switch atomic.LoadInt64(&d.refs) {
+	case 0:
+		// Mark the dentry destroyed.
+		atomic.StoreInt64(&d.refs, -1)
+	case -1:
+		panic("dentry.destroyLocked() called on already destroyed dentry")
+	default:
+		panic("dentry.destroyLocked() called with references on the dentry")
+	}
+
+	d.inode.DecRef(ctx) // IncRef from Init.
+	d.inode = nil
+
+	// Drop the reference held by d on its parent without recursively locking
+	// d.fs.mu.
+	if d.parent != nil {
+		if refs := atomic.AddInt64(&d.parent.refs, -1); refs == 0 {
+			d.parent.cacheLocked(ctx)
+		} else if refs < 0 {
+			panic("kernfs.Dentry.DecRef() called without holding a reference")
+		}
+	}
+}
+
 // Init initializes this dentry.
 //
 // Precondition: Caller must hold a reference on inode.
 //
 // Postcondition: Caller's reference on inode is transferred to the dentry.
-func (d *Dentry) Init(inode Inode) {
+func (d *Dentry) Init(fs *Filesystem, inode Inode) {
 	d.vfsd.Init(d)
+	d.fs = fs
 	d.inode = inode
+	atomic.StoreInt64(&d.refs, 1)
 	ftype := inode.Mode().FileType()
 	if ftype == linux.ModeDirectory {
 		d.flags |= dflagsIsDir
@@ -211,20 +388,6 @@ func (d *Dentry) isSymlink() bool {
 	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
 }
 
-// DecRef implements vfs.DentryImpl.DecRef.
-func (d *Dentry) DecRef(ctx context.Context) {
-	d.AtomicRefCount.DecRefWithDestructor(ctx, d.destroy)
-}
-
-// Precondition: Dentry must be removed from VFS' dentry cache.
-func (d *Dentry) destroy(ctx context.Context) {
-	d.inode.DecRef(ctx) // IncRef from Init.
-	d.inode = nil
-	if d.parent != nil {
-		d.parent.DecRef(ctx) // IncRef from Dentry.InsertChild.
-	}
-}
-
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 //
 // Although Linux technically supports inotify on pseudo filesystems (inotify
@@ -240,25 +403,29 @@ func (d *Dentry) Watches() *vfs.Watches {
 // OnZeroWatches implements vfs.Dentry.OnZeroWatches.
 func (d *Dentry) OnZeroWatches(context.Context) {}
 
-// InsertChild inserts child into the vfs dentry cache with the given name under
-// this dentry. This does not update the directory inode, so calling this on
-// its own isn't sufficient to insert a child into a directory. InsertChild
-// updates the link count on d if required.
+// insertChild inserts child into the vfs dentry cache with the given name under
+// this dentry. This does not update the directory inode, so calling this on its
+// own isn't sufficient to insert a child into a directory.
 //
-// Precondition: d must represent a directory inode.
-func (d *Dentry) InsertChild(name string, child *Dentry) {
+// Preconditions:
+// * d must represent a directory inode.
+// * d.fs.mu must be locked for at least reading.
+func (d *Dentry) insertChild(name string, child *Dentry) {
 	d.dirMu.Lock()
 	d.insertChildLocked(name, child)
 	d.dirMu.Unlock()
 }
 
-// insertChildLocked is equivalent to InsertChild, with additional
+// insertChildLocked is equivalent to insertChild, with additional
 // preconditions.
 //
-// Precondition: d.dirMu must be locked.
+// Preconditions:
+// * d must represent a directory inode.
+// * d.dirMu must be locked.
+// * d.fs.mu must be locked for at least reading.
 func (d *Dentry) insertChildLocked(name string, child *Dentry) {
 	if !d.isDir() {
-		panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d))
+		panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d))
 	}
 	d.IncRef() // DecRef in child's Dentry.destroy.
 	child.parent = d
@@ -289,7 +456,6 @@ func (d *Dentry) Inode() Inode {
 //
 // - Checking that dentries passed to methods are of the appropriate file type.
 // - Checking permissions.
-// - Updating link and reference counts.
 //
 // Specific responsibilities of implementations are documented below.
 type Inode interface {
@@ -299,7 +465,8 @@ type Inode interface {
 	inodeRefs
 
 	// Methods related to node metadata. A generic implementation is provided by
-	// InodeAttrs.
+	// InodeAttrs. Note that a concrete filesystem using kernfs is responsible for
+	// managing link counts.
 	inodeMetadata
 
 	// Method for inodes that represent symlink. InodeNotSymlink provides a
@@ -310,18 +477,26 @@ type Inode interface {
 	// a blanket implementation for all non-directory inodes.
 	inodeDirectory
 
-	// Method for inodes that represent dynamic directories and their
-	// children. InodeNoDynamicLookup provides a blanket implementation for all
-	// non-dynamic-directory inodes.
-	inodeDynamicLookup
-
 	// Open creates a file description for the filesystem object represented by
 	// this inode. The returned file description should hold a reference on the
-	// inode for its lifetime.
+	// dentry for its lifetime.
 	//
 	// Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing
 	// the inode on which Open() is being called.
-	Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+	Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)
+
+	// StatFS returns filesystem statistics for the client filesystem. This
+	// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
+	// doesn't support statfs(2), this should return ENOSYS.
+	StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)
+
+	// Keep indicates whether the dentry created after Inode.Lookup should be
+	// kept in the kernfs dentry tree.
+	Keep() bool
+
+	// Valid should return true if this inode is still valid, or needs to
+	// be resolved again by a call to Lookup.
+	Valid(ctx context.Context) bool
 }
 
 type inodeRefs interface {
@@ -354,8 +529,8 @@ type inodeMetadata interface {
 // Precondition: All methods in this interface may only be called on directory
 // inodes.
 type inodeDirectory interface {
-	// The New{File,Dir,Node,Symlink} methods below should return a new inode
-	// hashed into this inode.
+	// The New{File,Dir,Node,Link,Symlink} methods below should return a new inode
+	// that will be hashed into the dentry tree.
 	//
 	// These inode constructors are inode-level operations rather than
 	// filesystem-level operations to allow client filesystems to mix different
@@ -366,75 +541,69 @@ type inodeDirectory interface {
 	HasChildren() bool
 
 	// NewFile creates a new regular file inode.
-	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error)
+	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)
 
 	// NewDir creates a new directory inode.
-	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error)
+	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)
 
 	// NewLink creates a new hardlink to a specified inode in this
 	// directory. Implementations should create a new kernfs Dentry pointing to
 	// target, and update target's link count.
-	NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error)
+	NewLink(ctx context.Context, name string, target Inode) (Inode, error)
 
 	// NewSymlink creates a new symbolic link inode.
-	NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error)
+	NewSymlink(ctx context.Context, name, target string) (Inode, error)
 
 	// NewNode creates a new filesystem node for a mknod syscall.
-	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error)
+	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)
 
 	// Unlink removes a child dentry from this directory inode.
-	Unlink(ctx context.Context, name string, child *vfs.Dentry) error
+	Unlink(ctx context.Context, name string, child Inode) error
 
 	// RmDir removes an empty child directory from this directory
 	// inode. Implementations must update the parent directory's link count,
 	// if required. Implementations are not responsible for checking that child
 	// is a directory, checking for an empty directory.
-	RmDir(ctx context.Context, name string, child *vfs.Dentry) error
+	RmDir(ctx context.Context, name string, child Inode) error
 
 	// Rename is called on the source directory containing an inode being
 	// renamed. child should point to the resolved child in the source
-	// directory. If Rename replaces a dentry in the destination directory, it
-	// should return the replaced dentry or nil otherwise.
+	// directory.
 	//
 	// Precondition: Caller must serialize concurrent calls to Rename.
-	Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error)
-}
+	Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
 
-type inodeDynamicLookup interface {
-	// Lookup should return an appropriate dentry if name should resolve to a
-	// child of this dynamic directory inode. This gives the directory an
-	// opportunity on every lookup to resolve additional entries that aren't
-	// hashed into the directory. This is only called when the inode is a
-	// directory. If the inode is not a directory, or if the directory only
-	// contains a static set of children, the implementer can unconditionally
-	// return an appropriate error (ENOTDIR and ENOENT respectively).
+	// Lookup should return an appropriate inode if name should resolve to a
+	// child of this directory inode. This gives the directory an opportunity
+	// on every lookup to resolve additional entries. This is only called when
+	// the inode is a directory.
 	//
-	// The child returned by Lookup will be hashed into the VFS dentry tree. Its
-	// lifetime can be controlled by the filesystem implementation with an
-	// appropriate implementation of Valid.
+	// The child returned by Lookup will be hashed into the VFS dentry tree,
+	// at least for the duration of the current FS operation.
 	//
-	// Lookup returns the child with an extra reference and the caller owns this
-	// reference.
-	Lookup(ctx context.Context, name string) (*vfs.Dentry, error)
-
-	// Valid should return true if this inode is still valid, or needs to
-	// be resolved again by a call to Lookup.
-	Valid(ctx context.Context) bool
+	// Lookup must return the child with an extra reference whose ownership is
+	// transferred to the dentry that is created to point to that inode. If
+	// Inode.Keep returns false, that new dentry will be dropped at the end of
+	// the current filesystem operation (before returning back to the VFS
+	// layer) if no other ref is picked on that dentry. If Inode.Keep returns
+	// true, then the dentry will be cached into the dentry tree until it is
+	// Unlink'd or RmDir'd.
+	Lookup(ctx context.Context, name string) (Inode, error)
 
 	// IterDirents is used to iterate over dynamically created entries. It invokes
-	// cb on each entry in the directory represented by the FileDescription.
+	// cb on each entry in the directory represented by the Inode.
 	// 'offset' is the offset for the entire IterDirents call, which may include
 	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
 	// inside the entries returned by this IterDirents invocation. In other words,
 	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
 	// the return value, while 'relOffset' is the place to start iteration.
-	IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
+	IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
 }
 
 type inodeSymlink interface {
 	// Readlink returns the target of a symbolic link. If an inode is not a
 	// symlink, the implementation should return EINVAL.
-	Readlink(ctx context.Context) (string, error)
+	Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
 
 	// Getlink returns the target of a symbolic link, as used by path
 	// resolution:
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index c5d5afedf..2418eec44 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -36,7 +36,7 @@ const staticFileContent = "This is sample content for a static test file."
 
 // RootDentryFn is a generator function for creating the root dentry of a test
 // filesystem. See newTestSystem.
-type RootDentryFn func(*auth.Credentials, *filesystem) *kernfs.Dentry
+type RootDentryFn func(context.Context, *auth.Credentials, *filesystem) kernfs.Inode
 
 // newTestSystem sets up a minimal environment for running a test, including an
 // instance of a test filesystem. Tests can control the contents of the
@@ -52,7 +52,7 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *testutil.System {
 	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
+	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create testfs root mount: %v", err)
 	}
@@ -72,14 +72,11 @@ type file struct {
 	content string
 }
 
-func (fs *filesystem) newFile(creds *auth.Credentials, content string) *kernfs.Dentry {
+func (fs *filesystem) newFile(ctx context.Context, creds *auth.Credentials, content string) kernfs.Inode {
 	f := &file{}
 	f.content = content
-	f.DynamicBytesFile.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(f)
-	return d
+	f.DynamicBytesFile.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), f, 0777)
+	return f
 }
 
 func (f *file) Generate(ctx context.Context, buf *bytes.Buffer) error {
@@ -96,100 +93,112 @@ func (*attrs) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.S
 }
 
 type readonlyDir struct {
+	readonlyDirRefs
 	attrs
-	kernfs.InodeNotSymlink
-	kernfs.InodeNoDynamicLookup
+	kernfs.InodeAlwaysValid
 	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNoStatFS
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
-
-	dentry kernfs.Dentry
 }
 
-func (fs *filesystem) newReadonlyDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newReadonlyDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
 	dir := &readonlyDir{}
-	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	dir.dentry.Init(dir)
-
-	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
-
-	return &dir.dentry
+	dir.EnableLeakCheck()
+	dir.IncLinks(dir.OrderedChildren.Populate(contents))
+	return dir
 }
 
-func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+func (d *readonlyDir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
+func (d *readonlyDir) DecRef(ctx context.Context) {
+	d.readonlyDirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
 type dir struct {
+	dirRefs
 	attrs
+	kernfs.InodeAlwaysValid
 	kernfs.InodeNotSymlink
-	kernfs.InodeNoDynamicLookup
+	kernfs.InodeNoStatFS
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
 
-	fs     *filesystem
-	dentry kernfs.Dentry
+	fs *filesystem
 }
 
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
 	dir := &dir{}
 	dir.fs = fs
-	dir.attrs.Init(creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
+	dir.attrs.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, fs.NextIno(), linux.ModeDirectory|mode)
 	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{Writable: true})
-	dir.dentry.Init(dir)
+	dir.EnableLeakCheck()
 
-	dir.IncLinks(dir.OrderedChildren.Populate(&dir.dentry, contents))
-
-	return &dir.dentry
+	dir.IncLinks(dir.OrderedChildren.Populate(contents))
+	return dir
 }
 
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) {
+func (d *dir) DecRef(ctx context.Context) {
+	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
+func (d *dir) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (kernfs.Inode, error) {
 	creds := auth.CredentialsFromContext(ctx)
-	dir := d.fs.newDir(creds, opts.Mode, nil)
-	dirVFSD := dir.VFSDentry()
-	if err := d.OrderedChildren.Insert(name, dirVFSD); err != nil {
+	dir := d.fs.newDir(ctx, creds, opts.Mode, nil)
+	if err := d.OrderedChildren.Insert(name, dir); err != nil {
 		dir.DecRef(ctx)
 		return nil, err
 	}
+	d.TouchCMtime(ctx)
 	d.IncLinks(1)
-	return dirVFSD, nil
+	return dir, nil
 }
 
-func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) {
+func (d *dir) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (kernfs.Inode, error) {
 	creds := auth.CredentialsFromContext(ctx)
-	f := d.fs.newFile(creds, "")
-	fVFSD := f.VFSDentry()
-	if err := d.OrderedChildren.Insert(name, fVFSD); err != nil {
+	f := d.fs.newFile(ctx, creds, "")
+	if err := d.OrderedChildren.Insert(name, f); err != nil {
 		f.DecRef(ctx)
 		return nil, err
 	}
-	return fVFSD, nil
+	d.TouchCMtime(ctx)
+	return f, nil
 }
 
-func (*dir) NewLink(context.Context, string, kernfs.Inode) (*vfs.Dentry, error) {
+func (*dir) NewLink(context.Context, string, kernfs.Inode) (kernfs.Inode, error) {
 	return nil, syserror.EPERM
 }
 
-func (*dir) NewSymlink(context.Context, string, string) (*vfs.Dentry, error) {
+func (*dir) NewSymlink(context.Context, string, string) (kernfs.Inode, error) {
 	return nil, syserror.EPERM
 }
 
-func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (*vfs.Dentry, error) {
+func (*dir) NewNode(context.Context, string, vfs.MknodOptions) (kernfs.Inode, error) {
 	return nil, syserror.EPERM
 }
 
@@ -197,19 +206,23 @@ func (fsType) Name() string {
 	return "kernfs"
 }
 
+func (fsType) Release(ctx context.Context) {}
+
 func (fst fsType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opt vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	fs := &filesystem{}
 	fs.VFSFilesystem().Init(vfsObj, &fst, fs)
-	root := fst.rootFn(creds, fs)
-	return fs.VFSFilesystem(), root.VFSDentry(), nil
+	root := fst.rootFn(ctx, creds, fs)
+	var d kernfs.Dentry
+	d.Init(&fs.Filesystem, root)
+	return fs.VFSFilesystem(), d.VFSDentry(), nil
 }
 
 // -------------------- Remainder of the file are test cases --------------------
 
 func TestBasic(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"file1": fs.newFile(creds, staticFileContent),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"file1": fs.newFile(ctx, creds, staticFileContent),
 		})
 	})
 	defer sys.Destroy()
@@ -217,9 +230,9 @@ func TestBasic(t *testing.T) {
 }
 
 func TestMkdirGetDentry(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"dir1": fs.newDir(creds, 0755, nil),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"dir1": fs.newDir(ctx, creds, 0755, nil),
 		})
 	})
 	defer sys.Destroy()
@@ -232,9 +245,9 @@ func TestMkdirGetDentry(t *testing.T) {
 }
 
 func TestReadStaticFile(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"file1": fs.newFile(creds, staticFileContent),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"file1": fs.newFile(ctx, creds, staticFileContent),
 		})
 	})
 	defer sys.Destroy()
@@ -258,9 +271,9 @@ func TestReadStaticFile(t *testing.T) {
 }
 
 func TestCreateNewFileInStaticDir(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
-			"dir1": fs.newDir(creds, 0755, nil),
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
+			"dir1": fs.newDir(ctx, creds, 0755, nil),
 		})
 	})
 	defer sys.Destroy()
@@ -285,8 +298,8 @@ func TestCreateNewFileInStaticDir(t *testing.T) {
 }
 
 func TestDirFDReadWrite(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, nil)
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, nil)
 	})
 	defer sys.Destroy()
 
@@ -309,14 +322,14 @@ func TestDirFDReadWrite(t *testing.T) {
 }
 
 func TestDirFDIterDirents(t *testing.T) {
-	sys := newTestSystem(t, func(creds *auth.Credentials, fs *filesystem) *kernfs.Dentry {
-		return fs.newReadonlyDir(creds, 0755, map[string]*kernfs.Dentry{
+	sys := newTestSystem(t, func(ctx context.Context, creds *auth.Credentials, fs *filesystem) kernfs.Inode {
+		return fs.newReadonlyDir(ctx, creds, 0755, map[string]kernfs.Inode{
 			// Fill root with nodes backed by various inode implementations.
-			"dir1": fs.newReadonlyDir(creds, 0755, nil),
-			"dir2": fs.newDir(creds, 0755, map[string]*kernfs.Dentry{
-				"dir3": fs.newDir(creds, 0755, nil),
+			"dir1": fs.newReadonlyDir(ctx, creds, 0755, nil),
+			"dir2": fs.newDir(ctx, creds, 0755, map[string]kernfs.Inode{
+				"dir3": fs.newDir(ctx, creds, 0755, nil),
 			}),
-			"file1": fs.newFile(creds, staticFileContent),
+			"file1": fs.newFile(ctx, creds, staticFileContent),
 		})
 	})
 	defer sys.Destroy()
diff --git a/pkg/sentry/fsimpl/kernfs/symlink.go b/pkg/sentry/fsimpl/kernfs/symlink.go
index 2ab3f53fd..a0736c0d6 100644
--- a/pkg/sentry/fsimpl/kernfs/symlink.go
+++ b/pkg/sentry/fsimpl/kernfs/symlink.go
@@ -24,10 +24,13 @@ import (
 
 // StaticSymlink provides an Inode implementation for symlinks that point to
 // a immutable target.
+//
+// +stateify savable
 type StaticSymlink struct {
 	InodeAttrs
 	InodeNoopRefCount
 	InodeSymlink
+	InodeNoStatFS
 
 	target string
 }
@@ -35,23 +38,20 @@ type StaticSymlink struct {
 var _ Inode = (*StaticSymlink)(nil)
 
 // NewStaticSymlink creates a new symlink file pointing to 'target'.
-func NewStaticSymlink(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) *Dentry {
+func NewStaticSymlink(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, target string) Inode {
 	inode := &StaticSymlink{}
-	inode.Init(creds, devMajor, devMinor, ino, target)
-
-	d := &Dentry{}
-	d.Init(inode)
-	return d
+	inode.Init(ctx, creds, devMajor, devMinor, ino, target)
+	return inode
 }
 
 // Init initializes the instance.
-func (s *StaticSymlink) Init(creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
+func (s *StaticSymlink) Init(ctx context.Context, creds *auth.Credentials, devMajor uint32, devMinor uint32, ino uint64, target string) {
 	s.target = target
-	s.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
+	s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeSymlink|0777)
 }
 
-// Readlink implements Inode.
-func (s *StaticSymlink) Readlink(_ context.Context) (string, error) {
+// Readlink implements Inode.Readlink.
+func (s *StaticSymlink) Readlink(_ context.Context, _ *vfs.Mount) (string, error) {
 	return s.target, nil
 }
 
diff --git a/pkg/sentry/fsimpl/kernfs/synthetic_directory.go b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
new file mode 100644
index 000000000..463d77d79
--- /dev/null
+++ b/pkg/sentry/fsimpl/kernfs/synthetic_directory.go
@@ -0,0 +1,113 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package kernfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// syntheticDirectory implements kernfs.Inode for a directory created by
+// MkdirAt(ForSyntheticMountpoint=true).
+//
+// +stateify savable
+type syntheticDirectory struct {
+	InodeAlwaysValid
+	InodeAttrs
+	InodeNoStatFS
+	InodeNotSymlink
+	OrderedChildren
+	syntheticDirectoryRefs
+
+	locks vfs.FileLocks
+}
+
+var _ Inode = (*syntheticDirectory)(nil)
+
+func newSyntheticDirectory(ctx context.Context, creds *auth.Credentials, perm linux.FileMode) Inode {
+	inode := &syntheticDirectory{}
+	inode.Init(ctx, creds, 0 /* devMajor */, 0 /* devMinor */, 0 /* ino */, perm)
+	return inode
+}
+
+func (dir *syntheticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+	if perm&^linux.PermissionsMask != 0 {
+		panic(fmt.Sprintf("perm contains non-permission bits: %#o", perm))
+	}
+	dir.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.S_IFDIR|perm)
+	dir.OrderedChildren.Init(OrderedChildrenOptions{
+		Writable: true,
+	})
+}
+
+// Open implements Inode.Open.
+func (dir *syntheticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := NewGenericDirectoryFD(rp.Mount(), d, &dir.OrderedChildren, &dir.locks, &opts, GenericDirectoryFDOptions{})
+	if err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// NewFile implements Inode.NewFile.
+func (dir *syntheticDirectory) NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// NewDir implements Inode.NewDir.
+func (dir *syntheticDirectory) NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error) {
+	if !opts.ForSyntheticMountpoint {
+		return nil, syserror.EPERM
+	}
+	subdirI := newSyntheticDirectory(ctx, auth.CredentialsFromContext(ctx), opts.Mode&linux.PermissionsMask)
+	if err := dir.OrderedChildren.Insert(name, subdirI); err != nil {
+		subdirI.DecRef(ctx)
+		return nil, err
+	}
+	dir.TouchCMtime(ctx)
+	return subdirI, nil
+}
+
+// NewLink implements Inode.NewLink.
+func (dir *syntheticDirectory) NewLink(ctx context.Context, name string, target Inode) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// NewSymlink implements Inode.NewSymlink.
+func (dir *syntheticDirectory) NewSymlink(ctx context.Context, name, target string) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// NewNode implements Inode.NewNode.
+func (dir *syntheticDirectory) NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error) {
+	return nil, syserror.EPERM
+}
+
+// DecRef implements Inode.DecRef.
+func (dir *syntheticDirectory) DecRef(ctx context.Context) {
+	dir.syntheticDirectoryRefs.DecRef(func() { dir.Destroy(ctx) })
+}
+
+// Keep implements Inode.Keep. This is redundant because inodes will never be
+// created via Lookup and inodes are always valid. Makes sense to return true
+// because these inodes are not temporary and should only be removed on RmDir.
+func (dir *syntheticDirectory) Keep() bool {
+	return true
+}
diff --git a/pkg/sentry/fsimpl/overlay/BUILD b/pkg/sentry/fsimpl/overlay/BUILD
index 8cf5b35d3..fd6c55921 100644
--- a/pkg/sentry/fsimpl/overlay/BUILD
+++ b/pkg/sentry/fsimpl/overlay/BUILD
@@ -21,14 +21,18 @@ go_library(
         "directory.go",
         "filesystem.go",
         "fstree.go",
-        "non_directory.go",
         "overlay.go",
+        "regular_file.go",
+        "save_restore.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/memmap",
@@ -37,5 +41,6 @@ go_library(
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/usermem",
+        "//pkg/waiter",
     ],
 )
diff --git a/pkg/sentry/fsimpl/overlay/copy_up.go b/pkg/sentry/fsimpl/overlay/copy_up.go
index b3d19ff82..4506642ca 100644
--- a/pkg/sentry/fsimpl/overlay/copy_up.go
+++ b/pkg/sentry/fsimpl/overlay/copy_up.go
@@ -22,6 +22,8 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/usermem"
@@ -40,6 +42,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		return nil
 	}
 
+	// Attach our credentials to the context, as some VFS operations use
+	// credentials from context rather an take an explicit creds parameter.
+	ctx = auth.ContextWithCredentials(ctx, d.fs.creds)
+
 	ftype := atomic.LoadUint32(&d.mode) & linux.S_IFMT
 	switch ftype {
 	case linux.S_IFREG, linux.S_IFDIR, linux.S_IFLNK, linux.S_IFBLK, linux.S_IFCHR:
@@ -69,13 +75,28 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		return syserror.ENOENT
 	}
 
-	// Perform copy-up.
+	// Obtain settable timestamps from the lower layer.
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	oldpop := vfs.PathOperation{
+		Root:  d.lowerVDs[0],
+		Start: d.lowerVDs[0],
+	}
+	const timestampsMask = linux.STATX_ATIME | linux.STATX_MTIME
+	oldStat, err := vfsObj.StatAt(ctx, d.fs.creds, &oldpop, &vfs.StatOptions{
+		Mask: timestampsMask,
+	})
+	if err != nil {
+		return err
+	}
+
+	// Perform copy-up.
 	newpop := vfs.PathOperation{
 		Root:  d.parent.upperVD,
 		Start: d.parent.upperVD,
 		Path:  fspath.Parse(d.name),
 	}
+	// Used during copy-up of memory-mapped regular files.
+	var mmapOpts *memmap.MMapOpts
 	cleanupUndoCopyUp := func() {
 		var err error
 		if ftype == linux.S_IFDIR {
@@ -84,15 +105,16 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 			err = vfsObj.UnlinkAt(ctx, d.fs.creds, &newpop)
 		}
 		if err != nil {
-			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err)
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after copy-up error: %v", err))
+		}
+		if d.upperVD.Ok() {
+			d.upperVD.DecRef(ctx)
+			d.upperVD = vfs.VirtualDentry{}
 		}
 	}
 	switch ftype {
 	case linux.S_IFREG:
-		oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  d.lowerVDs[0],
-			Start: d.lowerVDs[0],
-		}, &vfs.OpenOptions{
+		oldFD, err := vfsObj.OpenAt(ctx, d.fs.creds, &oldpop, &vfs.OpenOptions{
 			Flags: linux.O_RDONLY,
 		})
 		if err != nil {
@@ -127,11 +149,32 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 				break
 			}
 		}
+		d.mapsMu.Lock()
+		defer d.mapsMu.Unlock()
+		if d.wrappedMappable != nil {
+			// We may have memory mappings of the file on the lower layer.
+			// Switch to mapping the file on the upper layer instead.
+			mmapOpts = &memmap.MMapOpts{
+				Perms:    usermem.ReadWrite,
+				MaxPerms: usermem.ReadWrite,
+			}
+			if err := newFD.ConfigureMMap(ctx, mmapOpts); err != nil {
+				cleanupUndoCopyUp()
+				return err
+			}
+			if mmapOpts.MappingIdentity != nil {
+				mmapOpts.MappingIdentity.DecRef(ctx)
+			}
+			// Don't actually switch Mappables until the end of copy-up; see
+			// below for why.
+		}
 		if err := newFD.SetStat(ctx, vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_UID | linux.STATX_GID,
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -148,9 +191,11 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		}
 		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_UID | linux.STATX_GID,
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -164,10 +209,7 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		d.upperVD = upperVD
 
 	case linux.S_IFLNK:
-		target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  d.lowerVDs[0],
-			Start: d.lowerVDs[0],
-		})
+		target, err := vfsObj.ReadlinkAt(ctx, d.fs.creds, &oldpop)
 		if err != nil {
 			return err
 		}
@@ -176,10 +218,12 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		}
 		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID,
-				Mode: uint16(d.mode),
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				Mode:  uint16(d.mode),
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -193,25 +237,20 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		d.upperVD = upperVD
 
 	case linux.S_IFBLK, linux.S_IFCHR:
-		lowerStat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  d.lowerVDs[0],
-			Start: d.lowerVDs[0],
-		}, &vfs.StatOptions{})
-		if err != nil {
-			return err
-		}
 		if err := vfsObj.MknodAt(ctx, d.fs.creds, &newpop, &vfs.MknodOptions{
 			Mode:     linux.FileMode(d.mode),
-			DevMajor: lowerStat.RdevMajor,
-			DevMinor: lowerStat.RdevMinor,
+			DevMajor: oldStat.RdevMajor,
+			DevMinor: oldStat.RdevMinor,
 		}); err != nil {
 			return err
 		}
 		if err := vfsObj.SetStatAt(ctx, d.fs.creds, &newpop, &vfs.SetStatOptions{
 			Stat: linux.Statx{
-				Mask: linux.STATX_UID | linux.STATX_GID,
-				UID:  d.uid,
-				GID:  d.gid,
+				Mask:  linux.STATX_UID | linux.STATX_GID | oldStat.Mask&timestampsMask,
+				UID:   d.uid,
+				GID:   d.gid,
+				Atime: oldStat.Atime,
+				Mtime: oldStat.Mtime,
 			},
 		}); err != nil {
 			cleanupUndoCopyUp()
@@ -229,7 +268,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		panic(fmt.Sprintf("unexpected file type %o", ftype))
 	}
 
-	// TODO(gvisor.dev/issue/1199): copy up xattrs
+	if err := d.copyXattrsLocked(ctx); err != nil {
+		cleanupUndoCopyUp()
+		return err
+	}
 
 	// Update the dentry's device and inode numbers (except for directories,
 	// for which these remain overlay-assigned).
@@ -241,14 +283,10 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 			Mask: linux.STATX_INO,
 		})
 		if err != nil {
-			d.upperVD.DecRef(ctx)
-			d.upperVD = vfs.VirtualDentry{}
 			cleanupUndoCopyUp()
 			return err
 		}
 		if upperStat.Mask&linux.STATX_INO == 0 {
-			d.upperVD.DecRef(ctx)
-			d.upperVD = vfs.VirtualDentry{}
 			cleanupUndoCopyUp()
 			return syserror.EREMOTE
 		}
@@ -257,6 +295,135 @@ func (d *dentry) copyUpLocked(ctx context.Context) error {
 		atomic.StoreUint64(&d.ino, upperStat.Ino)
 	}
 
+	if mmapOpts != nil && mmapOpts.Mappable != nil {
+		// Note that if mmapOpts != nil, then d.mapsMu is locked for writing
+		// (from the S_IFREG path above).
+
+		// Propagate mappings of d to the new Mappable. Remember which mappings
+		// we added so we can remove them on failure.
+		upperMappable := mmapOpts.Mappable
+		allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange)
+		for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+			added := make(memmap.MappingsOfRange)
+			for m := range seg.Value() {
+				if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil {
+					for m := range added {
+						upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+					}
+					for mr, mappings := range allAdded {
+						for m := range mappings {
+							upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable)
+						}
+					}
+					return err
+				}
+				added[m] = struct{}{}
+			}
+			allAdded[seg.Range()] = added
+		}
+
+		// Switch to the new Mappable. We do this at the end of copy-up
+		// because:
+		//
+		// - We need to switch Mappables (by changing d.wrappedMappable) before
+		// invalidating Translations from the old Mappable (to pick up
+		// Translations from the new one).
+		//
+		// - We need to lock d.dataMu while changing d.wrappedMappable, but
+		// must invalidate Translations with d.dataMu unlocked (due to lock
+		// ordering).
+		//
+		// - Consequently, once we unlock d.dataMu, other threads may
+		// immediately observe the new (copied-up) Mappable, which we want to
+		// delay until copy-up is guaranteed to succeed.
+		d.dataMu.Lock()
+		lowerMappable := d.wrappedMappable
+		d.wrappedMappable = upperMappable
+		d.dataMu.Unlock()
+		d.lowerMappings.InvalidateAll(memmap.InvalidateOpts{})
+
+		// Remove mappings from the old Mappable.
+		for seg := d.lowerMappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
+			for m := range seg.Value() {
+				lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
+			}
+		}
+		d.lowerMappings.RemoveAll()
+	}
+
 	atomic.StoreUint32(&d.copiedUp, 1)
 	return nil
 }
+
+// copyXattrsLocked copies a subset of lower's extended attributes to upper.
+// Attributes that configure an overlay in the lower are not copied up.
+//
+// Preconditions: d.copyMu must be locked for writing.
+func (d *dentry) copyXattrsLocked(ctx context.Context) error {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	lowerPop := &vfs.PathOperation{Root: d.lowerVDs[0], Start: d.lowerVDs[0]}
+	upperPop := &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}
+
+	lowerXattrs, err := vfsObj.ListXattrAt(ctx, d.fs.creds, lowerPop, 0)
+	if err != nil {
+		if err == syserror.EOPNOTSUPP {
+			// There are no guarantees as to the contents of lowerXattrs.
+			return nil
+		}
+		ctx.Infof("failed to copy up xattrs because ListXattrAt failed: %v", err)
+		return err
+	}
+
+	for _, name := range lowerXattrs {
+		// Do not copy up overlay attributes.
+		if isOverlayXattr(name) {
+			continue
+		}
+
+		value, err := vfsObj.GetXattrAt(ctx, d.fs.creds, lowerPop, &vfs.GetXattrOptions{Name: name, Size: 0})
+		if err != nil {
+			ctx.Infof("failed to copy up xattrs because GetXattrAt failed: %v", err)
+			return err
+		}
+
+		if err := vfsObj.SetXattrAt(ctx, d.fs.creds, upperPop, &vfs.SetXattrOptions{Name: name, Value: value}); err != nil {
+			ctx.Infof("failed to copy up xattrs because SetXattrAt failed: %v", err)
+			return err
+		}
+	}
+	return nil
+}
+
+// copyUpDescendantsLocked ensures that all descendants of d are copied up.
+//
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) copyUpDescendantsLocked(ctx context.Context, ds **[]*dentry) error {
+	dirents, err := d.getDirentsLocked(ctx)
+	if err != nil {
+		return err
+	}
+	for _, dirent := range dirents {
+		if dirent.Name == "." || dirent.Name == ".." {
+			continue
+		}
+		child, err := d.fs.getChildLocked(ctx, d, dirent.Name, ds)
+		if err != nil {
+			return err
+		}
+		if err := child.copyUpLocked(ctx); err != nil {
+			return err
+		}
+		if child.isDir() {
+			child.dirMu.Lock()
+			err := child.copyUpDescendantsLocked(ctx, ds)
+			child.dirMu.Unlock()
+			if err != nil {
+				return err
+			}
+		}
+	}
+	return nil
+}
diff --git a/pkg/sentry/fsimpl/overlay/directory.go b/pkg/sentry/fsimpl/overlay/directory.go
index 6a79f7ffe..df4492346 100644
--- a/pkg/sentry/fsimpl/overlay/directory.go
+++ b/pkg/sentry/fsimpl/overlay/directory.go
@@ -29,7 +29,9 @@ func (d *dentry) isDir() bool {
 	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
 }
 
-// Preconditions: d.dirMu must be locked. d.isDir().
+// Preconditions:
+// * d.dirMu must be locked.
+// * d.isDir().
 func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string]bool, error) {
 	vfsObj := d.fs.vfsfs.VirtualFilesystem()
 	var readdirErr error
@@ -98,12 +100,13 @@ func (d *dentry) collectWhiteoutsForRmdirLocked(ctx context.Context) (map[string
 	return whiteouts, readdirErr
 }
 
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
 	vfs.DentryMetadataFileDescriptionImpl
 
-	mu      sync.Mutex
+	mu      sync.Mutex `state:"nosave"`
 	off     int64
 	dirents []vfs.Dirent
 }
@@ -114,10 +117,12 @@ func (fd *directoryFD) Release(ctx context.Context) {
 
 // IterDirents implements vfs.FileDescriptionImpl.IterDirents.
 func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	d := fd.dentry()
+	defer d.InotifyWithParent(ctx, linux.IN_ACCESS, 0, vfs.PathEvent)
+
 	fd.mu.Lock()
 	defer fd.mu.Unlock()
 
-	d := fd.dentry()
 	if fd.dirents == nil {
 		ds, err := d.getDirents(ctx)
 		if err != nil {
@@ -141,7 +146,14 @@ func (d *dentry) getDirents(ctx context.Context) ([]vfs.Dirent, error) {
 	defer d.fs.renameMu.RUnlock()
 	d.dirMu.Lock()
 	defer d.dirMu.Unlock()
+	return d.getDirentsLocked(ctx)
+}
 
+// Preconditions:
+// * filesystem.renameMu must be locked.
+// * d.dirMu must be locked.
+// * d.isDir().
+func (d *dentry) getDirentsLocked(ctx context.Context) ([]vfs.Dirent, error) {
 	if d.dirents != nil {
 		return d.dirents, nil
 	}
diff --git a/pkg/sentry/fsimpl/overlay/filesystem.go b/pkg/sentry/fsimpl/overlay/filesystem.go
index 86d0164b4..10161a08d 100644
--- a/pkg/sentry/fsimpl/overlay/filesystem.go
+++ b/pkg/sentry/fsimpl/overlay/filesystem.go
@@ -15,6 +15,8 @@
 package overlay
 
 import (
+	"fmt"
+	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -27,10 +29,15 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
+// attributes.
+// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
+const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."
+
 // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
 // opaque directories.
 // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
-const _OVL_XATTR_OPAQUE = "trusted.overlay.opaque"
+const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
 
 func isWhiteout(stat *linux.Statx) bool {
 	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
@@ -110,8 +117,10 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
 // Dentries which may have a reference count of zero, and which therefore
 // should be dropped once traversal is complete, are appended to ds.
 //
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
-// !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
 	if !d.isDir() {
 		return nil, syserror.ENOTDIR
@@ -159,7 +168,9 @@ afterSymlink:
 	return child, nil
 }
 
-// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * d.dirMu must be locked.
 func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
 	if child, ok := parent.children[name]; ok {
 		return child, nil
@@ -177,7 +188,9 @@ func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name s
 	return child, nil
 }
 
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
 	childPath := fspath.Parse(name)
 	child := fs.newDentry()
@@ -199,6 +212,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 			lookupErr = err
 			return false
 		}
+		defer childVD.DecRef(ctx)
 
 		mask := uint32(linux.STATX_TYPE)
 		if !existsOnAnyLayer {
@@ -237,6 +251,7 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 		}
 
 		// Update child to include this layer.
+		childVD.IncRef()
 		if isUpper {
 			child.upperVD = childVD
 			child.copiedUp = 1
@@ -261,10 +276,10 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 
 		// Directories are merged with directories from lower layers if they
 		// are not explicitly opaque.
-		opaqueVal, err := vfsObj.GetxattrAt(ctx, fs.creds, &vfs.PathOperation{
+		opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
 			Root:  childVD,
 			Start: childVD,
-		}, &vfs.GetxattrOptions{
+		}, &vfs.GetXattrOptions{
 			Name: _OVL_XATTR_OPAQUE,
 			Size: 1,
 		})
@@ -287,8 +302,14 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 		child.devMinor = fs.dirDevMinor
 		child.ino = fs.newDirIno()
 	} else if !child.upperVD.Ok() {
+		childDevMinor, err := fs.getLowerDevMinor(child.devMajor, child.devMinor)
+		if err != nil {
+			ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor, child.devMinor, err)
+			child.destroyLocked(ctx)
+			return nil, err
+		}
 		child.devMajor = linux.UNNAMED_MAJOR
-		child.devMinor = fs.lowerDevMinors[child.lowerVDs[0].Mount().Filesystem()]
+		child.devMinor = childDevMinor
 	}
 
 	parent.IncRef()
@@ -300,7 +321,9 @@ func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name str
 // lookupLayerLocked is similar to lookupLocked, but only returns information
 // about the file rather than a dentry.
 //
-// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+// Preconditions:
+// * fs.renameMu must be locked.
+// * parent.dirMu must be locked.
 func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
 	childPath := fspath.Parse(name)
 	lookupLayer := lookupLayerNone
@@ -385,7 +408,9 @@ func (ll lookupLayer) existsInOverlay() bool {
 // rp.Start().Impl().(*dentry)). It does not check that the returned directory
 // is searchable by the provider of rp.
 //
-// Preconditions: fs.renameMu must be locked. !rp.Done().
+// Preconditions:
+// * fs.renameMu must be locked.
+// * !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
 	for !rp.Final() {
 		d.dirMu.Lock()
@@ -425,8 +450,9 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 // doCreateAt checks that creating a file at rp is permitted, then invokes
 // create to do so.
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -479,7 +505,13 @@ func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir
 	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
 		return err
 	}
+
 	parent.dirents = nil
+	ev := linux.IN_CREATE
+	if dir {
+		ev |= linux.IN_ISDIR
+	}
+	parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
 	return nil
 }
 
@@ -493,7 +525,7 @@ func (fs *filesystem) createWhiteout(ctx context.Context, vfsObj *vfs.VirtualFil
 
 func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
 	if err := fs.createWhiteout(ctx, vfsObj, pop); err != nil {
-		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err)
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
 	}
 }
 
@@ -605,12 +637,13 @@ func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
 			}
 			return err
 		}
+		old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
 		return nil
 	})
 }
@@ -644,7 +677,7 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 			}
@@ -654,12 +687,12 @@ func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			// There may be directories on lower layers (previously hidden by
 			// the whiteout) that the new directory should not be merged with.
 			// Mark it opaque to prevent merging.
-			if err := vfsObj.SetxattrAt(ctx, fs.creds, &pop, &vfs.SetxattrOptions{
+			if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
 				Name:  _OVL_XATTR_OPAQUE,
 				Value: "y",
 			}); err != nil {
 				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
-					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr)
+					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
 				} else {
 					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 				}
@@ -703,7 +736,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 			}
@@ -732,10 +765,13 @@ func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 
 	start := rp.Start().Impl().(*dentry)
 	if rp.Done() {
+		if mayCreate && rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		if mayWrite {
+		if start.isRegularFile() && mayWrite {
 			if err := start.copyUpLocked(ctx); err != nil {
 				return nil, err
 			}
@@ -755,6 +791,10 @@ afterTrailingSymlink:
 	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
 		return nil, err
 	}
+	// Reject attempts to open directories with O_CREAT.
+	if mayCreate && rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
 	// Determine whether or not we need to create a file.
 	parent.dirMu.Lock()
 	child, err := fs.stepLocked(ctx, rp, parent, false /* mayFollowSymlinks */, &ds)
@@ -763,12 +803,11 @@ afterTrailingSymlink:
 		parent.dirMu.Unlock()
 		return fd, err
 	}
+	parent.dirMu.Unlock()
 	if err != nil {
-		parent.dirMu.Unlock()
 		return nil, err
 	}
 	// Open existing child or follow symlink.
-	parent.dirMu.Unlock()
 	if mustCreate {
 		return nil, syserror.EEXIST
 	}
@@ -783,7 +822,10 @@ afterTrailingSymlink:
 		start = parent
 		goto afterTrailingSymlink
 	}
-	if mayWrite {
+	if rp.MustBeDir() && !child.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if child.isRegularFile() && mayWrite {
 		if err := child.copyUpLocked(ctx); err != nil {
 			return nil, err
 		}
@@ -836,8 +878,11 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
 	if err != nil {
 		return nil, err
 	}
+	if ftype != linux.S_IFREG {
+		return layerFD, nil
+	}
 	layerFlags := layerFD.StatusFlags()
-	fd := &nonDirectoryFD{
+	fd := &regularFileFD{
 		copiedUp:    isUpper,
 		cachedFD:    layerFD,
 		cachedFlags: layerFlags,
@@ -851,8 +896,9 @@ func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *
 	return &fd.vfsfd, nil
 }
 
-// Preconditions: parent.dirMu must be locked. parent does not already contain
-// a child named rp.Component().
+// Preconditions:
+// * parent.dirMu must be locked.
+// * parent does not already contain a child named rp.Component().
 func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
 	creds := rp.Credentials()
 	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
@@ -913,7 +959,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 		},
 	}); err != nil {
 		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr)
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
 		} else if haveUpperWhiteout {
 			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 		}
@@ -924,7 +970,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 	child, err := fs.getChildLocked(ctx, parent, childName, ds)
 	if err != nil {
 		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-			ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr)
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
 		} else if haveUpperWhiteout {
 			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 		}
@@ -932,7 +978,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 	}
 	// Finally construct the overlay FD.
 	upperFlags := upperFD.StatusFlags()
-	fd := &nonDirectoryFD{
+	fd := &regularFileFD{
 		copiedUp:    true,
 		cachedFD:    upperFD,
 		cachedFlags: upperFlags,
@@ -945,6 +991,7 @@ func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.Resolving
 		// just can't open it anymore for some reason.
 		return nil, err
 	}
+	parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */)
 	return &fd.vfsfd, nil
 }
 
@@ -990,9 +1037,224 @@ func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldPa
 	}
 	defer mnt.EndWrite()
 
-	// FIXME(gvisor.dev/issue/1199): Actually implement rename.
-	_ = newParent
-	return syserror.EXDEV
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	creds := rp.Credentials()
+	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+		return err
+	}
+	// We need a dentry representing the renamed file since, if it's a
+	// directory, we need to check for write permission on it.
+	oldParent.dirMu.Lock()
+	defer oldParent.dirMu.Unlock()
+	renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
+	if err != nil {
+		return err
+	}
+	if err := vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&oldParent.mode)), auth.KUID(atomic.LoadUint32(&renamed.uid))); err != nil {
+		return err
+	}
+	if renamed.isDir() {
+		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if oldParent != newParent {
+		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
+			return err
+		}
+		newParent.dirMu.Lock()
+		defer newParent.dirMu.Unlock()
+	}
+	if newParent.vfsd.IsDead() {
+		return syserror.ENOENT
+	}
+	replacedLayer, err := fs.lookupLayerLocked(ctx, newParent, newName)
+	if err != nil {
+		return err
+	}
+	var (
+		replaced     *dentry
+		replacedVFSD *vfs.Dentry
+		whiteouts    map[string]bool
+	)
+	if replacedLayer.existsInOverlay() {
+		replaced, err = fs.getChildLocked(ctx, newParent, newName, &ds)
+		if err != nil {
+			return err
+		}
+		replacedVFSD = &replaced.vfsd
+		if replaced.isDir() {
+			if !renamed.isDir() {
+				return syserror.EISDIR
+			}
+			if genericIsAncestorDentry(replaced, renamed) {
+				return syserror.ENOTEMPTY
+			}
+			replaced.dirMu.Lock()
+			defer replaced.dirMu.Unlock()
+			whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
+			if err != nil {
+				return err
+			}
+		} else {
+			if rp.MustBeDir() || renamed.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	}
+
+	if oldParent == newParent && oldName == newName {
+		return nil
+	}
+
+	// renamed and oldParent need to be copied-up before they're renamed on the
+	// upper layer.
+	if err := renamed.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// If renamed is a directory, all of its descendants need to be copied-up
+	// before they're renamed on the upper layer.
+	if renamed.isDir() {
+		if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
+			return err
+		}
+	}
+	// newParent must be copied-up before it can contain renamed on the upper
+	// layer.
+	if err := newParent.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// If replaced exists, it doesn't need to be copied-up, but we do need to
+	// serialize with copy-up. Holding renameMu for writing should be
+	// sufficient, but out of an abundance of caution...
+	if replaced != nil {
+		replaced.copyMu.RLock()
+		defer replaced.copyMu.RUnlock()
+	}
+
+	vfsObj := rp.VirtualFilesystem()
+	mntns := vfs.MountNamespaceFromContext(ctx)
+	defer mntns.DecRef(ctx)
+	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
+		return err
+	}
+
+	newpop := vfs.PathOperation{
+		Root:  newParent.upperVD,
+		Start: newParent.upperVD,
+		Path:  fspath.Parse(newName),
+	}
+
+	needRecreateWhiteouts := false
+	cleanupRecreateWhiteouts := func() {
+		if !needRecreateWhiteouts {
+			return
+		}
+		for whiteoutName, whiteoutUpper := range whiteouts {
+			if !whiteoutUpper {
+				continue
+			}
+			if err := fs.createWhiteout(ctx, vfsObj, &vfs.PathOperation{
+				Root:  replaced.upperVD,
+				Start: replaced.upperVD,
+				Path:  fspath.Parse(whiteoutName),
+			}); err != nil && err != syserror.EEXIST {
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
+			}
+		}
+	}
+	if renamed.isDir() {
+		if replacedLayer == lookupLayerUpper {
+			// Remove whiteouts from the directory being replaced.
+			needRecreateWhiteouts = true
+			for whiteoutName, whiteoutUpper := range whiteouts {
+				if !whiteoutUpper {
+					continue
+				}
+				if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
+					Root:  replaced.upperVD,
+					Start: replaced.upperVD,
+					Path:  fspath.Parse(whiteoutName),
+				}); err != nil {
+					cleanupRecreateWhiteouts()
+					vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+					return err
+				}
+			}
+		} else if replacedLayer == lookupLayerUpperWhiteout {
+			// We need to explicitly remove the whiteout since otherwise rename
+			// on the upper layer will fail with ENOTDIR.
+			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
+				vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+				return err
+			}
+		}
+	}
+
+	// Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
+	// regular rename and create the whiteout at the origin manually. Unlike
+	// RENAME_WHITEOUT, this isn't atomic with respect to other users of the
+	// upper filesystem, but this is already the case for virtually all other
+	// overlay filesystem operations too.
+	oldpop := vfs.PathOperation{
+		Root:  oldParent.upperVD,
+		Start: oldParent.upperVD,
+		Path:  fspath.Parse(oldName),
+	}
+	if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil {
+		cleanupRecreateWhiteouts()
+		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
+		return err
+	}
+
+	// Below this point, the renamed dentry is now at newpop, and anything we
+	// replaced is gone forever. Commit the rename, update the overlay
+	// filesystem tree, and abandon attempts to recover from errors.
+	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
+	delete(oldParent.children, oldName)
+	if replaced != nil {
+		ds = appendDentry(ds, replaced)
+	}
+	if oldParent != newParent {
+		newParent.dirents = nil
+		// This can't drop the last reference on oldParent because one is held
+		// by oldParentVD, so lock recursion is impossible.
+		oldParent.DecRef(ctx)
+		ds = appendDentry(ds, oldParent)
+		newParent.IncRef()
+		renamed.parent = newParent
+	}
+	renamed.name = newName
+	if newParent.children == nil {
+		newParent.children = make(map[string]*dentry)
+	}
+	newParent.children[newName] = renamed
+	oldParent.dirents = nil
+
+	if err := fs.createWhiteout(ctx, vfsObj, &oldpop); err != nil {
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
+	}
+	if renamed.isDir() {
+		if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
+			Name:  _OVL_XATTR_OPAQUE,
+			Value: "y",
+		}); err != nil {
+			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
+		}
+	}
+
+	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
+	return nil
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
@@ -1040,6 +1302,9 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	if !child.isDir() {
 		return syserror.ENOTDIR
 	}
+	if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(atomic.LoadUint32(&parent.mode)), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+		return err
+	}
 	child.dirMu.Lock()
 	defer child.dirMu.Unlock()
 	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
@@ -1071,7 +1336,7 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 					Start: child.upperVD,
 					Path:  fspath.Parse(whiteoutName),
 				}); err != nil && err != syserror.EEXIST {
-					ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err)
+					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
 				}
 			}
 		}
@@ -1101,15 +1366,14 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		// Don't attempt to recover from this: the original directory is
 		// already gone, so any dentries representing it are invalid, and
 		// creating a new directory won't undo that.
-		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err)
-		vfsObj.AbortDeleteDentry(&child.vfsd)
-		return err
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during RmdirAt: %v", err))
 	}
 
 	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
 	delete(parent.children, name)
 	ds = appendDentry(ds, child)
 	parent.dirents = nil
+	parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */)
 	return nil
 }
 
@@ -1117,12 +1381,25 @@ func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
 	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
 		return err
 	}
+	err = d.setStatLocked(ctx, rp, opts)
+	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if err != nil {
+		return err
+	}
+
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent)
+	}
+	return nil
+}
 
+// Precondition: d.fs.renameMu must be held for reading.
+func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
 	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
 		return err
@@ -1217,7 +1494,7 @@ func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, targ
 			},
 		}); err != nil {
 			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
-				ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr)
+				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
 			} else if haveUpperWhiteout {
 				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
 			}
@@ -1263,12 +1540,38 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		return err
 	}
 
+	parentMode := atomic.LoadUint32(&parent.mode)
 	child := parent.children[name]
 	var childLayer lookupLayer
+	if child == nil {
+		if parentMode&linux.S_ISVTX != 0 {
+			// If the parent's sticky bit is set, we need a child dentry to get
+			// its owner.
+			child, err = fs.getChildLocked(ctx, parent, name, &ds)
+			if err != nil {
+				return err
+			}
+		} else {
+			// Determine if the file being unlinked actually exists. Holding
+			// parent.dirMu prevents a dentry from being instantiated for the file,
+			// which in turn prevents it from being copied-up, so this result is
+			// stable.
+			childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
+			if err != nil {
+				return err
+			}
+			if !childLayer.existsInOverlay() {
+				return syserror.ENOENT
+			}
+		}
+	}
 	if child != nil {
 		if child.isDir() {
 			return syserror.EISDIR
 		}
+		if err := vfs.CheckDeleteSticky(rp.Credentials(), linux.FileMode(parentMode), auth.KUID(atomic.LoadUint32(&child.uid))); err != nil {
+			return err
+		}
 		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
 			return err
 		}
@@ -1281,18 +1584,6 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		} else {
 			childLayer = lookupLayerLower
 		}
-	} else {
-		// Determine if the file being unlinked actually exists. Holding
-		// parent.dirMu prevents a dentry from being instantiated for the file,
-		// which in turn prevents it from being copied-up, so this result is
-		// stable.
-		childLayer, err = fs.lookupLayerLocked(ctx, parent, name)
-		if err != nil {
-			return err
-		}
-		if !childLayer.existsInOverlay() {
-			return syserror.ENOENT
-		}
 	}
 
 	pop := vfs.PathOperation{
@@ -1310,70 +1601,175 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 		}
 	}
 	if err := fs.createWhiteout(ctx, vfsObj, &pop); err != nil {
-		ctx.Warningf("Unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err)
-		if child != nil {
-			vfsObj.AbortDeleteDentry(&child.vfsd)
-		}
-		return err
+		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout during UnlinkAt: %v", err))
 	}
 
+	var cw *vfs.Watches
 	if child != nil {
 		vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
 		delete(parent.children, name)
 		ds = appendDentry(ds, child)
+		cw = &child.watches
 	}
+	vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
 	parent.dirents = nil
 	return nil
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// isOverlayXattr returns whether the given extended attribute configures the
+// overlay.
+func isOverlayXattr(name string) bool {
+	return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
+}
+
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return nil, err
 	}
-	// TODO(gvisor.dev/issue/1199): Linux overlayfs actually allows listxattr,
-	// but not any other xattr syscalls. For now we just reject all of them.
-	return nil, syserror.ENOTSUP
+
+	return fs.listXattr(ctx, d, size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	top := d.topLayer()
+	names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
+	if err != nil {
+		return nil, err
+	}
+
+	// Filter out all overlay attributes.
+	n := 0
+	for _, name := range names {
+		if !isOverlayXattr(name) {
+			names[n] = name
+			n++
+		}
+	}
+	return names[:n], err
+}
+
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
 		return "", err
 	}
-	return "", syserror.ENOTSUP
+
+	return fs.getXattr(ctx, d, rp.Credentials(), &opts)
+}
+
+func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
+		return "", err
+	}
+
+	// Return EOPNOTSUPP when fetching an overlay attribute.
+	// See fs/overlayfs/super.c:ovl_own_xattr_get().
+	if isOverlayXattr(opts.Name) {
+		return "", syserror.EOPNOTSUPP
+	}
+
+	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	top := d.topLayer()
+	return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+		return err
+	}
+
+	err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
+	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+	return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
+		return err
+	}
+
+	// Return EOPNOTSUPP when setting an overlay attribute.
+	// See fs/overlayfs/super.c:ovl_own_xattr_set().
+	if isOverlayXattr(opts.Name) {
+		return syserror.EOPNOTSUPP
+	}
+
+	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
 		return err
 	}
-	return syserror.ENOTSUP
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
-	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
-	_, err := fs.resolveLocked(ctx, rp, &ds)
+	d, err := fs.resolveLocked(ctx, rp, &ds)
 	if err != nil {
+		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+		return err
+	}
+
+	err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
+	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
+	return nil
+}
+
+// Precondition: fs.renameMu must be locked.
+func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
+	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
+		return err
+	}
+
+	// Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
+	// Linux passes the remove request to xattr_handler->set.
+	// See fs/xattr.c:vfs_removexattr().
+	if isOverlayXattr(name) {
+		return syserror.EOPNOTSUPP
+	}
+
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
 		return err
 	}
-	return syserror.ENOTSUP
+	vfsObj := d.fs.vfsfs.VirtualFilesystem()
+	return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
 }
 
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
diff --git a/pkg/sentry/fsimpl/overlay/non_directory.go b/pkg/sentry/fsimpl/overlay/non_directory.go
deleted file mode 100644
index d3060a481..000000000
--- a/pkg/sentry/fsimpl/overlay/non_directory.go
+++ /dev/null
@@ -1,266 +0,0 @@
-// Copyright 2020 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package overlay
-
-import (
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/memmap"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/sync"
-	"gvisor.dev/gvisor/pkg/usermem"
-)
-
-func (d *dentry) isSymlink() bool {
-	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
-}
-
-func (d *dentry) readlink(ctx context.Context) (string, error) {
-	layerVD := d.topLayer()
-	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
-		Root:  layerVD,
-		Start: layerVD,
-	})
-}
-
-type nonDirectoryFD struct {
-	fileDescription
-
-	// If copiedUp is false, cachedFD represents
-	// fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
-	// fileDescription.dentry().upperVD. cachedFlags is the last known value of
-	// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
-	// protected by mu.
-	mu          sync.Mutex
-	copiedUp    bool
-	cachedFD    *vfs.FileDescription
-	cachedFlags uint32
-}
-
-func (fd *nonDirectoryFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return nil, err
-	}
-	wrappedFD.IncRef()
-	return wrappedFD, nil
-}
-
-func (fd *nonDirectoryFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
-	d := fd.dentry()
-	statusFlags := fd.vfsfd.StatusFlags()
-	if !fd.copiedUp && d.isCopiedUp() {
-		// Switch to the copied-up file.
-		upperVD := d.topLayer()
-		upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
-			Root:  upperVD,
-			Start: upperVD,
-		}, &vfs.OpenOptions{
-			Flags: statusFlags,
-		})
-		if err != nil {
-			return nil, err
-		}
-		oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR)
-		if oldOffErr == nil {
-			if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
-				upperFD.DecRef(ctx)
-				return nil, err
-			}
-		}
-		fd.cachedFD.DecRef(ctx)
-		fd.copiedUp = true
-		fd.cachedFD = upperFD
-		fd.cachedFlags = statusFlags
-	} else if fd.cachedFlags != statusFlags {
-		if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
-			return nil, err
-		}
-		fd.cachedFlags = statusFlags
-	}
-	return fd.cachedFD, nil
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *nonDirectoryFD) Release(ctx context.Context) {
-	fd.cachedFD.DecRef(ctx)
-	fd.cachedFD = nil
-}
-
-// OnClose implements vfs.FileDescriptionImpl.OnClose.
-func (fd *nonDirectoryFD) OnClose(ctx context.Context) error {
-	// Linux doesn't define ovl_file_operations.flush at all (i.e. its
-	// equivalent to OnClose is a no-op). We pass through to
-	// fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
-	// copied-up, since OnClose is mostly used to define post-close writeback,
-	// and if fd.cachedFD hasn't been updated then it can't have been used to
-	// mutate fd.dentry() anyway.
-	fd.mu.Lock()
-	if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
-		if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
-			fd.mu.Unlock()
-			return err
-		}
-		fd.cachedFlags = statusFlags
-	}
-	wrappedFD := fd.cachedFD
-	defer wrappedFD.IncRef()
-	fd.mu.Unlock()
-	return wrappedFD.OnClose(ctx)
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (fd *nonDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	var stat linux.Statx
-	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
-		wrappedFD, err := fd.getCurrentFD(ctx)
-		if err != nil {
-			return linux.Statx{}, err
-		}
-		stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{
-			Mask: layerMask,
-			Sync: opts.Sync,
-		})
-		wrappedFD.DecRef(ctx)
-		if err != nil {
-			return linux.Statx{}, err
-		}
-	}
-	fd.dentry().statInternalTo(ctx, &opts, &stat)
-	return stat, nil
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (fd *nonDirectoryFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	d := fd.dentry()
-	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
-	if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
-		return err
-	}
-	mnt := fd.vfsfd.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	if err := d.copyUpLocked(ctx); err != nil {
-		return err
-	}
-	// Changes to d's attributes are serialized by d.copyMu.
-	d.copyMu.Lock()
-	defer d.copyMu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return err
-	}
-	if err := wrappedFD.SetStat(ctx, opts); err != nil {
-		return err
-	}
-	d.updateAfterSetStatLocked(&opts)
-	return nil
-}
-
-// StatFS implements vfs.FileDescriptionImpl.StatFS.
-func (fd *nonDirectoryFD) StatFS(ctx context.Context) (linux.Statfs, error) {
-	return fd.filesystem().statFS(ctx)
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *nonDirectoryFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	wrappedFD, err := fd.getCurrentFD(ctx)
-	if err != nil {
-		return 0, err
-	}
-	defer wrappedFD.DecRef(ctx)
-	return wrappedFD.PRead(ctx, dst, offset, opts)
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *nonDirectoryFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	// Hold fd.mu during the read to serialize the file offset.
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return 0, err
-	}
-	return wrappedFD.Read(ctx, dst, opts)
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *nonDirectoryFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	wrappedFD, err := fd.getCurrentFD(ctx)
-	if err != nil {
-		return 0, err
-	}
-	defer wrappedFD.DecRef(ctx)
-	return wrappedFD.PWrite(ctx, src, offset, opts)
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *nonDirectoryFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	// Hold fd.mu during the write to serialize the file offset.
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return 0, err
-	}
-	return wrappedFD.Write(ctx, src, opts)
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *nonDirectoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	// Hold fd.mu during the seek to serialize the file offset.
-	fd.mu.Lock()
-	defer fd.mu.Unlock()
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		return 0, err
-	}
-	return wrappedFD.Seek(ctx, offset, whence)
-}
-
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *nonDirectoryFD) Sync(ctx context.Context) error {
-	fd.mu.Lock()
-	if !fd.dentry().isCopiedUp() {
-		fd.mu.Unlock()
-		return nil
-	}
-	wrappedFD, err := fd.currentFDLocked(ctx)
-	if err != nil {
-		fd.mu.Unlock()
-		return err
-	}
-	wrappedFD.IncRef()
-	defer wrappedFD.DecRef(ctx)
-	fd.mu.Unlock()
-	return wrappedFD.Sync(ctx)
-}
-
-// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
-func (fd *nonDirectoryFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
-	wrappedFD, err := fd.getCurrentFD(ctx)
-	if err != nil {
-		return err
-	}
-	defer wrappedFD.DecRef(ctx)
-	return wrappedFD.ConfigureMMap(ctx, opts)
-}
diff --git a/pkg/sentry/fsimpl/overlay/overlay.go b/pkg/sentry/fsimpl/overlay/overlay.go
index 75cc006bf..c812f0a70 100644
--- a/pkg/sentry/fsimpl/overlay/overlay.go
+++ b/pkg/sentry/fsimpl/overlay/overlay.go
@@ -18,10 +18,15 @@
 //
 // Lock order:
 //
-// directoryFD.mu / nonDirectoryFD.mu
+// directoryFD.mu / regularFileFD.mu
 //   filesystem.renameMu
 //     dentry.dirMu
 //       dentry.copyMu
+//         filesystem.devMu
+//         *** "memmap.Mappable locks" below this point
+//         dentry.mapsMu
+//           *** "memmap.Mappable locks taken by Translate" below this point
+//           dentry.dataMu
 //
 // Locking dentry.dirMu in multiple dentries requires that parent dentries are
 // locked before child dentries, and that filesystem.renameMu is locked to
@@ -29,14 +34,17 @@
 package overlay
 
 import (
+	"fmt"
 	"strings"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -46,6 +54,8 @@ import (
 const Name = "overlay"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // Name implements vfs.FilesystemType.Name.
@@ -53,8 +63,13 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
 // FilesystemType.GetFilesystem.
+//
+// +stateify savable
 type FilesystemOptions struct {
 	// Callers passing FilesystemOptions to
 	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
@@ -71,6 +86,8 @@ type FilesystemOptions struct {
 }
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
@@ -85,97 +102,93 @@ type filesystem struct {
 	// is immutable.
 	dirDevMinor uint32
 
-	// lowerDevMinors maps lower layer filesystems to device minor numbers
-	// assigned to non-directory files originating from that filesystem.
-	// lowerDevMinors is immutable.
-	lowerDevMinors map[*vfs.Filesystem]uint32
+	// lowerDevMinors maps device numbers from lower layer filesystems to
+	// device minor numbers assigned to non-directory files originating from
+	// that filesystem. (This remapping is necessary for lower layers because a
+	// file on a lower layer, and that same file on an overlay, are
+	// distinguishable because they will diverge after copy-up; this isn't true
+	// for non-directory files already on the upper layer.) lowerDevMinors is
+	// protected by devMu.
+	devMu          sync.Mutex `state:"nosave"`
+	lowerDevMinors map[layerDevNumber]uint32
 
 	// renameMu synchronizes renaming with non-renaming operations in order to
 	// ensure consistent lock ordering between dentry.dirMu in different
 	// dentries.
-	renameMu sync.RWMutex
+	renameMu sync.RWMutex `state:"nosave"`
 
 	// lastDirIno is the last inode number assigned to a directory. lastDirIno
 	// is accessed using atomic memory operations.
 	lastDirIno uint64
 }
 
+// +stateify savable
+type layerDevNumber struct {
+	major uint32
+	minor uint32
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	mopts := vfs.GenericParseMountOptions(opts.Data)
 	fsoptsRaw := opts.InternalData
-	fsopts, haveFSOpts := fsoptsRaw.(FilesystemOptions)
-	if fsoptsRaw != nil && !haveFSOpts {
-		ctx.Warningf("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
+	fsopts, ok := fsoptsRaw.(FilesystemOptions)
+	if fsoptsRaw != nil && !ok {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
 		return nil, nil, syserror.EINVAL
 	}
-	if haveFSOpts {
-		if len(fsopts.LowerRoots) == 0 {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: LowerRoots must be non-empty")
+	vfsroot := vfs.RootFromContext(ctx)
+	if vfsroot.Ok() {
+		defer vfsroot.DecRef(ctx)
+	}
+
+	if upperPathname, ok := mopts["upperdir"]; ok {
+		if fsopts.UpperRoot.Ok() {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified")
 			return nil, nil, syserror.EINVAL
 		}
-		if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two LowerRoots are required when UpperRoot is unspecified")
+		delete(mopts, "upperdir")
+		// Linux overlayfs also requires a workdir when upperdir is
+		// specified; we don't, so silently ignore this option.
+		delete(mopts, "workdir")
+		upperPath := fspath.Parse(upperPathname)
+		if !upperPath.Absolute {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
 			return nil, nil, syserror.EINVAL
 		}
-		// We don't enforce a maximum number of lower layers when not
-		// configured by applications; the sandbox owner can have an overlay
-		// filesystem with any number of lower layers.
-	} else {
-		vfsroot := vfs.RootFromContext(ctx)
-		defer vfsroot.DecRef(ctx)
-		upperPathname, ok := mopts["upperdir"]
-		if ok {
-			delete(mopts, "upperdir")
-			// Linux overlayfs also requires a workdir when upperdir is
-			// specified; we don't, so silently ignore this option.
-			delete(mopts, "workdir")
-			upperPath := fspath.Parse(upperPathname)
-			if !upperPath.Absolute {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
-				return nil, nil, syserror.EINVAL
-			}
-			upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
-				Root:               vfsroot,
-				Start:              vfsroot,
-				Path:               upperPath,
-				FollowFinalSymlink: true,
-			}, &vfs.GetDentryOptions{
-				CheckSearchable: true,
-			})
-			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
-				return nil, nil, err
-			}
-			defer upperRoot.DecRef(ctx)
-			privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
-			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
-				return nil, nil, err
-			}
-			defer privateUpperRoot.DecRef(ctx)
-			fsopts.UpperRoot = privateUpperRoot
+		upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
+			Root:               vfsroot,
+			Start:              vfsroot,
+			Path:               upperPath,
+			FollowFinalSymlink: true,
+		}, &vfs.GetDentryOptions{
+			CheckSearchable: true,
+		})
+		if err != nil {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
+			return nil, nil, err
+		}
+		privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
+		upperRoot.DecRef(ctx)
+		if err != nil {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
+			return nil, nil, err
 		}
-		lowerPathnamesStr, ok := mopts["lowerdir"]
-		if !ok {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: missing required option lowerdir")
+		defer privateUpperRoot.DecRef(ctx)
+		fsopts.UpperRoot = privateUpperRoot
+	}
+
+	if lowerPathnamesStr, ok := mopts["lowerdir"]; ok {
+		if len(fsopts.LowerRoots) != 0 {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified")
 			return nil, nil, syserror.EINVAL
 		}
 		delete(mopts, "lowerdir")
 		lowerPathnames := strings.Split(lowerPathnamesStr, ":")
-		const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
-		if len(lowerPathnames) < 2 && !fsopts.UpperRoot.Ok() {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: at least two lowerdirs are required when upperdir is unspecified")
-			return nil, nil, syserror.EINVAL
-		}
-		if len(lowerPathnames) > maxLowerLayers {
-			ctx.Warningf("overlay.FilesystemType.GetFilesystem: %d lowerdirs specified, maximum %d", len(lowerPathnames), maxLowerLayers)
-			return nil, nil, syserror.EINVAL
-		}
 		for _, lowerPathname := range lowerPathnames {
 			lowerPath := fspath.Parse(lowerPathname)
 			if !lowerPath.Absolute {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
+				ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
 				return nil, nil, syserror.EINVAL
 			}
 			lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
@@ -187,44 +200,44 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 				CheckSearchable: true,
 			})
 			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
+				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
 				return nil, nil, err
 			}
-			defer lowerRoot.DecRef(ctx)
 			privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
+			lowerRoot.DecRef(ctx)
 			if err != nil {
-				ctx.Warningf("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
+				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
 				return nil, nil, err
 			}
 			defer privateLowerRoot.DecRef(ctx)
 			fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
 		}
 	}
+
 	if len(mopts) != 0 {
-		ctx.Warningf("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
+		return nil, nil, syserror.EINVAL
+	}
+
+	if len(fsopts.LowerRoots) == 0 {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required")
+		return nil, nil, syserror.EINVAL
+	}
+	if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present")
+		return nil, nil, syserror.EINVAL
+	}
+	const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
+	if len(fsopts.LowerRoots) > maxLowerLayers {
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers)
 		return nil, nil, syserror.EINVAL
 	}
 
-	// Allocate device numbers.
+	// Allocate dirDevMinor. lowerDevMinors are allocated dynamically.
 	dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
 	if err != nil {
 		return nil, nil, err
 	}
-	lowerDevMinors := make(map[*vfs.Filesystem]uint32)
-	for _, lowerRoot := range fsopts.LowerRoots {
-		lowerFS := lowerRoot.Mount().Filesystem()
-		if _, ok := lowerDevMinors[lowerFS]; !ok {
-			devMinor, err := vfsObj.GetAnonBlockDevMinor()
-			if err != nil {
-				vfsObj.PutAnonBlockDevMinor(dirDevMinor)
-				for _, lowerDevMinor := range lowerDevMinors {
-					vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
-				}
-				return nil, nil, err
-			}
-			lowerDevMinors[lowerFS] = devMinor
-		}
-	}
 
 	// Take extra references held by the filesystem.
 	if fsopts.UpperRoot.Ok() {
@@ -238,7 +251,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		opts:           fsopts,
 		creds:          creds.Fork(),
 		dirDevMinor:    dirDevMinor,
-		lowerDevMinors: lowerDevMinors,
+		lowerDevMinors: make(map[layerDevNumber]uint32),
 	}
 	fs.vfsfs.Init(vfsObj, &fstype, fs)
 
@@ -274,7 +287,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, syserror.EREMOTE
 	}
 	if isWhiteout(&rootStat) {
-		ctx.Warningf("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
+		ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
 		root.destroyLocked(ctx)
 		fs.vfsfs.DecRef(ctx)
 		return nil, nil, syserror.EINVAL
@@ -288,7 +301,14 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		root.ino = fs.newDirIno()
 	} else if !root.upperVD.Ok() {
 		root.devMajor = linux.UNNAMED_MAJOR
-		root.devMinor = fs.lowerDevMinors[root.lowerVDs[0].Mount().Filesystem()]
+		rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor)
+		if err != nil {
+			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err)
+			root.destroyLocked(ctx)
+			fs.vfsfs.DecRef(ctx)
+			return nil, nil, err
+		}
+		root.devMinor = rootDevMinor
 		root.ino = rootStat.Ino
 	} else {
 		root.devMajor = rootStat.DevMajor
@@ -315,7 +335,11 @@ func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forc
 	if err != nil {
 		return vfs.VirtualDentry{}, err
 	}
-	return vfs.MakeVirtualDentry(newmnt, vd.Dentry()), nil
+	// Take a reference on the dentry which will be owned by the returned
+	// VirtualDentry.
+	d := vd.Dentry()
+	d.IncRef()
+	return vfs.MakeVirtualDentry(newmnt, d), nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -357,7 +381,24 @@ func (fs *filesystem) newDirIno() uint64 {
 	return atomic.AddUint64(&fs.lastDirIno, 1)
 }
 
+func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) {
+	fs.devMu.Lock()
+	defer fs.devMu.Unlock()
+	orig := layerDevNumber{layerMajor, layerMinor}
+	if minor, ok := fs.lowerDevMinors[orig]; ok {
+		return minor, nil
+	}
+	minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor()
+	if err != nil {
+		return 0, err
+	}
+	fs.lowerDevMinors[orig] = minor
+	return minor, nil
+}
+
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -390,7 +431,7 @@ type dentry struct {
 	// and dirents (if not nil) is a cache of dirents as returned by
 	// directoryFDs representing this directory. children is protected by
 	// dirMu.
-	dirMu    sync.Mutex
+	dirMu    sync.Mutex `state:"nosave"`
 	children map[string]*dentry
 	dirents  []vfs.Dirent
 
@@ -400,7 +441,7 @@ type dentry struct {
 	// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
 	// be copied up) with copyMu locked for writing; otherwise, it is
 	// immutable. lowerVDs is always immutable.
-	copyMu   sync.RWMutex
+	copyMu   sync.RWMutex `state:"nosave"`
 	upperVD  vfs.VirtualDentry
 	lowerVDs []vfs.VirtualDentry
 
@@ -415,7 +456,43 @@ type dentry struct {
 	devMinor uint32
 	ino      uint64
 
+	// If this dentry represents a regular file, then:
+	//
+	// - mapsMu is used to synchronize between copy-up and memmap.Mappable
+	// methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
+	//
+	// - dataMu is used to synchronize between copy-up and
+	// dentry.(memmap.Mappable).Translate.
+	//
+	// - lowerMappings tracks memory mappings of the file. lowerMappings is
+	// used to invalidate mappings of the lower layer when the file is copied
+	// up to ensure that they remain coherent with subsequent writes to the
+	// file. (Note that, as of this writing, Linux overlayfs does not do this;
+	// this feature is a gVisor extension.) lowerMappings is protected by
+	// mapsMu.
+	//
+	// - If this dentry is copied-up, then wrappedMappable is the Mappable
+	// obtained from a call to the current top layer's
+	// FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
+	// (from a call to regularFileFD.ensureMappable()), it cannot become nil.
+	// wrappedMappable is protected by mapsMu and dataMu.
+	//
+	// - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
+	// accessed using atomic memory operations.
+	mapsMu          sync.Mutex `state:"nosave"`
+	lowerMappings   memmap.MappingSet
+	dataMu          sync.RWMutex `state:"nosave"`
+	wrappedMappable memmap.Mappable
+	isMappable      uint32
+
 	locks vfs.FileLocks
+
+	// watches is the set of inotify watches on the file repesented by this dentry.
+	//
+	// Note that hard links to the same file will not share the same set of
+	// watches, due to the fact that we do not have inode structures in this
+	// overlay implementation.
+	watches vfs.Watches
 }
 
 // newDentry creates a new dentry. The dentry initially has no references; it
@@ -428,6 +505,9 @@ func (fs *filesystem) newDentry() *dentry {
 	}
 	d.lowerVDs = d.inlineLowerVDs[:0]
 	d.vfsd.Init(d)
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(d, "overlay.dentry")
+	}
 	return d
 }
 
@@ -475,6 +555,14 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 	if atomic.LoadInt64(&d.refs) != 0 {
 		return
 	}
+
+	// Make sure that we do not lose watches on dentries that have not been
+	// deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so
+	// d.vfsd.IsDead() indicates that d was deleted.
+	if !d.vfsd.IsDead() && d.watches.Size() > 0 {
+		return
+	}
+
 	// Refs is still zero; destroy it.
 	d.destroyLocked(ctx)
 	return
@@ -482,7 +570,9 @@ func (d *dentry) checkDropLocked(ctx context.Context) {
 
 // destroyLocked destroys the dentry.
 //
-// Preconditions: d.fs.renameMu must be locked for writing. d.refs == 0.
+// Preconditions:
+// * d.fs.renameMu must be locked for writing.
+// * d.refs == 0.
 func (d *dentry) destroyLocked(ctx context.Context) {
 	switch atomic.LoadInt64(&d.refs) {
 	case 0:
@@ -501,6 +591,8 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 		lowerVD.DecRef(ctx)
 	}
 
+	d.watches.HandleDeletion(ctx)
+
 	if d.parent != nil {
 		d.parent.dirMu.Lock()
 		if !d.vfsd.IsDead() {
@@ -515,23 +607,48 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 			panic("overlay.dentry.DecRef() called without holding a reference")
 		}
 	}
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Unregister(d, "overlay.dentry")
+	}
+}
+
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+	return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
 }
 
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
-	// TODO(gvisor.dev/issue/1479): Implement inotify.
+	if d.isDir() {
+		events |= linux.IN_ISDIR
+	}
+
+	// overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
+	// that d was deleted.
+	deleted := d.vfsd.IsDead()
+
+	d.fs.renameMu.RLock()
+	// The ordering below is important, Linux always notifies the parent first.
+	if d.parent != nil {
+		d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted)
+	}
+	d.watches.Notify(ctx, "", events, cookie, et, deleted)
+	d.fs.renameMu.RUnlock()
 }
 
 // Watches implements vfs.DentryImpl.Watches.
 func (d *dentry) Watches() *vfs.Watches {
-	// TODO(gvisor.dev/issue/1479): Implement inotify.
-	return nil
+	return &d.watches
 }
 
 // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
-//
-// TODO(gvisor.dev/issue/1479): Implement inotify.
-func (d *dentry) OnZeroWatches(context.Context) {}
+func (d *dentry) OnZeroWatches(ctx context.Context) {
+	if atomic.LoadInt64(&d.refs) == 0 {
+		d.fs.renameMu.Lock()
+		d.checkDropLocked(ctx)
+		d.fs.renameMu.Unlock()
+	}
+}
 
 // iterLayers invokes yield on each layer comprising d, from top to bottom. If
 // any call to yield returns false, iterLayer stops iteration.
@@ -564,6 +681,16 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
+	}
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
+}
+
 // statInternalMask is the set of stat fields that is set by
 // dentry.statInternalTo().
 const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
@@ -602,6 +729,8 @@ func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
 
 // fileDescription is embedded by overlay implementations of
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -616,6 +745,48 @@ func (fd *fileDescription) dentry() *dentry {
 	return fd.vfsfd.Dentry().Impl().(*dentry)
 }
 
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
+}
+
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
+	fs := fd.filesystem()
+	d := fd.dentry()
+
+	fs.renameMu.RLock()
+	err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
+	fs.renameMu.RUnlock()
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
+	fs := fd.filesystem()
+	d := fd.dentry()
+
+	fs.renameMu.RLock()
+	err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
+	fs.renameMu.RUnlock()
+	if err != nil {
+		return err
+	}
+
+	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
+	return nil
+}
+
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
 	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/overlay/regular_file.go b/pkg/sentry/fsimpl/overlay/regular_file.go
new file mode 100644
index 000000000..2b89a7a6d
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/regular_file.go
@@ -0,0 +1,456 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/log"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/sync"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+	"gvisor.dev/gvisor/pkg/waiter"
+)
+
+func (d *dentry) isRegularFile() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFREG
+}
+
+func (d *dentry) isSymlink() bool {
+	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
+}
+
+func (d *dentry) readlink(ctx context.Context) (string, error) {
+	layerVD := d.topLayer()
+	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  layerVD,
+		Start: layerVD,
+	})
+}
+
+// +stateify savable
+type regularFileFD struct {
+	fileDescription
+
+	// If copiedUp is false, cachedFD represents
+	// fileDescription.dentry().lowerVDs[0]; otherwise, cachedFD represents
+	// fileDescription.dentry().upperVD. cachedFlags is the last known value of
+	// cachedFD.StatusFlags(). copiedUp, cachedFD, and cachedFlags are
+	// protected by mu.
+	mu          sync.Mutex `state:"nosave"`
+	copiedUp    bool
+	cachedFD    *vfs.FileDescription
+	cachedFlags uint32
+
+	// If copiedUp is false, lowerWaiters contains all waiter.Entries
+	// registered with cachedFD. lowerWaiters is protected by mu.
+	lowerWaiters map[*waiter.Entry]waiter.EventMask
+}
+
+func (fd *regularFileFD) getCurrentFD(ctx context.Context) (*vfs.FileDescription, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return nil, err
+	}
+	wrappedFD.IncRef()
+	return wrappedFD, nil
+}
+
+func (fd *regularFileFD) currentFDLocked(ctx context.Context) (*vfs.FileDescription, error) {
+	d := fd.dentry()
+	statusFlags := fd.vfsfd.StatusFlags()
+	if !fd.copiedUp && d.isCopiedUp() {
+		// Switch to the copied-up file.
+		upperVD := d.topLayer()
+		upperFD, err := fd.filesystem().vfsfs.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  upperVD,
+			Start: upperVD,
+		}, &vfs.OpenOptions{
+			Flags: statusFlags,
+		})
+		if err != nil {
+			return nil, err
+		}
+		oldOff, oldOffErr := fd.cachedFD.Seek(ctx, 0, linux.SEEK_CUR)
+		if oldOffErr == nil {
+			if _, err := upperFD.Seek(ctx, oldOff, linux.SEEK_SET); err != nil {
+				upperFD.DecRef(ctx)
+				return nil, err
+			}
+		}
+		if len(fd.lowerWaiters) != 0 {
+			ready := upperFD.Readiness(^waiter.EventMask(0))
+			for e, mask := range fd.lowerWaiters {
+				fd.cachedFD.EventUnregister(e)
+				upperFD.EventRegister(e, mask)
+				if ready&mask != 0 {
+					e.Callback.Callback(e)
+				}
+			}
+		}
+		fd.cachedFD.DecRef(ctx)
+		fd.copiedUp = true
+		fd.cachedFD = upperFD
+		fd.cachedFlags = statusFlags
+		fd.lowerWaiters = nil
+	} else if fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, d.fs.creds, statusFlags); err != nil {
+			return nil, err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	return fd.cachedFD, nil
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release(ctx context.Context) {
+	fd.cachedFD.DecRef(ctx)
+	fd.cachedFD = nil
+}
+
+// OnClose implements vfs.FileDescriptionImpl.OnClose.
+func (fd *regularFileFD) OnClose(ctx context.Context) error {
+	// Linux doesn't define ovl_file_operations.flush at all (i.e. its
+	// equivalent to OnClose is a no-op). We pass through to
+	// fd.cachedFD.OnClose() without upgrading if fd.dentry() has been
+	// copied-up, since OnClose is mostly used to define post-close writeback,
+	// and if fd.cachedFD hasn't been updated then it can't have been used to
+	// mutate fd.dentry() anyway.
+	fd.mu.Lock()
+	if statusFlags := fd.vfsfd.StatusFlags(); fd.cachedFlags != statusFlags {
+		if err := fd.cachedFD.SetStatusFlags(ctx, fd.filesystem().creds, statusFlags); err != nil {
+			fd.mu.Unlock()
+			return err
+		}
+		fd.cachedFlags = statusFlags
+	}
+	wrappedFD := fd.cachedFD
+	fd.mu.Unlock()
+	return wrappedFD.OnClose(ctx)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *regularFileFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
+		wrappedFD, err := fd.getCurrentFD(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+		stat, err = wrappedFD.Stat(ctx, vfs.StatOptions{
+			Mask: layerMask,
+			Sync: opts.Sync,
+		})
+		wrappedFD.DecRef(ctx)
+		if err != nil {
+			return linux.Statx{}, err
+		}
+	}
+	fd.dentry().statInternalTo(ctx, &opts, &stat)
+	return stat, nil
+}
+
+// Allocate implements vfs.FileDescriptionImpl.Allocate.
+func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.Allocate(ctx, mode, offset, length)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *regularFileFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	d := fd.dentry()
+	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
+	if err := vfs.CheckSetStat(ctx, auth.CredentialsFromContext(ctx), &opts, mode, auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid))); err != nil {
+		return err
+	}
+	mnt := fd.vfsfd.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	if err := d.copyUpLocked(ctx); err != nil {
+		return err
+	}
+	// Changes to d's attributes are serialized by d.copyMu.
+	d.copyMu.Lock()
+	defer d.copyMu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := wrappedFD.SetStat(ctx, opts); err != nil {
+		return err
+	}
+	d.updateAfterSetStatLocked(&opts)
+	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
+		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
+	}
+	return nil
+}
+
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *regularFileFD) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return fd.filesystem().statFS(ctx)
+}
+
+// Readiness implements waiter.Waitable.Readiness.
+func (fd *regularFileFD) Readiness(mask waiter.EventMask) waiter.EventMask {
+	ctx := context.Background()
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		// TODO(b/171089913): Just use fd.cachedFD since Readiness can't return
+		// an error. This is obviously wrong, but at least consistent with
+		// VFS1.
+		log.Warningf("overlay.regularFileFD.Readiness: currentFDLocked failed: %v", err)
+		fd.mu.Lock()
+		wrappedFD = fd.cachedFD
+		wrappedFD.IncRef()
+		fd.mu.Unlock()
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.Readiness(mask)
+}
+
+// EventRegister implements waiter.Waitable.EventRegister.
+func (fd *regularFileFD) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(context.Background())
+	if err != nil {
+		// TODO(b/171089913): Just use fd.cachedFD since EventRegister can't
+		// return an error. This is obviously wrong, but at least consistent
+		// with VFS1.
+		log.Warningf("overlay.regularFileFD.EventRegister: currentFDLocked failed: %v", err)
+		wrappedFD = fd.cachedFD
+	}
+	wrappedFD.EventRegister(e, mask)
+	if !fd.copiedUp {
+		if fd.lowerWaiters == nil {
+			fd.lowerWaiters = make(map[*waiter.Entry]waiter.EventMask)
+		}
+		fd.lowerWaiters[e] = mask
+	}
+}
+
+// EventUnregister implements waiter.Waitable.EventUnregister.
+func (fd *regularFileFD) EventUnregister(e *waiter.Entry) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	fd.cachedFD.EventUnregister(e)
+	if !fd.copiedUp {
+		delete(fd.lowerWaiters, e)
+	}
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.PRead(ctx, dst, offset, opts)
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Hold fd.mu during the read to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Read(ctx, dst, opts)
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.PWrite(ctx, src, offset, opts)
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	// Hold fd.mu during the write to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Write(ctx, src, opts)
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	// Hold fd.mu during the seek to serialize the file offset.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return 0, err
+	}
+	return wrappedFD.Seek(ctx, offset, whence)
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	fd.mu.Lock()
+	if !fd.dentry().isCopiedUp() {
+		fd.mu.Unlock()
+		return nil
+	}
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		fd.mu.Unlock()
+		return err
+	}
+	wrappedFD.IncRef()
+	defer wrappedFD.DecRef(ctx)
+	fd.mu.Unlock()
+	return wrappedFD.Sync(ctx)
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *regularFileFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	wrappedFD, err := fd.getCurrentFD(ctx)
+	if err != nil {
+		return 0, err
+	}
+	defer wrappedFD.DecRef(ctx)
+	return wrappedFD.Ioctl(ctx, uio, args)
+}
+
+// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
+func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	if err := fd.ensureMappable(ctx, opts); err != nil {
+		return err
+	}
+	return vfs.GenericConfigureMMap(&fd.vfsfd, fd.dentry(), opts)
+}
+
+// ensureMappable ensures that fd.dentry().wrappedMappable is not nil.
+func (fd *regularFileFD) ensureMappable(ctx context.Context, opts *memmap.MMapOpts) error {
+	d := fd.dentry()
+
+	// Fast path if we already have a Mappable for the current top layer.
+	if atomic.LoadUint32(&d.isMappable) != 0 {
+		return nil
+	}
+
+	// Only permit mmap of regular files, since other file types may have
+	// unpredictable behavior when mmapped (e.g. /dev/zero).
+	if atomic.LoadUint32(&d.mode)&linux.S_IFMT != linux.S_IFREG {
+		return syserror.ENODEV
+	}
+
+	// Get a Mappable for the current top layer.
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	d.copyMu.RLock()
+	defer d.copyMu.RUnlock()
+	if atomic.LoadUint32(&d.isMappable) != 0 {
+		return nil
+	}
+	wrappedFD, err := fd.currentFDLocked(ctx)
+	if err != nil {
+		return err
+	}
+	if err := wrappedFD.ConfigureMMap(ctx, opts); err != nil {
+		return err
+	}
+	if opts.MappingIdentity != nil {
+		opts.MappingIdentity.DecRef(ctx)
+		opts.MappingIdentity = nil
+	}
+	// Use this Mappable for all mappings of this layer (unless we raced with
+	// another call to ensureMappable).
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.dataMu.Lock()
+	defer d.dataMu.Unlock()
+	if d.wrappedMappable == nil {
+		d.wrappedMappable = opts.Mappable
+		atomic.StoreUint32(&d.isMappable, 1)
+	}
+	return nil
+}
+
+// AddMapping implements memmap.Mappable.AddMapping.
+func (d *dentry) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	if err := d.wrappedMappable.AddMapping(ctx, ms, ar, offset, writable); err != nil {
+		return err
+	}
+	if !d.isCopiedUp() {
+		d.lowerMappings.AddMapping(ms, ar, offset, writable)
+	}
+	return nil
+}
+
+// RemoveMapping implements memmap.Mappable.RemoveMapping.
+func (d *dentry) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	d.wrappedMappable.RemoveMapping(ctx, ms, ar, offset, writable)
+	if !d.isCopiedUp() {
+		d.lowerMappings.RemoveMapping(ms, ar, offset, writable)
+	}
+}
+
+// CopyMapping implements memmap.Mappable.CopyMapping.
+func (d *dentry) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	if err := d.wrappedMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable); err != nil {
+		return err
+	}
+	if !d.isCopiedUp() {
+		d.lowerMappings.AddMapping(ms, dstAR, offset, writable)
+	}
+	return nil
+}
+
+// Translate implements memmap.Mappable.Translate.
+func (d *dentry) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
+	d.dataMu.RLock()
+	defer d.dataMu.RUnlock()
+	return d.wrappedMappable.Translate(ctx, required, optional, at)
+}
+
+// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
+func (d *dentry) InvalidateUnsavable(ctx context.Context) error {
+	d.mapsMu.Lock()
+	defer d.mapsMu.Unlock()
+	return d.wrappedMappable.InvalidateUnsavable(ctx)
+}
diff --git a/pkg/sentry/fsimpl/overlay/save_restore.go b/pkg/sentry/fsimpl/overlay/save_restore.go
new file mode 100644
index 000000000..054e17b17
--- /dev/null
+++ b/pkg/sentry/fsimpl/overlay/save_restore.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package overlay
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+func (d *dentry) afterLoad() {
+	if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+		refsvfs2.Register(d, "overlay.dentry")
+	}
+}
diff --git a/pkg/sentry/fsimpl/pipefs/pipefs.go b/pkg/sentry/fsimpl/pipefs/pipefs.go
index 2ca793db9..e44b79b68 100644
--- a/pkg/sentry/fsimpl/pipefs/pipefs.go
+++ b/pkg/sentry/fsimpl/pipefs/pipefs.go
@@ -31,6 +31,7 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type filesystemType struct{}
 
 // Name implements vfs.FilesystemType.Name.
@@ -38,11 +39,15 @@ func (filesystemType) Name() string {
 	return "pipefs"
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (filesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	panic("pipefs.filesystemType.GetFilesystem should never be called")
 }
 
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -76,6 +81,8 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 }
 
 // inode implements kernfs.Inode.
+//
+// +stateify savable
 type inode struct {
 	kernfs.InodeNotDirectory
 	kernfs.InodeNotSymlink
@@ -143,12 +150,14 @@ func (i *inode) SetStat(ctx context.Context, vfsfs *vfs.Filesystem, creds *auth.
 	return syserror.EPERM
 }
 
-// TODO(gvisor.dev/issue/1193): kernfs does not provide a way to implement
-// statfs, from which we should indicate PIPEFS_MAGIC.
-
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	return i.pipe.Open(ctx, rp.Mount(), vfsd, opts.Flags, &i.locks)
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	return i.pipe.Open(ctx, rp.Mount(), d.VFSDentry(), opts.Flags, &i.locks)
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.PIPEFS_MAGIC), nil
 }
 
 // NewConnectedPipeFDs returns a pair of FileDescriptions representing the read
@@ -159,7 +168,7 @@ func NewConnectedPipeFDs(ctx context.Context, mnt *vfs.Mount, flags uint32) (*vf
 	fs := mnt.Filesystem().Impl().(*filesystem)
 	inode := newInode(ctx, fs)
 	var d kernfs.Dentry
-	d.Init(inode)
+	d.Init(&fs.Filesystem, inode)
 	defer d.DecRef(ctx)
 	return inode.pipe.ReaderWriterPair(mnt, d.VFSDentry(), flags)
 }
diff --git a/pkg/sentry/fsimpl/proc/BUILD b/pkg/sentry/fsimpl/proc/BUILD
index 14ecfd300..5196a2a80 100644
--- a/pkg/sentry/fsimpl/proc/BUILD
+++ b/pkg/sentry/fsimpl/proc/BUILD
@@ -1,18 +1,79 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "fd_dir_inode_refs",
+    out = "fd_dir_inode_refs.go",
+    package = "proc",
+    prefix = "fdDirInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "fdDirInode",
+    },
+)
+
+go_template_instance(
+    name = "fd_info_dir_inode_refs",
+    out = "fd_info_dir_inode_refs.go",
+    package = "proc",
+    prefix = "fdInfoDirInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "fdInfoDirInode",
+    },
+)
+
+go_template_instance(
+    name = "subtasks_inode_refs",
+    out = "subtasks_inode_refs.go",
+    package = "proc",
+    prefix = "subtasksInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "subtasksInode",
+    },
+)
+
+go_template_instance(
+    name = "task_inode_refs",
+    out = "task_inode_refs.go",
+    package = "proc",
+    prefix = "taskInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "taskInode",
+    },
+)
+
+go_template_instance(
+    name = "tasks_inode_refs",
+    out = "tasks_inode_refs.go",
+    package = "proc",
+    prefix = "tasksInode",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "tasksInode",
+    },
+)
+
 go_library(
     name = "proc",
     srcs = [
+        "fd_dir_inode_refs.go",
+        "fd_info_dir_inode_refs.go",
         "filesystem.go",
         "subtasks.go",
+        "subtasks_inode_refs.go",
         "task.go",
         "task_fds.go",
         "task_files.go",
+        "task_inode_refs.go",
         "task_net.go",
         "tasks.go",
         "tasks_files.go",
+        "tasks_inode_refs.go",
         "tasks_sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -21,6 +82,7 @@ go_library(
         "//pkg/context",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/fs/lock",
         "//pkg/sentry/fsbridge",
@@ -39,6 +101,7 @@ go_library(
         "//pkg/sync",
         "//pkg/syserror",
         "//pkg/tcpip/header",
+        "//pkg/tcpip/network/ipv4",
         "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/proc/filesystem.go b/pkg/sentry/fsimpl/proc/filesystem.go
index 2463d51cd..99abcab66 100644
--- a/pkg/sentry/fsimpl/proc/filesystem.go
+++ b/pkg/sentry/fsimpl/proc/filesystem.go
@@ -17,6 +17,7 @@ package proc
 
 import (
 	"fmt"
+	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
@@ -24,23 +25,29 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Name is the default filesystem name.
-const Name = "proc"
+const (
+	// Name is the default filesystem name.
+	Name                     = "proc"
+	defaultMaxCachedDentries = uint64(1000)
+)
 
 // FilesystemType is the factory class for procfs.
 //
 // +stateify savable
 type FilesystemType struct{}
 
-var _ vfs.FilesystemType = (*FilesystemType)(nil)
-
 // Name implements vfs.FilesystemType.Name.
 func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -61,9 +68,22 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
 	if err != nil {
 		return nil, nil, err
 	}
+
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	maxCachedDentries := defaultMaxCachedDentries
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("proc.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
 	procfs := &filesystem{
 		devMinor: devMinor,
 	}
+	procfs.MaxCachedDentries = maxCachedDentries
 	procfs.VFSFilesystem().Init(vfsObj, &ft, procfs)
 
 	var cgroups map[string]string
@@ -72,7 +92,9 @@ func (ft FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualF
 		cgroups = data.Cgroups
 	}
 
-	_, dentry := procfs.newTasksInode(k, pidns, cgroups)
+	inode := procfs.newTasksInode(ctx, k, pidns, cgroups)
+	var dentry kernfs.Dentry
+	dentry.Init(&procfs.Filesystem, inode)
 	return procfs.VFSFilesystem(), dentry.VFSDentry(), nil
 }
 
@@ -84,21 +106,21 @@ func (fs *filesystem) Release(ctx context.Context) {
 
 // dynamicInode is an overfitted interface for common Inodes with
 // dynamicByteSource types used in procfs.
+//
+// +stateify savable
 type dynamicInode interface {
 	kernfs.Inode
 	vfs.DynamicBytesSource
 
-	Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
+	Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, data vfs.DynamicBytesSource, perm linux.FileMode)
 }
 
-func (fs *filesystem) newDentry(creds *auth.Credentials, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
-	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+func (fs *filesystem) newInode(ctx context.Context, creds *auth.Credentials, perm linux.FileMode, inode dynamicInode) dynamicInode {
+	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), inode, perm)
+	return inode
 }
 
+// +stateify savable
 type staticFile struct {
 	kernfs.DynamicBytesFile
 	vfs.StaticData
@@ -110,8 +132,24 @@ func newStaticFile(data string) *staticFile {
 	return &staticFile{StaticData: vfs.StaticData{Data: data}}
 }
 
+func (fs *filesystem) newStaticDir(ctx context.Context, creds *auth.Credentials, children map[string]kernfs.Inode) kernfs.Inode {
+	return kernfs.NewStaticDir(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, children, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
+}
+
 // InternalData contains internal data passed in to the procfs mount via
 // vfs.GetFilesystemOptions.InternalData.
+//
+// +stateify savable
 type InternalData struct {
 	Cgroups map[string]string
 }
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.PROC_SUPER_MAGIC), nil
+}
diff --git a/pkg/sentry/fsimpl/proc/subtasks.go b/pkg/sentry/fsimpl/proc/subtasks.go
index 79c2725f3..cb3c5e0fd 100644
--- a/pkg/sentry/fsimpl/proc/subtasks.go
+++ b/pkg/sentry/fsimpl/proc/subtasks.go
@@ -31,11 +31,14 @@ import (
 //
 // +stateify savable
 type subtasksInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
+	subtasksInodeRefs
 
 	locks vfs.FileLocks
 
@@ -47,7 +50,7 @@ type subtasksInode struct {
 
 var _ kernfs.Inode = (*subtasksInode)(nil)
 
-func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *kernfs.Dentry {
+func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) kernfs.Inode {
 	subInode := &subtasksInode{
 		fs:                fs,
 		task:              task,
@@ -55,18 +58,16 @@ func (fs *filesystem) newSubtasks(task *kernel.Task, pidns *kernel.PIDNamespace,
 		cgroupControllers: cgroupControllers,
 	}
 	// Note: credentials are overridden by taskOwnedInode.
-	subInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	subInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
 	subInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	subInode.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: subInode, owner: task}
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
-
-	return dentry
+	return inode
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *subtasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
 	tid, err := strconv.ParseUint(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
@@ -79,13 +80,11 @@ func (i *subtasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, e
 	if subTask.ThreadGroup() != i.task.ThreadGroup() {
 		return nil, syserror.ENOENT
 	}
-
-	subTaskDentry := i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
-	return subTaskDentry.VFSDentry(), nil
+	return i.fs.newTaskInode(subTask, i.pidns, false, i.cgroupControllers)
 }
 
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *subtasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	tasks := i.task.ThreadGroup().MemberIDs(i.pidns)
 	if len(tasks) == 0 {
 		return offset, syserror.ENOENT
@@ -115,6 +114,7 @@ func (i *subtasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallb
 	return offset, nil
 }
 
+// +stateify savable
 type subtasksFD struct {
 	kernfs.GenericDirectoryFD
 
@@ -152,19 +152,21 @@ func (fd *subtasksFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) erro
 	return fd.GenericDirectoryFD.SetStat(ctx, opts)
 }
 
-// Open implements kernfs.Inode.
-func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *subtasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &subtasksFD{task: i.task}
-	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts); err != nil {
+	if err := fd.Init(&i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	}); err != nil {
 		return nil, err
 	}
-	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.VFSFileDescription().Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
 func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	stat, err := i.InodeAttrs.Stat(ctx, vsfs, opts)
 	if err != nil {
@@ -176,7 +178,12 @@ func (i *subtasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs
 	return stat, nil
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*subtasksInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
+
+// DecRef implements kernfs.Inode.DecRef.
+func (i *subtasksInode) DecRef(ctx context.Context) {
+	i.subtasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
diff --git a/pkg/sentry/fsimpl/proc/task.go b/pkg/sentry/fsimpl/proc/task.go
index a5c7aa470..57cf8ce26 100644
--- a/pkg/sentry/fsimpl/proc/task.go
+++ b/pkg/sentry/fsimpl/proc/task.go
@@ -32,11 +32,13 @@ import (
 //
 // +stateify savable
 type taskInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
-	kernfs.InodeNoDynamicLookup
+	implStatFS
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
+	taskInodeRefs
 
 	locks vfs.FileLocks
 
@@ -45,80 +47,92 @@ type taskInode struct {
 
 var _ kernfs.Inode = (*taskInode)(nil)
 
-func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) *kernfs.Dentry {
-	// TODO(gvisor.dev/issue/164): Fail with ESRCH if task exited.
-	contents := map[string]*kernfs.Dentry{
-		"auxv":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &auxvData{task: task}),
-		"cmdline":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
+func (fs *filesystem) newTaskInode(task *kernel.Task, pidns *kernel.PIDNamespace, isThreadGroup bool, cgroupControllers map[string]string) (kernfs.Inode, error) {
+	if task.ExitState() == kernel.TaskExitDead {
+		return nil, syserror.ESRCH
+	}
+
+	contents := map[string]kernfs.Inode{
+		"auxv":      fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &auxvData{task: task}),
+		"cmdline":   fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: cmdlineDataArg}),
 		"comm":      fs.newComm(task, fs.NextIno(), 0444),
-		"environ":   fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
+		"cwd":       fs.newCwdSymlink(task, fs.NextIno()),
+		"environ":   fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &cmdlineData{task: task, arg: environDataArg}),
 		"exe":       fs.newExeSymlink(task, fs.NextIno()),
 		"fd":        fs.newFDDirInode(task),
 		"fdinfo":    fs.newFDInfoDirInode(task),
-		"gid_map":   fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
-		"io":        fs.newTaskOwnedFile(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
-		"maps":      fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mapsData{task: task}),
-		"mountinfo": fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
-		"mounts":    fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &mountsData{task: task}),
+		"gid_map":   fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: true}),
+		"io":        fs.newTaskOwnedInode(task, fs.NextIno(), 0400, newIO(task, isThreadGroup)),
+		"maps":      fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mapsData{task: task}),
+		"mountinfo": fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountInfoData{task: task}),
+		"mounts":    fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &mountsData{task: task}),
 		"net":       fs.newTaskNetDir(task),
-		"ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]*kernfs.Dentry{
+		"ns": fs.newTaskOwnedDir(task, fs.NextIno(), 0511, map[string]kernfs.Inode{
 			"net":  fs.newNamespaceSymlink(task, fs.NextIno(), "net"),
 			"pid":  fs.newNamespaceSymlink(task, fs.NextIno(), "pid"),
 			"user": fs.newNamespaceSymlink(task, fs.NextIno(), "user"),
 		}),
-		"oom_score":     fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newStaticFile("0\n")),
-		"oom_score_adj": fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
-		"smaps":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &smapsData{task: task}),
-		"stat":          fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
-		"statm":         fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statmData{task: task}),
-		"status":        fs.newTaskOwnedFile(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
-		"uid_map":       fs.newTaskOwnedFile(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
+		"oom_score":     fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"oom_score_adj": fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &oomScoreAdj{task: task}),
+		"smaps":         fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &smapsData{task: task}),
+		"stat":          fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &taskStatData{task: task, pidns: pidns, tgstats: isThreadGroup}),
+		"statm":         fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statmData{task: task}),
+		"status":        fs.newTaskOwnedInode(task, fs.NextIno(), 0444, &statusData{task: task, pidns: pidns}),
+		"uid_map":       fs.newTaskOwnedInode(task, fs.NextIno(), 0644, &idMapData{task: task, gids: false}),
 	}
 	if isThreadGroup {
 		contents["task"] = fs.newSubtasks(task, pidns, cgroupControllers)
 	}
 	if len(cgroupControllers) > 0 {
-		contents["cgroup"] = fs.newTaskOwnedFile(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
+		contents["cgroup"] = fs.newTaskOwnedInode(task, fs.NextIno(), 0444, newCgroupData(cgroupControllers))
 	}
 
 	taskInode := &taskInode{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
-	taskInode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	taskInode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	taskInode.EnableLeakCheck()
 
 	inode := &taskOwnedInode{Inode: taskInode, owner: task}
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
 
 	taskInode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := taskInode.OrderedChildren.Populate(dentry, contents)
+	links := taskInode.OrderedChildren.Populate(contents)
 	taskInode.IncLinks(links)
 
-	return dentry
+	return inode, nil
 }
 
-// Valid implements kernfs.inodeDynamicLookup. This inode remains valid as long
+// Valid implements kernfs.Inode.Valid. This inode remains valid as long
 // as the task is still running. When it's dead, another tasks with the same
 // PID could replace it.
 func (i *taskInode) Valid(ctx context.Context) bool {
 	return i.task.ExitState() != kernel.TaskExitDead
 }
 
-// Open implements kernfs.Inode.
-func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *taskInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*taskInode) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *taskInode) DecRef(ctx context.Context) {
+	i.taskInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // taskOwnedInode implements kernfs.Inode and overrides inode owner with task
 // effective user and group.
+//
+// +stateify savable
 type taskOwnedInode struct {
 	kernfs.Inode
 
@@ -128,34 +142,26 @@ type taskOwnedInode struct {
 
 var _ kernfs.Inode = (*taskOwnedInode)(nil)
 
-func (fs *filesystem) newTaskOwnedFile(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) *kernfs.Dentry {
+func (fs *filesystem) newTaskOwnedInode(task *kernel.Task, ino uint64, perm linux.FileMode, inode dynamicInode) kernfs.Inode {
 	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, inode, perm)
 
-	taskInode := &taskOwnedInode{Inode: inode, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(taskInode)
-	return d
+	return &taskOwnedInode{Inode: inode, owner: task}
 }
 
-func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]*kernfs.Dentry) *kernfs.Dentry {
-	dir := &kernfs.StaticDirectory{}
-
+func (fs *filesystem) newTaskOwnedDir(task *kernel.Task, ino uint64, perm linux.FileMode, children map[string]kernfs.Inode) kernfs.Inode {
 	// Note: credentials are overridden by taskOwnedInode.
-	dir.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm)
-
-	inode := &taskOwnedInode{Inode: dir, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(inode)
+	fdOpts := kernfs.GenericDirectoryFDOptions{SeekEnd: kernfs.SeekEndZero}
+	dir := kernfs.NewStaticDir(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, perm, children, fdOpts)
 
-	dir.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := dir.OrderedChildren.Populate(d, children)
-	dir.IncLinks(links)
+	return &taskOwnedInode{Inode: dir, owner: task}
+}
 
-	return d
+func (i *taskOwnedInode) Valid(ctx context.Context) bool {
+	return i.owner.ExitState() != kernel.TaskExitDead && i.Inode.Valid(ctx)
 }
 
-// Stat implements kernfs.Inode.
+// Stat implements kernfs.Inode.Stat.
 func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) {
 	stat, err := i.Inode.Stat(ctx, fs, opts)
 	if err != nil {
@@ -173,7 +179,7 @@ func (i *taskOwnedInode) Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.
 	return stat, nil
 }
 
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
 func (i *taskOwnedInode) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
 	mode := i.Mode()
 	uid, gid := i.getOwner(mode)
diff --git a/pkg/sentry/fsimpl/proc/task_fds.go b/pkg/sentry/fsimpl/proc/task_fds.go
index f0d3f7f5e..d268b44be 100644
--- a/pkg/sentry/fsimpl/proc/task_fds.go
+++ b/pkg/sentry/fsimpl/proc/task_fds.go
@@ -22,7 +22,6 @@ import (
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
-	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -52,6 +51,7 @@ func taskFDExists(ctx context.Context, t *kernel.Task, fd int32) bool {
 	return true
 }
 
+// +stateify savable
 type fdDir struct {
 	locks vfs.FileLocks
 
@@ -63,8 +63,8 @@ type fdDir struct {
 	produceSymlink bool
 }
 
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdDir) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
 	var fds []int32
 	i.task.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
@@ -87,31 +87,39 @@ func (i *fdDir) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, off
 			Name:    strconv.FormatUint(uint64(fd), 10),
 			Type:    typ,
 			Ino:     i.fs.NextIno(),
-			NextOff: offset + 1,
+			NextOff: int64(fd) + 3,
 		}
 		if err := cb.Handle(dirent); err != nil {
-			return offset, err
+			// Getdents should iterate correctly despite mutation
+			// of fds, so we return the next fd to serialize plus
+			// 2 (which accounts for the "." and ".." tracked by
+			// kernfs) as the offset.
+			return int64(fd) + 2, err
 		}
-		offset++
 	}
-	return offset, nil
+	// We serialized them all.  Next offset should be higher than last
+	// serialized fd.
+	return int64(fds[len(fds)-1]) + 3, nil
 }
 
 // fdDirInode represents the inode for /proc/[pid]/fd directory.
 //
 // +stateify savable
 type fdDirInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	fdDir
+	fdDirInodeRefs
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
-	fdDir
 }
 
 var _ kernfs.Inode = (*fdDirInode)(nil)
 
-func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDDirInode(task *kernel.Task) kernfs.Inode {
 	inode := &fdDirInode{
 		fdDir: fdDir{
 			fs:             fs,
@@ -119,17 +127,19 @@ func (fs *filesystem) newFDDirInode(task *kernel.Task) *kernfs.Dentry {
 			produceSymlink: true,
 		},
 	}
-	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
+	return inode
+}
 
-	return dentry
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *fdDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (int64, error) {
+	return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset)
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
 	fdInt, err := strconv.ParseInt(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
@@ -138,20 +148,21 @@ func (i *fdDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 	if !taskFDExists(ctx, i.task, fd) {
 		return nil, syserror.ENOENT
 	}
-	taskDentry := i.fs.newFDSymlink(i.task, fd, i.fs.NextIno())
-	return taskDentry.VFSDentry(), nil
+	return i.fs.newFDSymlink(i.task, fd, i.fs.NextIno()), nil
 }
 
-// Open implements kernfs.Inode.
-func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *fdDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
-// CheckPermissions implements kernfs.Inode.
+// CheckPermissions implements kernfs.Inode.CheckPermissions.
 //
 // This is to match Linux, which uses a special permission handler to guarantee
 // that a process can still access /proc/self/fd after it has executed
@@ -173,10 +184,16 @@ func (i *fdDirInode) CheckPermissions(ctx context.Context, creds *auth.Credentia
 	return err
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdDirInode) DecRef(ctx context.Context) {
+	i.fdDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // fdSymlink is an symlink for the /proc/[pid]/fd/[fd] file.
 //
 // +stateify savable
 type fdSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -187,19 +204,16 @@ type fdSymlink struct {
 
 var _ kernfs.Inode = (*fdSymlink)(nil)
 
-func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newFDSymlink(task *kernel.Task, fd int32, ino uint64) kernfs.Inode {
 	inode := &fdSymlink{
 		task: task,
 		fd:   fd,
 	}
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+	return inode
 }
 
-func (s *fdSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *fdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
 	file, _ := getTaskFD(s.task, s.fd)
 	if file == nil {
 		return "", syserror.ENOENT
@@ -221,38 +235,43 @@ func (s *fdSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDen
 	return vd, "", nil
 }
 
+// Valid implements kernfs.Inode.Valid.
+func (s *fdSymlink) Valid(ctx context.Context) bool {
+	return taskFDExists(ctx, s.task, s.fd)
+}
+
 // fdInfoDirInode represents the inode for /proc/[pid]/fdinfo directory.
 //
 // +stateify savable
 type fdInfoDirInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	fdDir
+	fdInfoDirInodeRefs
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
-	fdDir
 }
 
 var _ kernfs.Inode = (*fdInfoDirInode)(nil)
 
-func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newFDInfoDirInode(task *kernel.Task) kernfs.Inode {
 	inode := &fdInfoDirInode{
 		fdDir: fdDir{
 			fs:   fs,
 			task: task,
 		},
 	}
-	inode.InodeAttrs.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.InodeAttrs.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-
-	return dentry
+	return inode
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
 	fdInt, err := strconv.ParseInt(name, 10, 32)
 	if err != nil {
 		return nil, syserror.ENOENT
@@ -265,25 +284,35 @@ func (i *fdInfoDirInode) Lookup(ctx context.Context, name string) (*vfs.Dentry,
 		task: i.task,
 		fd:   fd,
 	}
-	dentry := i.fs.newTaskOwnedFile(i.task, i.fs.NextIno(), 0444, data)
-	return dentry.VFSDentry(), nil
+	return i.fs.newTaskOwnedInode(i.task, i.fs.NextIno(), 0444, data), nil
 }
 
-// Open implements kernfs.Inode.
-func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// IterDirents implements Inode.IterDirents.
+func (i *fdInfoDirInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
+	return i.fdDir.IterDirents(ctx, mnt, cb, offset, relOffset)
+}
+
+// Open implements kernfs.Inode.Open.
+func (i *fdInfoDirInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *fdInfoDirInode) DecRef(ctx context.Context) {
+	i.fdInfoDirInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // fdInfoData implements vfs.DynamicBytesSource for /proc/[pid]/fdinfo/[fd].
 //
 // +stateify savable
 type fdInfoData struct {
 	kernfs.DynamicBytesFile
-	refs.AtomicRefCount
 
 	task *kernel.Task
 	fd   int32
@@ -305,3 +334,8 @@ func (d *fdInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "flags:\t0%o\n", flags)
 	return nil
 }
+
+// Valid implements kernfs.Inode.Valid.
+func (d *fdInfoData) Valid(ctx context.Context) bool {
+	return taskFDExists(ctx, d.task, d.fd)
+}
diff --git a/pkg/sentry/fsimpl/proc/task_files.go b/pkg/sentry/fsimpl/proc/task_files.go
index 830b78949..d3f4e259b 100644
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@@ -247,13 +247,10 @@ type commInode struct {
 	task *kernel.Task
 }
 
-func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newComm(task *kernel.Task, ino uint64, perm linux.FileMode) kernfs.Inode {
 	inode := &commInode{task: task}
-	inode.DynamicBytesFile.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+	inode.DynamicBytesFile.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, &commData{task: task}, perm)
+	return inode
 }
 
 func (i *commInode) CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
@@ -543,7 +540,7 @@ func (s *statusData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	var vss, rss, data uint64
 	s.task.WithMuLocked(func(t *kernel.Task) {
 		if fdTable := t.FDTable(); fdTable != nil {
-			fds = fdTable.Size()
+			fds = fdTable.CurrentMaxFDs()
 		}
 		if mm := t.MemoryManager(); mm != nil {
 			vss = mm.VirtualMemorySize()
@@ -648,6 +645,7 @@ func (o *oomScoreAdj) Write(ctx context.Context, src usermem.IOSequence, offset
 //
 // +stateify savable
 type exeSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -657,29 +655,30 @@ type exeSymlink struct {
 
 var _ kernfs.Inode = (*exeSymlink)(nil)
 
-func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) *kernfs.Dentry {
+func (fs *filesystem) newExeSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
 	inode := &exeSymlink{task: task}
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+	return inode
 }
 
-// Readlink implements kernfs.Inode.
-func (s *exeSymlink) Readlink(ctx context.Context) (string, error) {
-	if !kernel.ContextCanTrace(ctx, s.task, false) {
-		return "", syserror.EACCES
-	}
-
-	// Pull out the executable for /proc/[pid]/exe.
-	exec, err := s.executable()
+// Readlink implements kernfs.Inode.Readlink.
+func (s *exeSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+	exec, _, err := s.Getlink(ctx, nil)
 	if err != nil {
 		return "", err
 	}
 	defer exec.DecRef(ctx)
 
-	return exec.PathnameWithDeleted(ctx), nil
+	root := vfs.RootFromContext(ctx)
+	if !root.Ok() {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer root.DecRef(ctx)
+
+	vfsObj := exec.Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, exec)
+	return name, nil
 }
 
 // Getlink implements kernfs.Inode.Getlink.
@@ -687,23 +686,12 @@ func (s *exeSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDent
 	if !kernel.ContextCanTrace(ctx, s.task, false) {
 		return vfs.VirtualDentry{}, "", syserror.EACCES
 	}
-
-	exec, err := s.executable()
-	if err != nil {
-		return vfs.VirtualDentry{}, "", err
-	}
-	defer exec.DecRef(ctx)
-
-	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
-	vd.IncRef()
-	return vd, "", nil
-}
-
-func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 	if err := checkTaskState(s.task); err != nil {
-		return nil, err
+		return vfs.VirtualDentry{}, "", err
 	}
 
+	var err error
+	var exec fsbridge.File
 	s.task.WithMuLocked(func(t *kernel.Task) {
 		mm := t.MemoryManager()
 		if mm == nil {
@@ -714,12 +702,75 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 		// The MemoryManager may be destroyed, in which case
 		// MemoryManager.destroy will simply set the executable to nil
 		// (with locks held).
-		file = mm.Executable()
-		if file == nil {
+		exec = mm.Executable()
+		if exec == nil {
 			err = syserror.ESRCH
 		}
 	})
-	return
+	if err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	defer exec.DecRef(ctx)
+
+	vd := exec.(*fsbridge.VFSFile).FileDescription().VirtualDentry()
+	vd.IncRef()
+	return vd, "", nil
+}
+
+// cwdSymlink is an symlink for the /proc/[pid]/cwd file.
+//
+// +stateify savable
+type cwdSymlink struct {
+	implStatFS
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeSymlink
+
+	task *kernel.Task
+}
+
+var _ kernfs.Inode = (*cwdSymlink)(nil)
+
+func (fs *filesystem) newCwdSymlink(task *kernel.Task, ino uint64) kernfs.Inode {
+	inode := &cwdSymlink{task: task}
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
+	return inode
+}
+
+// Readlink implements kernfs.Inode.Readlink.
+func (s *cwdSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
+	cwd, _, err := s.Getlink(ctx, nil)
+	if err != nil {
+		return "", err
+	}
+	defer cwd.DecRef(ctx)
+
+	root := vfs.RootFromContext(ctx)
+	if !root.Ok() {
+		// It could have raced with process deletion.
+		return "", syserror.ESRCH
+	}
+	defer root.DecRef(ctx)
+
+	vfsObj := cwd.Mount().Filesystem().VirtualFilesystem()
+	name, _ := vfsObj.PathnameWithDeleted(ctx, root, cwd)
+	return name, nil
+}
+
+// Getlink implements kernfs.Inode.Getlink.
+func (s *cwdSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	if !kernel.ContextCanTrace(ctx, s.task, false) {
+		return vfs.VirtualDentry{}, "", syserror.EACCES
+	}
+	if err := checkTaskState(s.task); err != nil {
+		return vfs.VirtualDentry{}, "", err
+	}
+	cwd := s.task.FSContext().WorkingDirectoryVFS2()
+	if !cwd.Ok() {
+		// It could have raced with process deletion.
+		return vfs.VirtualDentry{}, "", syserror.ESRCH
+	}
+	return cwd, "", nil
 }
 
 // mountInfoData is used to implement /proc/[pid]/mountinfo.
@@ -784,13 +835,14 @@ func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
+// +stateify savable
 type namespaceSymlink struct {
 	kernfs.StaticSymlink
 
 	task *kernel.Task
 }
 
-func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) *kernfs.Dentry {
+func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns string) kernfs.Inode {
 	// Namespace symlinks should contain the namespace name and the inode number
 	// for the namespace instance, so for example user:[123456]. We currently fake
 	// the inode number by sticking the symlink inode in its place.
@@ -798,40 +850,44 @@ func (fs *filesystem) newNamespaceSymlink(task *kernel.Task, ino uint64, ns stri
 
 	inode := &namespaceSymlink{task: task}
 	// Note: credentials are overridden by taskOwnedInode.
-	inode.Init(task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
+	inode.Init(task, task.Credentials(), linux.UNNAMED_MAJOR, fs.devMinor, ino, target)
 
 	taskInode := &taskOwnedInode{Inode: inode, owner: task}
-	d := &kernfs.Dentry{}
-	d.Init(taskInode)
-	return d
+	return taskInode
 }
 
-// Readlink implements Inode.
-func (s *namespaceSymlink) Readlink(ctx context.Context) (string, error) {
+// Readlink implements kernfs.Inode.Readlink.
+func (s *namespaceSymlink) Readlink(ctx context.Context, mnt *vfs.Mount) (string, error) {
 	if err := checkTaskState(s.task); err != nil {
 		return "", err
 	}
-	return s.StaticSymlink.Readlink(ctx)
+	return s.StaticSymlink.Readlink(ctx, mnt)
 }
 
-// Getlink implements Inode.Getlink.
+// Getlink implements kernfs.Inode.Getlink.
 func (s *namespaceSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
 	if err := checkTaskState(s.task); err != nil {
 		return vfs.VirtualDentry{}, "", err
 	}
 
 	// Create a synthetic inode to represent the namespace.
+	fs := mnt.Filesystem().Impl().(*filesystem)
+	nsInode := &namespaceInode{}
+	nsInode.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0444)
 	dentry := &kernfs.Dentry{}
-	dentry.Init(&namespaceInode{})
+	dentry.Init(&fs.Filesystem, nsInode)
 	vd := vfs.MakeVirtualDentry(mnt, dentry.VFSDentry())
-	vd.IncRef()
-	dentry.DecRef(ctx)
+	// Only IncRef vd.Mount() because vd.Dentry() already holds a ref of 1.
+	mnt.IncRef()
 	return vd, "", nil
 }
 
 // namespaceInode is a synthetic inode created to represent a namespace in
 // /proc/[pid]/ns/*.
+//
+// +stateify savable
 type namespaceInode struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeNotDirectory
@@ -843,19 +899,19 @@ type namespaceInode struct {
 var _ kernfs.Inode = (*namespaceInode)(nil)
 
 // Init initializes a namespace inode.
-func (i *namespaceInode) Init(creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
+func (i *namespaceInode) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode) {
 	if perm&^linux.PermissionsMask != 0 {
 		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
 	}
-	i.InodeAttrs.Init(creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
+	i.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeRegular|perm)
 }
 
-// Open implements Inode.Open.
-func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+// Open implements kernfs.Inode.Open.
+func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	fd := &namespaceFD{inode: i}
 	i.IncRef()
 	fd.LockFD.Init(&i.locks)
-	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{}); err != nil {
 		return nil, err
 	}
 	return &fd.vfsfd, nil
@@ -863,6 +919,8 @@ func (i *namespaceInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *
 
 // namespace FD is a synthetic file that represents a namespace in
 // /proc/[pid]/ns/*.
+//
+// +stateify savable
 type namespaceFD struct {
 	vfs.FileDescriptionDefaultImpl
 	vfs.LockFD
@@ -873,20 +931,20 @@ type namespaceFD struct {
 
 var _ vfs.FileDescriptionImpl = (*namespaceFD)(nil)
 
-// Stat implements FileDescriptionImpl.
+// Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *namespaceFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
 	return fd.inode.Stat(ctx, vfs, opts)
 }
 
-// SetStat implements FileDescriptionImpl.
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
 func (fd *namespaceFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
 	vfs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
 	creds := auth.CredentialsFromContext(ctx)
 	return fd.inode.SetStat(ctx, vfs, creds, opts)
 }
 
-// Release implements FileDescriptionImpl.
+// Release implements vfs.FileDescriptionImpl.Release.
 func (fd *namespaceFD) Release(ctx context.Context) {
 	fd.inode.DecRef(ctx)
 }
diff --git a/pkg/sentry/fsimpl/proc/task_net.go b/pkg/sentry/fsimpl/proc/task_net.go
index a4c884bf9..5a9ee111f 100644
--- a/pkg/sentry/fsimpl/proc/task_net.go
+++ b/pkg/sentry/fsimpl/proc/task_net.go
@@ -37,12 +37,12 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
-func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
+func (fs *filesystem) newTaskNetDir(task *kernel.Task) kernfs.Inode {
 	k := task.Kernel()
 	pidns := task.PIDNamespace()
 	root := auth.NewRootCredentials(pidns.UserNamespace())
 
-	var contents map[string]*kernfs.Dentry
+	var contents map[string]kernfs.Inode
 	if stack := task.NetworkNamespace().Stack(); stack != nil {
 		const (
 			arp       = "IP address       HW type     Flags       HW address            Mask     Device\n"
@@ -56,34 +56,34 @@ func (fs *filesystem) newTaskNetDir(task *kernel.Task) *kernfs.Dentry {
 
 		// TODO(gvisor.dev/issue/1833): Make sure file contents reflect the task
 		// network namespace.
-		contents = map[string]*kernfs.Dentry{
-			"dev":  fs.newDentry(root, fs.NextIno(), 0444, &netDevData{stack: stack}),
-			"snmp": fs.newDentry(root, fs.NextIno(), 0444, &netSnmpData{stack: stack}),
+		contents = map[string]kernfs.Inode{
+			"dev":  fs.newInode(task, root, 0444, &netDevData{stack: stack}),
+			"snmp": fs.newInode(task, root, 0444, &netSnmpData{stack: stack}),
 
 			// The following files are simple stubs until they are implemented in
 			// netstack, if the file contains a header the stub is just the header
 			// otherwise it is an empty file.
-			"arp":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(arp)),
-			"netlink":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(netlink)),
-			"netstat":   fs.newDentry(root, fs.NextIno(), 0444, &netStatData{}),
-			"packet":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(packet)),
-			"protocols": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(protocols)),
+			"arp":       fs.newInode(task, root, 0444, newStaticFile(arp)),
+			"netlink":   fs.newInode(task, root, 0444, newStaticFile(netlink)),
+			"netstat":   fs.newInode(task, root, 0444, &netStatData{}),
+			"packet":    fs.newInode(task, root, 0444, newStaticFile(packet)),
+			"protocols": fs.newInode(task, root, 0444, newStaticFile(protocols)),
 
 			// Linux sets psched values to: nsec per usec, psched tick in ns, 1000000,
 			// high res timer ticks per sec (ClockGetres returns 1ns resolution).
-			"psched": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(psched)),
-			"ptype":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(ptype)),
-			"route":  fs.newDentry(root, fs.NextIno(), 0444, &netRouteData{stack: stack}),
-			"tcp":    fs.newDentry(root, fs.NextIno(), 0444, &netTCPData{kernel: k}),
-			"udp":    fs.newDentry(root, fs.NextIno(), 0444, &netUDPData{kernel: k}),
-			"unix":   fs.newDentry(root, fs.NextIno(), 0444, &netUnixData{kernel: k}),
+			"psched": fs.newInode(task, root, 0444, newStaticFile(psched)),
+			"ptype":  fs.newInode(task, root, 0444, newStaticFile(ptype)),
+			"route":  fs.newInode(task, root, 0444, &netRouteData{stack: stack}),
+			"tcp":    fs.newInode(task, root, 0444, &netTCPData{kernel: k}),
+			"udp":    fs.newInode(task, root, 0444, &netUDPData{kernel: k}),
+			"unix":   fs.newInode(task, root, 0444, &netUnixData{kernel: k}),
 		}
 
 		if stack.SupportsIPv6() {
-			contents["if_inet6"] = fs.newDentry(root, fs.NextIno(), 0444, &ifinet6{stack: stack})
-			contents["ipv6_route"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(""))
-			contents["tcp6"] = fs.newDentry(root, fs.NextIno(), 0444, &netTCP6Data{kernel: k})
-			contents["udp6"] = fs.newDentry(root, fs.NextIno(), 0444, newStaticFile(upd6))
+			contents["if_inet6"] = fs.newInode(task, root, 0444, &ifinet6{stack: stack})
+			contents["ipv6_route"] = fs.newInode(task, root, 0444, newStaticFile(""))
+			contents["tcp6"] = fs.newInode(task, root, 0444, &netTCP6Data{kernel: k})
+			contents["udp6"] = fs.newInode(task, root, 0444, newStaticFile(upd6))
 		}
 	}
 
@@ -262,7 +262,7 @@ func (n *netUnixData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 		// For now, we always redact this pointer.
 		fmt.Fprintf(buf, "%#016p: %08X %08X %08X %04X %02X %8d",
 			(*unix.SocketOperations)(nil), // Num, pointer to kernel socket struct.
-			s.Refs()-1,                    // RefCount, don't count our own ref.
+			s.ReadRefs()-1,                // RefCount, don't count our own ref.
 			0,                             // Protocol, always 0 for UDS.
 			sockFlags,                     // Flags.
 			sops.Endpoint().Type(),        // Type.
@@ -430,7 +430,7 @@ func commonGenerateTCP(ctx context.Context, buf *bytes.Buffer, k *kernel.Kernel,
 
 		// Field: refcount. Don't count the ref we obtain while deferencing
 		// the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+		fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -589,7 +589,7 @@ func (d *netUDPData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 
 		// Field: ref; reference count on the socket inode. Don't count the ref
 		// we obtain while deferencing the weakref to this socket.
-		fmt.Fprintf(buf, "%d ", s.Refs()-1)
+		fmt.Fprintf(buf, "%d ", s.ReadRefs()-1)
 
 		// Field: Socket struct address. Redacted due to the same reason as
 		// the 'Num' field in /proc/net/unix, see netUnix.ReadSeqFileData.
@@ -616,6 +616,7 @@ type netSnmpData struct {
 
 var _ dynamicInode = (*netSnmpData)(nil)
 
+// +stateify savable
 type snmpLine struct {
 	prefix string
 	header string
@@ -660,7 +661,7 @@ func sprintSlice(s []uint64) string {
 	return r[1 : len(r)-1] // Remove "[]" introduced by fmt of slice.
 }
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *netSnmpData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	types := []interface{}{
 		&inet.StatSNMPIP{},
@@ -709,7 +710,7 @@ type netRouteData struct {
 
 var _ dynamicInode = (*netRouteData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
 func (d *netRouteData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	fmt.Fprintf(buf, "%-127s\n", "Iface\tDestination\tGateway\tFlags\tRefCnt\tUse\tMetric\tMask\tMTU\tWindow\tIRTT")
@@ -773,7 +774,7 @@ type netStatData struct {
 
 var _ dynamicInode = (*netStatData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 // See Linux's net/ipv4/fib_trie.c:fib_route_seq_show.
 func (d *netStatData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	buf.WriteString("TcpExt: SyncookiesSent SyncookiesRecv SyncookiesFailed " +
diff --git a/pkg/sentry/fsimpl/proc/tasks.go b/pkg/sentry/fsimpl/proc/tasks.go
index 6d2b90a8b..b81ea14bf 100644
--- a/pkg/sentry/fsimpl/proc/tasks.go
+++ b/pkg/sentry/fsimpl/proc/tasks.go
@@ -37,11 +37,14 @@ const (
 //
 // +stateify savable
 type tasksInode struct {
-	kernfs.InodeNotSymlink
-	kernfs.InodeDirectoryNoNewChildren
+	implStatFS
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
+	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeNotSymlink
+	kernfs.InodeTemporary // This holds no meaning as this inode can't be Looked up and is always valid.
 	kernfs.OrderedChildren
-	kernfs.AlwaysValid
+	tasksInodeRefs
 
 	locks vfs.FileLocks
 
@@ -50,8 +53,6 @@ type tasksInode struct {
 
 	// '/proc/self' and '/proc/thread-self' have custom directory offsets in
 	// Linux. So handle them outside of OrderedChildren.
-	selfSymlink       *vfs.Dentry
-	threadSelfSymlink *vfs.Dentry
 
 	// cgroupControllers is a map of controller name to directory in the
 	// cgroup hierarchy. These controllers are immutable and will be listed
@@ -61,51 +62,53 @@ type tasksInode struct {
 
 var _ kernfs.Inode = (*tasksInode)(nil)
 
-func (fs *filesystem) newTasksInode(k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) (*tasksInode, *kernfs.Dentry) {
+func (fs *filesystem) newTasksInode(ctx context.Context, k *kernel.Kernel, pidns *kernel.PIDNamespace, cgroupControllers map[string]string) *tasksInode {
 	root := auth.NewRootCredentials(pidns.UserNamespace())
-	contents := map[string]*kernfs.Dentry{
-		"cpuinfo":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFileSetStat(cpuInfoData(k))),
-		"filesystems": fs.newDentry(root, fs.NextIno(), 0444, &filesystemsData{}),
-		"loadavg":     fs.newDentry(root, fs.NextIno(), 0444, &loadavgData{}),
-		"sys":         fs.newSysDir(root, k),
-		"meminfo":     fs.newDentry(root, fs.NextIno(), 0444, &meminfoData{}),
-		"mounts":      kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
-		"net":         kernfs.NewStaticSymlink(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
-		"stat":        fs.newDentry(root, fs.NextIno(), 0444, &statData{}),
-		"uptime":      fs.newDentry(root, fs.NextIno(), 0444, &uptimeData{}),
-		"version":     fs.newDentry(root, fs.NextIno(), 0444, &versionData{}),
+	contents := map[string]kernfs.Inode{
+		"cpuinfo":     fs.newInode(ctx, root, 0444, newStaticFileSetStat(cpuInfoData(k))),
+		"filesystems": fs.newInode(ctx, root, 0444, &filesystemsData{}),
+		"loadavg":     fs.newInode(ctx, root, 0444, &loadavgData{}),
+		"sys":         fs.newSysDir(ctx, root, k),
+		"meminfo":     fs.newInode(ctx, root, 0444, &meminfoData{}),
+		"mounts":      kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/mounts"),
+		"net":         kernfs.NewStaticSymlink(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "self/net"),
+		"stat":        fs.newInode(ctx, root, 0444, &statData{}),
+		"uptime":      fs.newInode(ctx, root, 0444, &uptimeData{}),
+		"version":     fs.newInode(ctx, root, 0444, &versionData{}),
 	}
 
 	inode := &tasksInode{
 		pidns:             pidns,
 		fs:                fs,
-		selfSymlink:       fs.newSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
-		threadSelfSymlink: fs.newThreadSelfSymlink(root, fs.NextIno(), pidns).VFSDentry(),
 		cgroupControllers: cgroupControllers,
 	}
-	inode.InodeAttrs.Init(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
-
-	dentry := &kernfs.Dentry{}
-	dentry.Init(inode)
+	inode.InodeAttrs.Init(ctx, root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0555)
+	inode.EnableLeakCheck()
 
 	inode.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	links := inode.OrderedChildren.Populate(dentry, contents)
+	links := inode.OrderedChildren.Populate(contents)
 	inode.IncLinks(links)
 
-	return inode, dentry
+	return inode
 }
 
-// Lookup implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, error) {
-	// Try to lookup a corresponding task.
+// Lookup implements kernfs.inodeDirectory.Lookup.
+func (i *tasksInode) Lookup(ctx context.Context, name string) (kernfs.Inode, error) {
+	// Check if a static entry was looked up.
+	if d, err := i.OrderedChildren.Lookup(ctx, name); err == nil {
+		return d, nil
+	}
+
+	// Not a static entry. Try to lookup a corresponding task.
 	tid, err := strconv.ParseUint(name, 10, 64)
 	if err != nil {
+		root := auth.NewRootCredentials(i.pidns.UserNamespace())
 		// If it failed to parse, check if it's one of the special handled files.
 		switch name {
 		case selfName:
-			return i.selfSymlink, nil
+			return i.newSelfSymlink(ctx, root), nil
 		case threadSelfName:
-			return i.threadSelfSymlink, nil
+			return i.newThreadSelfSymlink(ctx, root), nil
 		}
 		return nil, syserror.ENOENT
 	}
@@ -115,12 +118,11 @@ func (i *tasksInode) Lookup(ctx context.Context, name string) (*vfs.Dentry, erro
 		return nil, syserror.ENOENT
 	}
 
-	taskDentry := i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
-	return taskDentry.VFSDentry(), nil
+	return i.fs.newTaskInode(task, i.pidns, true, i.cgroupControllers)
 }
 
-// IterDirents implements kernfs.inodeDynamicLookup.
-func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
+// IterDirents implements kernfs.inodeDirectory.IterDirents.
+func (i *tasksInode) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, _ int64) (int64, error) {
 	// fs/proc/internal.h: #define FIRST_PROCESS_ENTRY 256
 	const FIRST_PROCESS_ENTRY = 256
 
@@ -197,9 +199,11 @@ func (i *tasksInode) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback
 	return maxTaskID, nil
 }
 
-// Open implements kernfs.Inode.
-func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &i.OrderedChildren, &i.locks, &opts)
+// Open implements kernfs.Inode.Open.
+func (i *tasksInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), d, &i.OrderedChildren, &i.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndZero,
+	})
 	if err != nil {
 		return nil, err
 	}
@@ -224,9 +228,16 @@ func (i *tasksInode) Stat(ctx context.Context, vsfs *vfs.Filesystem, opts vfs.St
 	return stat, nil
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (i *tasksInode) DecRef(ctx context.Context) {
+	i.tasksInodeRefs.DecRef(func() { i.Destroy(ctx) })
+}
+
 // staticFileSetStat implements a special static file that allows inode
 // attributes to be set. This is to support /proc files that are readonly, but
 // allow attributes to be set.
+//
+// +stateify savable
 type staticFileSetStat struct {
 	dynamicBytesFileSetAttr
 	vfs.StaticData
diff --git a/pkg/sentry/fsimpl/proc/tasks_files.go b/pkg/sentry/fsimpl/proc/tasks_files.go
index 7d8983aa5..01b7a6678 100644
--- a/pkg/sentry/fsimpl/proc/tasks_files.go
+++ b/pkg/sentry/fsimpl/proc/tasks_files.go
@@ -31,7 +31,9 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type selfSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -41,16 +43,13 @@ type selfSymlink struct {
 
 var _ kernfs.Inode = (*selfSymlink)(nil)
 
-func (fs *filesystem) newSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
-	inode := &selfSymlink{pidns: pidns}
-	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+func (i *tasksInode) newSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+	inode := &selfSymlink{pidns: i.pidns}
+	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+	return inode
 }
 
-func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *selfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
 	t := kernel.TaskFromContext(ctx)
 	if t == nil {
 		// Who is reading this link?
@@ -63,17 +62,19 @@ func (s *selfSymlink) Readlink(ctx context.Context) (string, error) {
 	return strconv.FormatUint(uint64(tgid), 10), nil
 }
 
-func (s *selfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
-	target, err := s.Readlink(ctx)
+func (s *selfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx, mnt)
 	return vfs.VirtualDentry{}, target, err
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*selfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
+// +stateify savable
 type threadSelfSymlink struct {
+	implStatFS
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
 	kernfs.InodeSymlink
@@ -83,16 +84,13 @@ type threadSelfSymlink struct {
 
 var _ kernfs.Inode = (*threadSelfSymlink)(nil)
 
-func (fs *filesystem) newThreadSelfSymlink(creds *auth.Credentials, ino uint64, pidns *kernel.PIDNamespace) *kernfs.Dentry {
-	inode := &threadSelfSymlink{pidns: pidns}
-	inode.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, ino, linux.ModeSymlink|0777)
-
-	d := &kernfs.Dentry{}
-	d.Init(inode)
-	return d
+func (i *tasksInode) newThreadSelfSymlink(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+	inode := &threadSelfSymlink{pidns: i.pidns}
+	inode.Init(ctx, creds, linux.UNNAMED_MAJOR, i.fs.devMinor, i.fs.NextIno(), linux.ModeSymlink|0777)
+	return inode
 }
 
-func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
+func (s *threadSelfSymlink) Readlink(ctx context.Context, _ *vfs.Mount) (string, error) {
 	t := kernel.TaskFromContext(ctx)
 	if t == nil {
 		// Who is reading this link?
@@ -106,12 +104,12 @@ func (s *threadSelfSymlink) Readlink(ctx context.Context) (string, error) {
 	return fmt.Sprintf("%d/task/%d", tgid, tid), nil
 }
 
-func (s *threadSelfSymlink) Getlink(ctx context.Context, _ *vfs.Mount) (vfs.VirtualDentry, string, error) {
-	target, err := s.Readlink(ctx)
+func (s *threadSelfSymlink) Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error) {
+	target, err := s.Readlink(ctx, mnt)
 	return vfs.VirtualDentry{}, target, err
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
@@ -119,16 +117,20 @@ func (*threadSelfSymlink) SetStat(context.Context, *vfs.Filesystem, *auth.Creden
 // dynamicBytesFileSetAttr implements a special file that allows inode
 // attributes to be set. This is to support /proc files that are readonly, but
 // allow attributes to be set.
+//
+// +stateify savable
 type dynamicBytesFileSetAttr struct {
 	kernfs.DynamicBytesFile
 }
 
-// SetStat implements Inode.SetStat.
+// SetStat implements kernfs.Inode.SetStat.
 func (d *dynamicBytesFileSetAttr) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
 	return d.DynamicBytesFile.InodeAttrs.SetStat(ctx, fs, creds, opts)
 }
 
 // cpuStats contains the breakdown of CPU time for /proc/stat.
+//
+// +stateify savable
 type cpuStats struct {
 	// user is time spent in userspace tasks with non-positive niceness.
 	user uint64
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys.go b/pkg/sentry/fsimpl/proc/tasks_sys.go
index 6435385ef..7c7afdcfa 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys.go
@@ -27,9 +27,11 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type tcpMemDir int
 
 const (
@@ -38,92 +40,93 @@ const (
 )
 
 // newSysDir returns the dentry corresponding to /proc/sys directory.
-func (fs *filesystem) newSysDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
-	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-		"kernel": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"hostname": fs.newDentry(root, fs.NextIno(), 0444, &hostnameData{}),
-			"shmall":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMALL)),
-			"shmmax":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMAX)),
-			"shmmni":   fs.newDentry(root, fs.NextIno(), 0444, shmData(linux.SHMMNI)),
+func (fs *filesystem) newSysDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+	return fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+		"kernel": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+			"hostname": fs.newInode(ctx, root, 0444, &hostnameData{}),
+			"shmall":   fs.newInode(ctx, root, 0444, shmData(linux.SHMALL)),
+			"shmmax":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMAX)),
+			"shmmni":   fs.newInode(ctx, root, 0444, shmData(linux.SHMMNI)),
 		}),
-		"vm": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-			"mmap_min_addr":     fs.newDentry(root, fs.NextIno(), 0444, &mmapMinAddrData{k: k}),
-			"overcommit_memory": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0\n")),
+		"vm": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+			"mmap_min_addr":     fs.newInode(ctx, root, 0444, &mmapMinAddrData{k: k}),
+			"overcommit_memory": fs.newInode(ctx, root, 0444, newStaticFile("0\n")),
 		}),
-		"net": fs.newSysNetDir(root, k),
+		"net": fs.newSysNetDir(ctx, root, k),
 	})
 }
 
 // newSysNetDir returns the dentry corresponding to /proc/sys/net directory.
-func (fs *filesystem) newSysNetDir(root *auth.Credentials, k *kernel.Kernel) *kernfs.Dentry {
-	var contents map[string]*kernfs.Dentry
+func (fs *filesystem) newSysNetDir(ctx context.Context, root *auth.Credentials, k *kernel.Kernel) kernfs.Inode {
+	var contents map[string]kernfs.Inode
 
 	// TODO(gvisor.dev/issue/1833): Support for using the network stack in the
 	// network namespace of the calling process.
 	if stack := k.RootNetworkNamespace().Stack(); stack != nil {
-		contents = map[string]*kernfs.Dentry{
-			"ipv4": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-				"tcp_recovery": fs.newDentry(root, fs.NextIno(), 0644, &tcpRecoveryData{stack: stack}),
-				"tcp_rmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
-				"tcp_sack":     fs.newDentry(root, fs.NextIno(), 0644, &tcpSackData{stack: stack}),
-				"tcp_wmem":     fs.newDentry(root, fs.NextIno(), 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+		contents = map[string]kernfs.Inode{
+			"ipv4": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+				"tcp_recovery": fs.newInode(ctx, root, 0644, &tcpRecoveryData{stack: stack}),
+				"tcp_rmem":     fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpRMem}),
+				"tcp_sack":     fs.newInode(ctx, root, 0644, &tcpSackData{stack: stack}),
+				"tcp_wmem":     fs.newInode(ctx, root, 0644, &tcpMemData{stack: stack, dir: tcpWMem}),
+				"ip_forward":   fs.newInode(ctx, root, 0444, &ipForwarding{stack: stack}),
 
 				// The following files are simple stubs until they are implemented in
 				// netstack, most of these files are configuration related. We use the
 				// value closest to the actual netstack behavior or any empty file, all
 				// of these files will have mode 0444 (read-only for all users).
-				"ip_local_port_range":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("16000   65535")),
-				"ip_local_reserved_ports": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"ipfrag_time":             fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("30")),
-				"ip_nonlocal_bind":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"ip_no_pmtu_disc":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"ip_local_port_range":     fs.newInode(ctx, root, 0444, newStaticFile("16000   65535")),
+				"ip_local_reserved_ports": fs.newInode(ctx, root, 0444, newStaticFile("")),
+				"ipfrag_time":             fs.newInode(ctx, root, 0444, newStaticFile("30")),
+				"ip_nonlocal_bind":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"ip_no_pmtu_disc":         fs.newInode(ctx, root, 0444, newStaticFile("1")),
 
 				// tcp_allowed_congestion_control tell the user what they are able to
 				// do as an unprivledged process so we leave it empty.
-				"tcp_allowed_congestion_control":   fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"tcp_available_congestion_control": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
-				"tcp_congestion_control":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("reno")),
+				"tcp_allowed_congestion_control":   fs.newInode(ctx, root, 0444, newStaticFile("")),
+				"tcp_available_congestion_control": fs.newInode(ctx, root, 0444, newStaticFile("reno")),
+				"tcp_congestion_control":           fs.newInode(ctx, root, 0444, newStaticFile("reno")),
 
 				// Many of the following stub files are features netstack doesn't
 				// support. The unsupported features return "0" to indicate they are
 				// disabled.
-				"tcp_base_mss":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1280")),
-				"tcp_dsack":                 fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_early_retrans":         fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fack":                  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fastopen":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_fastopen_key":          fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("")),
-				"tcp_invalid_ratelimit":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_intvl":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_probes":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_keepalive_time":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("7200")),
-				"tcp_mtu_probing":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_no_metrics_save":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
-				"tcp_probe_interval":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_probe_threshold":       fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"tcp_retries1":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
-				"tcp_retries2":              fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("15")),
-				"tcp_rfc1337":               fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
-				"tcp_slow_start_after_idle": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
-				"tcp_synack_retries":        fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
-				"tcp_syn_retries":           fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("3")),
-				"tcp_timestamps":            fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("1")),
+				"tcp_base_mss":              fs.newInode(ctx, root, 0444, newStaticFile("1280")),
+				"tcp_dsack":                 fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_early_retrans":         fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_fack":                  fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_fastopen":              fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_fastopen_key":          fs.newInode(ctx, root, 0444, newStaticFile("")),
+				"tcp_invalid_ratelimit":     fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_keepalive_intvl":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_keepalive_probes":      fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_keepalive_time":        fs.newInode(ctx, root, 0444, newStaticFile("7200")),
+				"tcp_mtu_probing":           fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_no_metrics_save":       fs.newInode(ctx, root, 0444, newStaticFile("1")),
+				"tcp_probe_interval":        fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_probe_threshold":       fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"tcp_retries1":              fs.newInode(ctx, root, 0444, newStaticFile("3")),
+				"tcp_retries2":              fs.newInode(ctx, root, 0444, newStaticFile("15")),
+				"tcp_rfc1337":               fs.newInode(ctx, root, 0444, newStaticFile("1")),
+				"tcp_slow_start_after_idle": fs.newInode(ctx, root, 0444, newStaticFile("1")),
+				"tcp_synack_retries":        fs.newInode(ctx, root, 0444, newStaticFile("5")),
+				"tcp_syn_retries":           fs.newInode(ctx, root, 0444, newStaticFile("3")),
+				"tcp_timestamps":            fs.newInode(ctx, root, 0444, newStaticFile("1")),
 			}),
-			"core": kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, map[string]*kernfs.Dentry{
-				"default_qdisc": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("pfifo_fast")),
-				"message_burst": fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("10")),
-				"message_cost":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("5")),
-				"optmem_max":    fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("0")),
-				"rmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
-				"rmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
-				"somaxconn":     fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("128")),
-				"wmem_default":  fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
-				"wmem_max":      fs.newDentry(root, fs.NextIno(), 0444, newStaticFile("212992")),
+			"core": fs.newStaticDir(ctx, root, map[string]kernfs.Inode{
+				"default_qdisc": fs.newInode(ctx, root, 0444, newStaticFile("pfifo_fast")),
+				"message_burst": fs.newInode(ctx, root, 0444, newStaticFile("10")),
+				"message_cost":  fs.newInode(ctx, root, 0444, newStaticFile("5")),
+				"optmem_max":    fs.newInode(ctx, root, 0444, newStaticFile("0")),
+				"rmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+				"rmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+				"somaxconn":     fs.newInode(ctx, root, 0444, newStaticFile("128")),
+				"wmem_default":  fs.newInode(ctx, root, 0444, newStaticFile("212992")),
+				"wmem_max":      fs.newInode(ctx, root, 0444, newStaticFile("212992")),
 			}),
 		}
 	}
 
-	return kernfs.NewStaticDir(root, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), 0555, contents)
+	return fs.newStaticDir(ctx, root, contents)
 }
 
 // mmapMinAddrData implements vfs.DynamicBytesSource for
@@ -174,7 +177,7 @@ type tcpSackData struct {
 
 var _ vfs.WritableDynamicBytesSource = (*tcpSackData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *tcpSackData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	if d.enabled == nil {
 		sack, err := d.stack.TCPSACKEnabled()
@@ -232,7 +235,7 @@ type tcpRecoveryData struct {
 
 var _ vfs.WritableDynamicBytesSource = (*tcpRecoveryData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *tcpRecoveryData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	recovery, err := d.stack.TCPRecovery()
 	if err != nil {
@@ -284,7 +287,7 @@ type tcpMemData struct {
 
 var _ vfs.WritableDynamicBytesSource = (*tcpMemData)(nil)
 
-// Generate implements vfs.DynamicBytesSource.
+// Generate implements vfs.DynamicBytesSource.Generate.
 func (d *tcpMemData) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	d.mu.Lock()
 	defer d.mu.Unlock()
@@ -354,3 +357,63 @@ func (d *tcpMemData) writeSizeLocked(size inet.TCPBufferSize) error {
 		panic(fmt.Sprintf("unknown tcpMemFile type: %v", d.dir))
 	}
 }
+
+// ipForwarding implements vfs.WritableDynamicBytesSource for
+// /proc/sys/net/ipv4/ip_forwarding.
+//
+// +stateify savable
+type ipForwarding struct {
+	kernfs.DynamicBytesFile
+
+	stack   inet.Stack `state:"wait"`
+	enabled *bool
+}
+
+var _ vfs.WritableDynamicBytesSource = (*ipForwarding)(nil)
+
+// Generate implements vfs.DynamicBytesSource.Generate.
+func (ipf *ipForwarding) Generate(ctx context.Context, buf *bytes.Buffer) error {
+	if ipf.enabled == nil {
+		enabled := ipf.stack.Forwarding(ipv4.ProtocolNumber)
+		ipf.enabled = &enabled
+	}
+
+	val := "0\n"
+	if *ipf.enabled {
+		// Technically, this is not quite compatible with Linux. Linux stores these
+		// as an integer, so if you write "2" into tcp_sack, you should get 2 back.
+		// Tough luck.
+		val = "1\n"
+	}
+	buf.WriteString(val)
+
+	return nil
+}
+
+// Write implements vfs.WritableDynamicBytesSource.Write.
+func (ipf *ipForwarding) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
+	if offset != 0 {
+		// No need to handle partial writes thus far.
+		return 0, syserror.EINVAL
+	}
+	if src.NumBytes() == 0 {
+		return 0, nil
+	}
+
+	// Limit input size so as not to impact performance if input size is large.
+	src = src.TakeFirst(usermem.PageSize - 1)
+
+	var v int32
+	n, err := usermem.CopyInt32StringInVec(ctx, src.IO, src.Addrs, &v, src.Opts)
+	if err != nil {
+		return 0, err
+	}
+	if ipf.enabled == nil {
+		ipf.enabled = new(bool)
+	}
+	*ipf.enabled = v != 0
+	if err := ipf.stack.SetForwarding(ipv4.ProtocolNumber, *ipf.enabled); err != nil {
+		return 0, err
+	}
+	return n, nil
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_sys_test.go b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
index be54897bb..6cee22823 100644
--- a/pkg/sentry/fsimpl/proc/tasks_sys_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_sys_test.go
@@ -20,8 +20,10 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/inet"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 func newIPv6TestStack() *inet.TestStack {
@@ -76,3 +78,72 @@ func TestIfinet6(t *testing.T) {
 		t.Errorf("Got n.contents() = %v, want = %v", got, want)
 	}
 }
+
+// TestIPForwarding tests the implementation of
+// /proc/sys/net/ipv4/ip_forwarding
+func TestConfigureIPForwarding(t *testing.T) {
+	ctx := context.Background()
+	s := inet.NewTestStack()
+
+	var cases = []struct {
+		comment string
+		initial bool
+		str     string
+		final   bool
+	}{
+		{
+			comment: `Forwarding is disabled; write 1 and enable forwarding`,
+			initial: false,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is disabled; write 0 and disable forwarding`,
+			initial: false,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is enabled; write 1 and enable forwarding`,
+			initial: true,
+			str:     "1",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 0 and disable forwarding`,
+			initial: true,
+			str:     "0",
+			final:   false,
+		},
+		{
+			comment: `Forwarding is disabled; write 2404 and enable forwarding`,
+			initial: false,
+			str:     "2404",
+			final:   true,
+		},
+		{
+			comment: `Forwarding is enabled; write 2404 and enable forwarding`,
+			initial: true,
+			str:     "2404",
+			final:   true,
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.comment, func(t *testing.T) {
+			s.IPForwarding = c.initial
+
+			file := &ipForwarding{stack: s, enabled: &c.initial}
+
+			// Write the values.
+			src := usermem.BytesIOSequence([]byte(c.str))
+			if n, err := file.Write(ctx, src, 0); n != int64(len(c.str)) || err != nil {
+				t.Errorf("file.Write(ctx, nil, %q, 0) = (%d, %v); want (%d, nil)", c.str, n, err, len(c.str))
+			}
+
+			// Read the values from the stack and check them.
+			if got, want := s.IPForwarding, c.final; got != want {
+				t.Errorf("s.IPForwarding incorrect; got: %v, want: %v", got, want)
+			}
+		})
+	}
+}
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 3c9297dee..2582ababd 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -67,6 +67,7 @@ var (
 	taskStaticFiles = map[string]testutil.DirentType{
 		"auxv":          linux.DT_REG,
 		"cgroup":        linux.DT_REG,
+		"cwd":           linux.DT_LNK,
 		"cmdline":       linux.DT_REG,
 		"comm":          linux.DT_REG,
 		"environ":       linux.DT_REG,
@@ -104,13 +105,16 @@ func setup(t *testing.T) *testutil.System {
 		AllowUserMount: true,
 	})
 
-	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.GetFilesystemOptions{})
+	mntns, err := k.VFS().NewMountNamespace(ctx, creds, "", tmpfs.Name, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("NewMountNamespace(): %v", err)
 	}
+	root := mntns.Root()
+	root.IncRef()
+	defer root.DecRef(ctx)
 	pop := &vfs.PathOperation{
-		Root:  mntns.Root(),
-		Start: mntns.Root(),
+		Root:  root,
+		Start: root,
 		Path:  fspath.Parse("/proc"),
 	}
 	if err := k.VFS().MkdirAt(ctx, creds, pop, &vfs.MkdirOptions{Mode: 0777}); err != nil {
@@ -118,8 +122,8 @@ func setup(t *testing.T) *testutil.System {
 	}
 
 	pop = &vfs.PathOperation{
-		Root:  mntns.Root(),
-		Start: mntns.Root(),
+		Root:  root,
+		Start: root,
 		Path:  fspath.Parse("/proc"),
 	}
 	mntOpts := &vfs.MountOptions{
@@ -132,7 +136,7 @@ func setup(t *testing.T) *testutil.System {
 			},
 		},
 	}
-	if err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
+	if _, err := k.VFS().MountAt(ctx, creds, "", pop, Name, mntOpts); err != nil {
 		t.Fatalf("MountAt(/proc): %v", err)
 	}
 	return testutil.NewSystem(ctx, t, k.VFS(), mntns)
diff --git a/pkg/sentry/fsimpl/signalfd/BUILD b/pkg/sentry/fsimpl/signalfd/BUILD
index 067c1657f..adb610213 100644
--- a/pkg/sentry/fsimpl/signalfd/BUILD
+++ b/pkg/sentry/fsimpl/signalfd/BUILD
@@ -8,7 +8,6 @@ go_library(
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
-        "//pkg/binary",
         "//pkg/context",
         "//pkg/sentry/kernel",
         "//pkg/sentry/vfs",
diff --git a/pkg/sentry/fsimpl/signalfd/signalfd.go b/pkg/sentry/fsimpl/signalfd/signalfd.go
index 6297e1df4..10f1452ef 100644
--- a/pkg/sentry/fsimpl/signalfd/signalfd.go
+++ b/pkg/sentry/fsimpl/signalfd/signalfd.go
@@ -16,7 +16,6 @@ package signalfd
 
 import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/binary"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -26,7 +25,9 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// SignalFileDescription implements FileDescriptionImpl for signal fds.
+// SignalFileDescription implements vfs.FileDescriptionImpl for signal fds.
+//
+// +stateify savable
 type SignalFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -43,7 +44,7 @@ type SignalFileDescription struct {
 	target *kernel.Task
 
 	// mu protects mask.
-	mu sync.Mutex
+	mu sync.Mutex `state:"nosave"`
 
 	// mask is the signal mask. Protected by mu.
 	mask linux.SignalSet
@@ -83,7 +84,7 @@ func (sfd *SignalFileDescription) SetMask(mask linux.SignalSet) {
 	sfd.mask = mask
 }
 
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ vfs.ReadOptions) (int64, error) {
 	// Attempt to dequeue relevant signals.
 	info, err := sfd.target.Sigtimedwait(sfd.Mask(), 0)
@@ -93,8 +94,7 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen
 	}
 
 	// Copy out the signal info using the specified format.
-	var buf [128]byte
-	binary.Marshal(buf[:0], usermem.ByteOrder, &linux.SignalfdSiginfo{
+	infoNative := linux.SignalfdSiginfo{
 		Signo:   uint32(info.Signo),
 		Errno:   info.Errno,
 		Code:    info.Code,
@@ -103,9 +103,13 @@ func (sfd *SignalFileDescription) Read(ctx context.Context, dst usermem.IOSequen
 		Status:  info.Status(),
 		Overrun: uint32(info.Overrun()),
 		Addr:    info.Addr(),
-	})
-	n, err := dst.CopyOut(ctx, buf[:])
-	return int64(n), err
+	}
+	n, err := infoNative.WriteTo(dst.Writer(ctx))
+	if err == usermem.ErrEndOfIOSequence {
+		// Partial copy-out ok.
+		err = nil
+	}
+	return n, err
 }
 
 // Readiness implements waiter.Waitable.Readiness.
@@ -132,5 +136,5 @@ func (sfd *SignalFileDescription) EventUnregister(entry *waiter.Entry) {
 	sfd.target.SignalUnregister(entry)
 }
 
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
 func (sfd *SignalFileDescription) Release(context.Context) {}
diff --git a/pkg/sentry/fsimpl/sockfs/sockfs.go b/pkg/sentry/fsimpl/sockfs/sockfs.go
index c61818ff6..fda1fa942 100644
--- a/pkg/sentry/fsimpl/sockfs/sockfs.go
+++ b/pkg/sentry/fsimpl/sockfs/sockfs.go
@@ -28,14 +28,16 @@ import (
 )
 
 // filesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type filesystemType struct{}
 
-// GetFilesystem implements FilesystemType.GetFilesystem.
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType filesystemType) GetFilesystem(_ context.Context, vfsObj *vfs.VirtualFilesystem, _ *auth.Credentials, _ string, _ vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	panic("sockfs.filesystemType.GetFilesystem should never be called")
 }
 
-// Name implements FilesystemType.Name.
+// Name implements vfs.FilesystemType.Name.
 //
 // Note that registering sockfs is unnecessary, except for the fact that it
 // will not show up under /proc/filesystems as a result. This is a very minor
@@ -44,6 +46,10 @@ func (filesystemType) Name() string {
 	return "sockfs"
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (filesystemType) Release(ctx context.Context) {}
+
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -80,30 +86,37 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 }
 
 // inode implements kernfs.Inode.
+//
+// +stateify savable
 type inode struct {
-	kernfs.InodeNotDirectory
-	kernfs.InodeNotSymlink
 	kernfs.InodeAttrs
 	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
 }
 
 // Open implements kernfs.Inode.Open.
-func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+func (i *inode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	return nil, syserror.ENXIO
 }
 
+// StatFS implements kernfs.Inode.StatFS.
+func (i *inode) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SOCKFS_MAGIC), nil
+}
+
 // NewDentry constructs and returns a sockfs dentry.
 //
 // Preconditions: mnt.Filesystem() must have been returned by NewFilesystem().
-func NewDentry(creds *auth.Credentials, mnt *vfs.Mount) *vfs.Dentry {
+func NewDentry(ctx context.Context, mnt *vfs.Mount) *vfs.Dentry {
 	fs := mnt.Filesystem().Impl().(*filesystem)
 
 	// File mode matches net/socket.c:sock_alloc.
 	filemode := linux.FileMode(linux.S_IFSOCK | 0600)
 	i := &inode{}
-	i.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
+	i.InodeAttrs.Init(ctx, auth.CredentialsFromContext(ctx), linux.UNNAMED_MAJOR, fs.devMinor, fs.Filesystem.NextIno(), filemode)
 
 	d := &kernfs.Dentry{}
-	d.Init(i)
+	d.Init(&fs.Filesystem, i)
 	return d.VFSDentry()
 }
diff --git a/pkg/sentry/fsimpl/sys/BUILD b/pkg/sentry/fsimpl/sys/BUILD
index 1b548ccd4..09043b572 100644
--- a/pkg/sentry/fsimpl/sys/BUILD
+++ b/pkg/sentry/fsimpl/sys/BUILD
@@ -1,21 +1,42 @@
 load("//tools:defs.bzl", "go_library", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 licenses(["notice"])
 
+go_template_instance(
+    name = "dir_refs",
+    out = "dir_refs.go",
+    package = "sys",
+    prefix = "dir",
+    template = "//pkg/refsvfs2:refs_template",
+    types = {
+        "T": "dir",
+    },
+)
+
 go_library(
     name = "sys",
     srcs = [
+        "dir_refs.go",
+        "kcov.go",
         "sys.go",
     ],
     visibility = ["//pkg/sentry:internal"],
     deps = [
         "//pkg/abi/linux",
         "//pkg/context",
+        "//pkg/coverage",
+        "//pkg/log",
+        "//pkg/refs",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fsimpl/kernfs",
         "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/memmap",
         "//pkg/sentry/vfs",
         "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
 
diff --git a/pkg/sentry/fsimpl/sys/kcov.go b/pkg/sentry/fsimpl/sys/kcov.go
new file mode 100644
index 000000000..b13f141a8
--- /dev/null
+++ b/pkg/sentry/fsimpl/sys/kcov.go
@@ -0,0 +1,118 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package sys
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+func (fs *filesystem) newKcovFile(ctx context.Context, creds *auth.Credentials) kernfs.Inode {
+	k := &kcovInode{}
+	k.InodeAttrs.Init(ctx, creds, 0, 0, fs.NextIno(), linux.S_IFREG|0600)
+	return k
+}
+
+// kcovInode implements kernfs.Inode.
+//
+// +stateify savable
+type kcovInode struct {
+	kernfs.InodeAttrs
+	kernfs.InodeNoopRefCount
+	kernfs.InodeNotDirectory
+	kernfs.InodeNotSymlink
+	implStatFS
+}
+
+func (i *kcovInode) Open(ctx context.Context, rp *vfs.ResolvingPath, d *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	k := kernel.KernelFromContext(ctx)
+	if k == nil {
+		panic("KernelFromContext returned nil")
+	}
+	fd := &kcovFD{
+		inode: i,
+		kcov:  k.NewKcov(),
+	}
+
+	if err := fd.vfsfd.Init(fd, opts.Flags, rp.Mount(), d.VFSDentry(), &vfs.FileDescriptionOptions{
+		DenyPRead:  true,
+		DenyPWrite: true,
+	}); err != nil {
+		return nil, err
+	}
+	return &fd.vfsfd, nil
+}
+
+// +stateify savable
+type kcovFD struct {
+	vfs.FileDescriptionDefaultImpl
+	vfs.NoLockFD
+
+	vfsfd vfs.FileDescription
+	inode *kcovInode
+	kcov  *kernel.Kcov
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *kcovFD) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	cmd := uint32(args[1].Int())
+	arg := args[2].Uint64()
+	switch uint32(cmd) {
+	case linux.KCOV_INIT_TRACE:
+		return 0, fd.kcov.InitTrace(arg)
+	case linux.KCOV_ENABLE:
+		return 0, fd.kcov.EnableTrace(ctx, uint8(arg))
+	case linux.KCOV_DISABLE:
+		if arg != 0 {
+			// This arg is unused; it should be 0.
+			return 0, syserror.EINVAL
+		}
+		return 0, fd.kcov.DisableTrace(ctx)
+	default:
+		return 0, syserror.ENOTTY
+	}
+}
+
+// ConfigureMmap implements vfs.FileDescriptionImpl.ConfigureMmap.
+func (fd *kcovFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.kcov.ConfigureMMap(ctx, opts)
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *kcovFD) Release(ctx context.Context) {
+	// kcov instances have reference counts in Linux, but this seems sufficient
+	// for our purposes.
+	fd.kcov.Clear(ctx)
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat.
+func (fd *kcovFD) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	creds := auth.CredentialsFromContext(ctx)
+	fs := fd.vfsfd.VirtualDentry().Mount().Filesystem()
+	return fd.inode.SetStat(ctx, fs, creds, opts)
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat.
+func (fd *kcovFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	return fd.inode.Stat(ctx, fd.vfsfd.Mount().Filesystem(), opts)
+}
diff --git a/pkg/sentry/fsimpl/sys/sys.go b/pkg/sentry/fsimpl/sys/sys.go
index 0401726b6..7d2147141 100644
--- a/pkg/sentry/fsimpl/sys/sys.go
+++ b/pkg/sentry/fsimpl/sys/sys.go
@@ -18,9 +18,11 @@ package sys
 import (
 	"bytes"
 	"fmt"
+	"strconv"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/coverage"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -28,14 +30,21 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// Name is the default filesystem name.
-const Name = "sysfs"
-const defaultSysDirMode = linux.FileMode(0755)
+const (
+	// Name is the default filesystem name.
+	Name                     = "sysfs"
+	defaultSysDirMode        = linux.FileMode(0755)
+	defaultMaxCachedDentries = uint64(1000)
+)
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	kernfs.Filesystem
 
@@ -47,6 +56,9 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	devMinor, err := vfsObj.GetAnonBlockDevMinor()
@@ -54,44 +66,73 @@ func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		return nil, nil, err
 	}
 
+	mopts := vfs.GenericParseMountOptions(opts.Data)
+	maxCachedDentries := defaultMaxCachedDentries
+	if str, ok := mopts["dentry_cache_limit"]; ok {
+		delete(mopts, "dentry_cache_limit")
+		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
+		if err != nil {
+			ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
+			return nil, nil, syserror.EINVAL
+		}
+	}
+
 	fs := &filesystem{
 		devMinor: devMinor,
 	}
+	fs.MaxCachedDentries = maxCachedDentries
 	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
 
-	root := fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
-		"block": fs.newDir(creds, defaultSysDirMode, nil),
-		"bus":   fs.newDir(creds, defaultSysDirMode, nil),
-		"class": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
-			"power_supply": fs.newDir(creds, defaultSysDirMode, nil),
+	root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+		"block": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"bus":   fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"class": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+			"power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil),
 		}),
-		"dev": fs.newDir(creds, defaultSysDirMode, nil),
-		"devices": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
-			"system": fs.newDir(creds, defaultSysDirMode, map[string]*kernfs.Dentry{
+		"dev": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"devices": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
+			"system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
 				"cpu": cpuDir(ctx, fs, creds),
 			}),
 		}),
-		"firmware": fs.newDir(creds, defaultSysDirMode, nil),
-		"fs":       fs.newDir(creds, defaultSysDirMode, nil),
-		"kernel":   fs.newDir(creds, defaultSysDirMode, nil),
-		"module":   fs.newDir(creds, defaultSysDirMode, nil),
-		"power":    fs.newDir(creds, defaultSysDirMode, nil),
+		"firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"fs":       fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"kernel":   kernelDir(ctx, fs, creds),
+		"module":   fs.newDir(ctx, creds, defaultSysDirMode, nil),
+		"power":    fs.newDir(ctx, creds, defaultSysDirMode, nil),
 	})
-	return fs.VFSFilesystem(), root.VFSDentry(), nil
+	var rootD kernfs.Dentry
+	rootD.Init(&fs.Filesystem, root)
+	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
 }
 
-func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) *kernfs.Dentry {
+func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
 	k := kernel.KernelFromContext(ctx)
 	maxCPUCores := k.ApplicationCores()
-	children := map[string]*kernfs.Dentry{
-		"online":   fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
-		"possible": fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
-		"present":  fs.newCPUFile(creds, maxCPUCores, linux.FileMode(0444)),
+	children := map[string]kernfs.Inode{
+		"online":   fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
+		"possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
+		"present":  fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
 	}
 	for i := uint(0); i < maxCPUCores; i++ {
-		children[fmt.Sprintf("cpu%d", i)] = fs.newDir(creds, linux.FileMode(0555), nil)
+		children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil)
 	}
-	return fs.newDir(creds, defaultSysDirMode, children)
+	return fs.newDir(ctx, creds, defaultSysDirMode, children)
+}
+
+func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
+	// If kcov is available, set up /sys/kernel/debug/kcov. Technically, debugfs
+	// should be mounted at debug/, but for our purposes, it is sufficient to
+	// keep it in sys.
+	var children map[string]kernfs.Inode
+	if coverage.KcovAvailable() {
+		children = map[string]kernfs.Inode{
+			"debug": fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{
+				"kcov": fs.newKcovFile(ctx, creds),
+			}),
+		}
+	}
+	return fs.newDir(ctx, creds, defaultSysDirMode, children)
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -101,46 +142,62 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // dir implements kernfs.Inode.
+//
+// +stateify savable
 type dir struct {
+	dirRefs
+	kernfs.InodeAlwaysValid
 	kernfs.InodeAttrs
-	kernfs.InodeNoDynamicLookup
 	kernfs.InodeNotSymlink
 	kernfs.InodeDirectoryNoNewChildren
+	kernfs.InodeTemporary
 	kernfs.OrderedChildren
 
 	locks vfs.FileLocks
-
-	dentry kernfs.Dentry
 }
 
-func (fs *filesystem) newDir(creds *auth.Credentials, mode linux.FileMode, contents map[string]*kernfs.Dentry) *kernfs.Dentry {
+func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
 	d := &dir{}
-	d.InodeAttrs.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
+	d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
 	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
-	d.dentry.Init(d)
-
-	d.IncLinks(d.OrderedChildren.Populate(&d.dentry, contents))
-
-	return &d.dentry
+	d.EnableLeakCheck()
+	d.IncLinks(d.OrderedChildren.Populate(contents))
+	return d
 }
 
-// SetStat implements Inode.SetStat not allowing inode attributes to be changed.
+// SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
 func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
 	return syserror.EPERM
 }
 
 // Open implements kernfs.Inode.Open.
-func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), vfsd, &d.OrderedChildren, &d.locks, &opts)
+func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
+		SeekEnd: kernfs.SeekEndStaticEntries,
+	})
 	if err != nil {
 		return nil, err
 	}
 	return fd.VFSFileDescription(), nil
 }
 
+// DecRef implements kernfs.Inode.DecRef.
+func (d *dir) DecRef(ctx context.Context) {
+	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
+}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
+}
+
 // cpuFile implements kernfs.Inode.
+//
+// +stateify savable
 type cpuFile struct {
+	implStatFS
 	kernfs.DynamicBytesFile
+
 	maxCores uint
 }
 
@@ -150,10 +207,16 @@ func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
 	return nil
 }
 
-func (fs *filesystem) newCPUFile(creds *auth.Credentials, maxCores uint, mode linux.FileMode) *kernfs.Dentry {
+func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode {
 	c := &cpuFile{maxCores: maxCores}
-	c.DynamicBytesFile.Init(creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
-	d := &kernfs.Dentry{}
-	d.Init(c)
-	return d
+	c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
+	return c
+}
+
+// +stateify savable
+type implStatFS struct{}
+
+// StatFS implements kernfs.Inode.StatFS.
+func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
+	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
 }
diff --git a/pkg/sentry/fsimpl/sys/sys_test.go b/pkg/sentry/fsimpl/sys/sys_test.go
index 9fd38b295..0a0d914cc 100644
--- a/pkg/sentry/fsimpl/sys/sys_test.go
+++ b/pkg/sentry/fsimpl/sys/sys_test.go
@@ -38,7 +38,7 @@ func newTestSystem(t *testing.T) *testutil.System {
 		AllowUserMount: true,
 	})
 
-	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.GetFilesystemOptions{})
+	mns, err := k.VFS().NewMountNamespace(ctx, creds, "", sys.Name, &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create new mount namespace: %v", err)
 	}
diff --git a/pkg/sentry/fsimpl/testutil/kernel.go b/pkg/sentry/fsimpl/testutil/kernel.go
index 1813269e0..738c0c9cc 100644
--- a/pkg/sentry/fsimpl/testutil/kernel.go
+++ b/pkg/sentry/fsimpl/testutil/kernel.go
@@ -147,7 +147,12 @@ func CreateTask(ctx context.Context, name string, tc *kernel.ThreadGroup, mntns
 		FSContext:               kernel.NewFSContextVFS2(root, cwd, 0022),
 		FDTable:                 k.NewFDTable(),
 	}
-	return k.TaskSet().NewTask(config)
+	t, err := k.TaskSet().NewTask(ctx, config)
+	if err != nil {
+		config.ThreadGroup.Release(ctx)
+		return nil, err
+	}
+	return t, nil
 }
 
 func newFakeExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry) (*vfs.FileDescription, error) {
diff --git a/pkg/sentry/fsimpl/testutil/testutil.go b/pkg/sentry/fsimpl/testutil/testutil.go
index 568132121..1a8525b06 100644
--- a/pkg/sentry/fsimpl/testutil/testutil.go
+++ b/pkg/sentry/fsimpl/testutil/testutil.go
@@ -46,16 +46,18 @@ type System struct {
 
 // NewSystem constructs a System.
 //
-// Precondition: Caller must hold a reference on MntNs, whose ownership
+// Precondition: Caller must hold a reference on mns, whose ownership
 // is transferred to the new System.
 func NewSystem(ctx context.Context, t *testing.T, v *vfs.VirtualFilesystem, mns *vfs.MountNamespace) *System {
+	root := mns.Root()
+	root.IncRef()
 	s := &System{
 		t:     t,
 		Ctx:   ctx,
 		Creds: auth.CredentialsFromContext(ctx),
 		VFS:   v,
 		MntNs: mns,
-		Root:  mns.Root(),
+		Root:  root,
 	}
 	return s
 }
@@ -254,10 +256,10 @@ func (d *DirentCollector) Contains(name string, typ uint8) error {
 	defer d.mu.Unlock()
 	dirent, ok := d.dirents[name]
 	if !ok {
-		return fmt.Errorf("No dirent named %q found", name)
+		return fmt.Errorf("no dirent named %q found", name)
 	}
 	if dirent.Type != typ {
-		return fmt.Errorf("Dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
+		return fmt.Errorf("dirent named %q found, but was expecting type %s, got: %+v", name, linux.DirentType.Parse(uint64(typ)), dirent)
 	}
 	return nil
 }
diff --git a/pkg/sentry/fsimpl/timerfd/timerfd.go b/pkg/sentry/fsimpl/timerfd/timerfd.go
index 86beaa0a8..8853c8ad2 100644
--- a/pkg/sentry/fsimpl/timerfd/timerfd.go
+++ b/pkg/sentry/fsimpl/timerfd/timerfd.go
@@ -26,8 +26,10 @@ import (
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
-// TimerFileDescription implements FileDescriptionImpl for timer fds. It also
+// TimerFileDescription implements vfs.FileDescriptionImpl for timer fds. It also
 // implements ktime.TimerListener.
+//
+// +stateify savable
 type TimerFileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -62,7 +64,7 @@ func New(ctx context.Context, vfsObj *vfs.VirtualFilesystem, clock ktime.Clock,
 	return &tfd.vfsfd, nil
 }
 
-// Read implements FileDescriptionImpl.Read.
+// Read implements vfs.FileDescriptionImpl.Read.
 func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
 	const sizeofUint64 = 8
 	if dst.NumBytes() < sizeofUint64 {
@@ -128,7 +130,7 @@ func (tfd *TimerFileDescription) ResumeTimer() {
 	tfd.timer.Resume()
 }
 
-// Release implements FileDescriptionImpl.Release()
+// Release implements vfs.FileDescriptionImpl.Release.
 func (tfd *TimerFileDescription) Release(context.Context) {
 	tfd.timer.Destroy()
 }
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
index 5cd428d64..fe520b6fd 100644
--- a/pkg/sentry/fsimpl/tmpfs/BUILD
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -31,7 +31,7 @@ go_template_instance(
     out = "inode_refs.go",
     package = "tmpfs",
     prefix = "inode",
-    template = "//pkg/refs_vfs2:refs_template",
+    template = "//pkg/refsvfs2:refs_template",
     types = {
         "T": "inode",
     },
@@ -48,6 +48,7 @@ go_library(
         "inode_refs.go",
         "named_pipe.go",
         "regular_file.go",
+        "save_restore.go",
         "socket_file.go",
         "symlink.go",
         "tmpfs.go",
@@ -60,6 +61,7 @@ go_library(
         "//pkg/fspath",
         "//pkg/log",
         "//pkg/refs",
+        "//pkg/refsvfs2",
         "//pkg/safemem",
         "//pkg/sentry/arch",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
index d263147c2..3cc63e732 100644
--- a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -182,7 +182,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -193,6 +193,7 @@ func BenchmarkVFS2TmpfsStat(b *testing.B) {
 
 			// Create nested directories with given depth.
 			root := mntns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			vd := root
 			vd.IncRef()
@@ -376,7 +377,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 				AllowUserMount: true,
 			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -387,6 +388,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 
 			// Create the mount point.
 			root := mntns.Root()
+			root.IncRef()
 			defer root.DecRef(ctx)
 			pop := vfs.PathOperation{
 				Root:  root,
@@ -405,7 +407,7 @@ func BenchmarkVFS2TmpfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef(ctx)
 			// Create and mount the submount.
-			if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+			if _, err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/tmpfs/device_file.go b/pkg/sentry/fsimpl/tmpfs/device_file.go
index ac54d420d..9129d35b7 100644
--- a/pkg/sentry/fsimpl/tmpfs/device_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/device_file.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 )
 
+// +stateify savable
 type deviceFile struct {
 	inode inode
 	kind  vfs.DeviceKind
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
index 78b4fc5be..e90669cf0 100644
--- a/pkg/sentry/fsimpl/tmpfs/directory.go
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -25,6 +25,7 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
+// +stateify savable
 type directory struct {
 	// Since directories can't be hard-linked, each directory can only be
 	// associated with a single dentry, which we can store in the directory
@@ -44,7 +45,7 @@ type directory struct {
 	// (with inode == nil) that represent the iteration position of
 	// directoryFDs. childList is used to support directoryFD.IterDirents()
 	// efficiently. childList is protected by iterMu.
-	iterMu    sync.Mutex
+	iterMu    sync.Mutex `state:"nosave"`
 	childList dentryList
 }
 
@@ -57,8 +58,9 @@ func (fs *filesystem) newDirectory(kuid auth.KUID, kgid auth.KGID, mode linux.Fi
 	return dir
 }
 
-// Preconditions: filesystem.mu must be locked for writing. dir must not
-// already contain a child with the given name.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * dir must not already contain a child with the given name.
 func (dir *directory) insertChildLocked(child *dentry, name string) {
 	child.parent = &dir.dentry
 	child.name = name
@@ -85,6 +87,7 @@ func (dir *directory) mayDelete(creds *auth.Credentials, child *dentry) error {
 	return vfs.CheckDeleteSticky(creds, linux.FileMode(atomic.LoadUint32(&dir.inode.mode)), auth.KUID(atomic.LoadUint32(&child.inode.uid)))
 }
 
+// +stateify savable
 type directoryFD struct {
 	fileDescription
 	vfs.DirectoryFileDescriptionDefaultImpl
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
index b0ec177e6..e39cd305b 100644
--- a/pkg/sentry/fsimpl/tmpfs/filesystem.go
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -25,7 +25,6 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
-	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Sync implements vfs.FilesystemImpl.Sync.
@@ -39,7 +38,9 @@ func (fs *filesystem) Sync(ctx context.Context) error {
 //
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
 	dir, ok := d.inode.impl.(*directory)
 	if !ok {
@@ -97,7 +98,9 @@ afterSymlink:
 // walkParentDirLocked is loosely analogous to Linux's
 // fs/namei.c:path_parentat().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
+// Preconditions:
+// * filesystem.mu must be locked.
+// * !rp.Done().
 func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) {
 	for !rp.Final() {
 		next, err := stepLocked(ctx, rp, d)
@@ -139,8 +142,9 @@ func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error)
 // doCreateAt is loosely analogous to a conjunction of Linux's
 // fs/namei.c:filename_create() and done_path_create().
 //
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
+// Preconditions:
+// * !rp.Done().
+// * For the final path component in rp, !rp.ShouldFollowSymlink().
 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
@@ -669,11 +673,11 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.setStat(ctx, rp.Credentials(), &opts); err != nil {
-		fs.mu.RUnlock()
+	err = d.inode.setStat(ctx, rp.Credentials(), &opts)
+	fs.mu.RUnlock()
+	if err != nil {
 		return err
 	}
-	fs.mu.RUnlock()
 
 	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
 		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
@@ -701,16 +705,7 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 	if _, err := resolveLocked(ctx, rp); err != nil {
 		return linux.Statfs{}, err
 	}
-	statfs := linux.Statfs{
-		Type:         linux.TMPFS_MAGIC,
-		BlockSize:    usermem.PageSize,
-		FragmentSize: usermem.PageSize,
-		NameLength:   linux.NAME_MAX,
-		// TODO(b/29637826): Allow configuring a tmpfs size and enforce it.
-		Blocks:     0,
-		BlocksFree: 0,
-	}
-	return statfs, nil
+	return globalStatfs, nil
 }
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
@@ -775,7 +770,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
@@ -788,65 +783,68 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	}
 	switch impl := d.inode.impl.(type) {
 	case *socketFile:
+		if impl.ep == nil {
+			return nil, syserror.ECONNREFUSED
+		}
 		return impl.ep, nil
 	default:
 		return nil, syserror.ECONNREFUSED
 	}
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		return nil, err
 	}
-	return d.inode.listxattr(size)
+	return d.inode.listXattr(size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		return "", err
 	}
-	return d.inode.getxattr(rp.Credentials(), &opts)
+	return d.inode.getXattr(rp.Credentials(), &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	fs.mu.RLock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.setxattr(rp.Credentials(), &opts); err != nil {
-		fs.mu.RUnlock()
+	err = d.inode.setXattr(rp.Credentials(), &opts)
+	fs.mu.RUnlock()
+	if err != nil {
 		return err
 	}
-	fs.mu.RUnlock()
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	d, err := resolveLocked(ctx, rp)
 	if err != nil {
 		fs.mu.RUnlock()
 		return err
 	}
-	if err := d.inode.removexattr(rp.Credentials(), name); err != nil {
-		fs.mu.RUnlock()
+	err = d.inode.removeXattr(rp.Credentials(), name)
+	fs.mu.RUnlock()
+	if err != nil {
 		return err
 	}
-	fs.mu.RUnlock()
 
 	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
 	return nil
@@ -867,8 +865,16 @@ func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDe
 		}
 		if d.parent == nil {
 			if d.name != "" {
-				// This must be an anonymous memfd file.
+				// This file must have been created by
+				// newUnlinkedRegularFileDescription(). In Linux,
+				// mm/shmem.c:__shmem_file_setup() =>
+				// fs/file_table.c:alloc_file_pseudo() sets the created
+				// dentry's dentry_operations to anon_ops, for which d_dname ==
+				// simple_dname. fs/d_path.c:simple_dname() defines the
+				// dentry's pathname to be its name, prefixed with "/" and
+				// suffixed with " (deleted)".
 				b.PrependComponent("/" + d.name)
+				b.AppendString(" (deleted)")
 				return vfs.PrependPathSyntheticError{}
 			}
 			return vfs.PrependPathAtNonMountRootError{}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
index 739350cf0..d772db9e9 100644
--- a/pkg/sentry/fsimpl/tmpfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -21,6 +21,7 @@ import (
 	"gvisor.dev/gvisor/pkg/usermem"
 )
 
+// +stateify savable
 type namedPipe struct {
 	inode inode
 
@@ -28,8 +29,8 @@ type namedPipe struct {
 }
 
 // Preconditions:
-//   * fs.mu must be locked.
-//   * rp.Mount().CheckBeginWrite() has been called successfully.
+// * fs.mu must be locked.
+// * rp.Mount().CheckBeginWrite() has been called successfully.
 func (fs *filesystem) newNamedPipe(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &namedPipe{pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize, usermem.PageSize)}
 	file.inode.init(file, fs, kuid, kgid, linux.S_IFIFO|mode)
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
index ec2701d8b..2f856ce36 100644
--- a/pkg/sentry/fsimpl/tmpfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -158,13 +158,14 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
 
 	// Create the pipe.
 	root := mntns.Root()
+	root.IncRef()
 	pop := vfs.PathOperation{
 		Root:  root,
 		Start: root,
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
index 0710b65db..98680fde9 100644
--- a/pkg/sentry/fsimpl/tmpfs/regular_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -36,11 +36,17 @@ import (
 )
 
 // regularFile is a regular (=S_IFREG) tmpfs file.
+//
+// +stateify savable
 type regularFile struct {
 	inode inode
 
 	// memFile is a platform.File used to allocate pages to this regularFile.
-	memFile *pgalloc.MemoryFile
+	memFile *pgalloc.MemoryFile `state:"nosave"`
+
+	// memoryUsageKind is the memory accounting category under which pages backing
+	// this regularFile's contents are accounted.
+	memoryUsageKind usage.MemoryKind
 
 	// mapsMu protects mappings.
 	mapsMu sync.Mutex `state:"nosave"`
@@ -62,7 +68,7 @@ type regularFile struct {
 	writableMappingPages uint64
 
 	// dataMu protects the fields below.
-	dataMu sync.RWMutex
+	dataMu sync.RWMutex `state:"nosave"`
 
 	// data maps offsets into the file to offsets into memFile that store
 	// the file's data.
@@ -86,14 +92,75 @@ type regularFile struct {
 
 func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode) *inode {
 	file := &regularFile{
-		memFile: fs.memFile,
-		seals:   linux.F_SEAL_SEAL,
+		memFile:         fs.mfp.MemoryFile(),
+		memoryUsageKind: usage.Tmpfs,
+		seals:           linux.F_SEAL_SEAL,
 	}
 	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode)
 	file.inode.nlink = 1 // from parent directory
 	return &file.inode
 }
 
+// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
+// filesystem represented by mount and returns an FD representing that file.
+// The new file is not reachable by path traversal from any other file.
+//
+// newUnlinkedRegularFileDescription is analogous to Linux's
+// mm/shmem.c:__shmem_file_setup().
+//
+// Preconditions: mount must be a tmpfs mount.
+func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
+	fs, ok := mount.Filesystem().Impl().(*filesystem)
+	if !ok {
+		panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
+	}
+
+	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
+	d := fs.newDentry(inode)
+	defer d.DecRef(ctx)
+	d.name = name
+
+	fd := &regularFileFD{}
+	fd.Init(&inode.locks)
+	flags := uint32(linux.O_RDWR)
+	if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
+		return nil, err
+	}
+	return fd, nil
+}
+
+// NewZeroFile creates a new regular file and file description as for
+// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
+// initially (implicitly) filled with zeroes.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
+	// Compare mm/shmem.c:shmem_zero_setup().
+	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
+	if err != nil {
+		return nil, err
+	}
+	rf := fd.inode().impl.(*regularFile)
+	rf.memoryUsageKind = usage.Anonymous
+	rf.size = size
+	return &fd.vfsfd, err
+}
+
+// NewMemfd creates a new regular file and file description as for
+// memfd_create.
+//
+// Preconditions: mount must be a tmpfs mount.
+func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
+	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
+	if err != nil {
+		return nil, err
+	}
+	if allowSeals {
+		fd.inode().impl.(*regularFile).seals = 0
+	}
+	return &fd.vfsfd, nil
+}
+
 // truncate grows or shrinks the file to the given size. It returns true if the
 // file size was updated.
 func (rf *regularFile) truncate(newSize uint64) (bool, error) {
@@ -226,7 +293,7 @@ func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.
 		optional.End = pgend
 	}
 
-	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
+	cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
 		// Newly-allocated pages are zeroed, so we don't need to do anything.
 		return dsts.NumBytes(), nil
 	})
@@ -260,13 +327,14 @@ func (*regularFile) InvalidateUnsavable(context.Context) error {
 	return nil
 }
 
+// +stateify savable
 type regularFileFD struct {
 	fileDescription
 
 	// off is the file offset. off is accessed using atomic memory operations.
 	// offMu serializes operations that may mutate off.
 	off   int64
-	offMu sync.Mutex
+	offMu sync.Mutex `state:"nosave"`
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
@@ -575,7 +643,7 @@ func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64,
 		case gap.Ok():
 			// Allocate memory for the write.
 			gapMR := gap.Range().Intersect(pgMR)
-			fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+			fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind)
 			if err != nil {
 				retErr = err
 				goto exitLoop
diff --git a/pkg/sentry/fsimpl/tmpfs/save_restore.go b/pkg/sentry/fsimpl/tmpfs/save_restore.go
new file mode 100644
index 000000000..b27f75cc2
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/save_restore.go
@@ -0,0 +1,20 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+// afterLoad is called by stateify.
+func (rf *regularFile) afterLoad() {
+	rf.memFile = rf.inode.fs.mfp.MemoryFile()
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/socket_file.go b/pkg/sentry/fsimpl/tmpfs/socket_file.go
index 3ed650474..5699d5975 100644
--- a/pkg/sentry/fsimpl/tmpfs/socket_file.go
+++ b/pkg/sentry/fsimpl/tmpfs/socket_file.go
@@ -21,6 +21,8 @@ import (
 )
 
 // socketFile is a socket (=S_IFSOCK) tmpfs file.
+//
+// +stateify savable
 type socketFile struct {
 	inode inode
 	ep    transport.BoundEndpoint
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
index b0de5fabe..a102a2ee2 100644
--- a/pkg/sentry/fsimpl/tmpfs/symlink.go
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -19,6 +19,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 )
 
+// +stateify savable
 type symlink struct {
 	inode  inode
 	target string // immutable
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
index de2af6d01..4ce859d57 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -51,14 +51,19 @@ import (
 const Name = "tmpfs"
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
-	// memFile is used to allocate pages to for regular files.
-	memFile *pgalloc.MemoryFile
+	// mfp is used to allocate memory that stores regular file contents. mfp is
+	// immutable.
+	mfp pgalloc.MemoryFileProvider
 
 	// clock is a realtime clock used to set timestamps in file operations.
 	clock time.Clock
@@ -67,9 +72,11 @@ type filesystem struct {
 	devMinor uint32
 
 	// mu serializes changes to the Dentry tree.
-	mu sync.RWMutex
+	mu sync.RWMutex `state:"nosave"`
 
 	nextInoMinusOne uint64 // accessed using atomic memory operations
+
+	root *dentry
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -77,7 +84,12 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
 // FilesystemOpts is used to pass configuration data to tmpfs.
+//
+// +stateify savable
 type FilesystemOpts struct {
 	// RootFileType is the FileType of the filesystem root. Valid values
 	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
@@ -95,8 +107,8 @@ type FilesystemOpts struct {
 
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx)
-	if memFileProvider == nil {
+	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
+	if mfp == nil {
 		panic("MemoryFileProviderFromContext returned nil")
 	}
 
@@ -170,7 +182,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 	}
 	clock := time.RealtimeClockFromContext(ctx)
 	fs := filesystem{
-		memFile:  memFileProvider.MemoryFile(),
+		mfp:      mfp,
 		clock:    clock,
 		devMinor: devMinor,
 	}
@@ -188,6 +200,7 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 		fs.vfsfs.DecRef(ctx)
 		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
 	}
+	fs.root = root
 	return &fs.vfsfs, &root.vfsd, nil
 }
 
@@ -199,9 +212,61 @@ func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *au
 // Release implements vfs.FilesystemImpl.Release.
 func (fs *filesystem) Release(ctx context.Context) {
 	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
+	fs.mu.Lock()
+	if fs.root.inode.isDir() {
+		fs.root.releaseChildrenLocked(ctx)
+	}
+	fs.mu.Unlock()
+}
+
+// releaseChildrenLocked is called on the mount point by filesystem.Release() to
+// destroy all objects in the mount. It performs a depth-first walk of the
+// filesystem and "unlinks" everything by decrementing link counts
+// appropriately. There should be no open file descriptors when this is called,
+// so each inode should only have one outstanding reference that is removed once
+// its link count hits zero.
+//
+// Note that we do not update filesystem state precisely while tearing down (for
+// instance, the child maps are ignored)--we only care to remove all remaining
+// references so that every filesystem object gets destroyed. Also note that we
+// do not need to trigger DecRef on the mount point itself or any child mount;
+// these are taken care of by the destructor of the enclosing MountNamespace.
+//
+// Precondition: filesystem.mu is held.
+func (d *dentry) releaseChildrenLocked(ctx context.Context) {
+	dir := d.inode.impl.(*directory)
+	for _, child := range dir.childMap {
+		if child.inode.isDir() {
+			child.releaseChildrenLocked(ctx)
+			child.inode.decLinksLocked(ctx) // link for child/.
+			dir.inode.decLinksLocked(ctx)   // link for child/..
+		}
+		child.inode.decLinksLocked(ctx) // link for child
+	}
+}
+
+// immutable
+var globalStatfs = linux.Statfs{
+	Type:         linux.TMPFS_MAGIC,
+	BlockSize:    usermem.PageSize,
+	FragmentSize: usermem.PageSize,
+	NameLength:   linux.NAME_MAX,
+
+	// tmpfs currently does not support configurable size limits. In Linux,
+	// such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
+	// statfs(2). However, many applications treat this as having a size limit
+	// of 0. To work around this, claim to have a very large but non-zero size,
+	// chosen to ensure that BlockSize * Blocks does not overflow int64 (which
+	// applications may also handle incorrectly).
+	// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
+	Blocks:          math.MaxInt64 / usermem.PageSize,
+	BlocksFree:      math.MaxInt64 / usermem.PageSize,
+	BlocksAvailable: math.MaxInt64 / usermem.PageSize,
 }
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -281,6 +346,8 @@ func (d *dentry) Watches() *vfs.Watches {
 func (d *dentry) OnZeroWatches(context.Context) {}
 
 // inode represents a filesystem object.
+//
+// +stateify savable
 type inode struct {
 	// fs is the owning filesystem. fs is immutable.
 	fs *filesystem
@@ -297,12 +364,12 @@ type inode struct {
 
 	// Inode metadata. Writing multiple fields atomically requires holding
 	// mu, othewise atomic operations can be used.
-	mu    sync.Mutex
-	mode  uint32 // file type and mode
-	nlink uint32 // protected by filesystem.mu instead of inode.mu
-	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid   uint32 // auth.KGID, but ...
-	ino   uint64 // immutable
+	mu    sync.Mutex `state:"nosave"`
+	mode  uint32     // file type and mode
+	nlink uint32     // protected by filesystem.mu instead of inode.mu
+	uid   uint32     // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32     // auth.KGID, but ...
+	ino   uint64     // immutable
 
 	// Linux's tmpfs has no concept of btime.
 	atime int64 // nanoseconds
@@ -340,8 +407,10 @@ func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth
 
 // incLinksLocked increments i's link count.
 //
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-// i.nlink < maxLinks.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
+// * i.nlink < maxLinks.
 func (i *inode) incLinksLocked() {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.incLinksLocked() called with no existing links")
@@ -355,7 +424,9 @@ func (i *inode) incLinksLocked() {
 // decLinksLocked decrements i's link count. If the link count reaches 0, we
 // remove a reference on i as well.
 //
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// Preconditions:
+// * filesystem.mu must be locked for writing.
+// * i.nlink != 0.
 func (i *inode) decLinksLocked(ctx context.Context) {
 	if i.nlink == 0 {
 		panic("tmpfs.inode.decLinksLocked() called with no existing links")
@@ -594,66 +665,59 @@ func (i *inode) touchCMtime() {
 	i.mu.Unlock()
 }
 
-// Preconditions: The caller has called vfs.Mount.CheckBeginWrite() and holds
-// inode.mu.
+// Preconditions:
+// * The caller has called vfs.Mount.CheckBeginWrite().
+// * inode.mu must be locked.
 func (i *inode) touchCMtimeLocked() {
 	now := i.fs.clock.Now().Nanoseconds()
 	atomic.StoreInt64(&i.mtime, now)
 	atomic.StoreInt64(&i.ctime, now)
 }
 
-func (i *inode) listxattr(size uint64) ([]string, error) {
-	return i.xattrs.Listxattr(size)
+func (i *inode) listXattr(size uint64) ([]string, error) {
+	return i.xattrs.ListXattr(size)
 }
 
-func (i *inode) getxattr(creds *auth.Credentials, opts *vfs.GetxattrOptions) (string, error) {
-	if err := i.checkPermissions(creds, vfs.MayRead); err != nil {
+func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
+	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
 		return "", err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return "", syserror.EOPNOTSUPP
-	}
-	if !i.userXattrSupported() {
-		return "", syserror.ENODATA
-	}
-	return i.xattrs.Getxattr(opts)
+	return i.xattrs.GetXattr(opts)
 }
 
-func (i *inode) setxattr(creds *auth.Credentials, opts *vfs.SetxattrOptions) error {
-	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
+	if err := i.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(opts.Name, linux.XATTR_USER_PREFIX) {
-		return syserror.EOPNOTSUPP
-	}
-	if !i.userXattrSupported() {
-		return syserror.EPERM
-	}
-	return i.xattrs.Setxattr(opts)
+	return i.xattrs.SetXattr(opts)
 }
 
-func (i *inode) removexattr(creds *auth.Credentials, name string) error {
-	if err := i.checkPermissions(creds, vfs.MayWrite); err != nil {
+func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
+	if err := i.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
 		return err
 	}
-	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
+	return i.xattrs.RemoveXattr(name)
+}
+
+func (i *inode) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
+	// We currently only support extended attributes in the user.* and
+	// trusted.* namespaces. See b/148380782.
+	if !strings.HasPrefix(name, linux.XATTR_USER_PREFIX) && !strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
 		return syserror.EOPNOTSUPP
 	}
-	if !i.userXattrSupported() {
-		return syserror.EPERM
+	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
+		return err
 	}
-	return i.xattrs.Removexattr(name)
-}
-
-// Extended attributes in the user.* namespace are only supported for regular
-// files and directories.
-func (i *inode) userXattrSupported() bool {
-	filetype := linux.S_IFMT & atomic.LoadUint32(&i.mode)
-	return filetype == linux.S_IFREG || filetype == linux.S_IFDIR
+	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
 }
 
 // fileDescription is embedded by tmpfs implementations of
 // vfs.FileDescriptionImpl.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -693,20 +757,25 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return nil
 }
 
-// Listxattr implements vfs.FileDescriptionImpl.Listxattr.
-func (fd *fileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
-	return fd.inode().listxattr(size)
+// StatFS implements vfs.FileDescriptionImpl.StatFS.
+func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return globalStatfs, nil
 }
 
-// Getxattr implements vfs.FileDescriptionImpl.Getxattr.
-func (fd *fileDescription) Getxattr(ctx context.Context, opts vfs.GetxattrOptions) (string, error) {
-	return fd.inode().getxattr(auth.CredentialsFromContext(ctx), &opts)
+// ListXattr implements vfs.FileDescriptionImpl.ListXattr.
+func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
+	return fd.inode().listXattr(size)
 }
 
-// Setxattr implements vfs.FileDescriptionImpl.Setxattr.
-func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOptions) error {
+// GetXattr implements vfs.FileDescriptionImpl.GetXattr.
+func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
+	return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
+}
+
+// SetXattr implements vfs.FileDescriptionImpl.SetXattr.
+func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
 	d := fd.dentry()
-	if err := d.inode.setxattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
+	if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
 		return err
 	}
 
@@ -715,10 +784,10 @@ func (fd *fileDescription) Setxattr(ctx context.Context, opts vfs.SetxattrOption
 	return nil
 }
 
-// Removexattr implements vfs.FileDescriptionImpl.Removexattr.
-func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
+// RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
+func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
 	d := fd.dentry()
-	if err := d.inode.removexattr(auth.CredentialsFromContext(ctx), name); err != nil {
+	if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil {
 		return err
 	}
 
@@ -727,37 +796,6 @@ func (fd *fileDescription) Removexattr(ctx context.Context, name string) error {
 	return nil
 }
 
-// NewMemfd creates a new tmpfs regular file and file description that can back
-// an anonymous fd created by memfd_create.
-func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
-	fs, ok := mount.Filesystem().Impl().(*filesystem)
-	if !ok {
-		panic("NewMemfd() called with non-tmpfs mount")
-	}
-
-	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
-	// S_IRWXUGO.
-	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777)
-	rf := inode.impl.(*regularFile)
-	if allowSeals {
-		rf.seals = 0
-	}
-
-	d := fs.newDentry(inode)
-	defer d.DecRef(ctx)
-	d.name = name
-
-	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd files are set up with
-	// FMODE_READ | FMODE_WRITE.
-	var fd regularFileFD
-	fd.Init(&inode.locks)
-	flags := uint32(linux.O_RDWR)
-	if err := fd.vfsfd.Init(&fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
-		return nil, err
-	}
-	return &fd.vfsfd, nil
-}
-
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
 	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
index 6f3e3ae6f..fc5323abc 100644
--- a/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs_test.go
@@ -41,11 +41,12 @@ func newTmpfsRoot(ctx context.Context) (*vfs.VirtualFilesystem, vfs.VirtualDentr
 	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
 		AllowUserMount: true,
 	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.MountOptions{})
 	if err != nil {
 		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
 	}
 	root := mntns.Root()
+	root.IncRef()
 	return vfsObj, root, func() {
 		root.DecRef(ctx)
 		mntns.DecRef(ctx)
diff --git a/pkg/sentry/fsimpl/verity/BUILD b/pkg/sentry/fsimpl/verity/BUILD
index 28d2a4bcb..e265be0ee 100644
--- a/pkg/sentry/fsimpl/verity/BUILD
+++ b/pkg/sentry/fsimpl/verity/BUILD
@@ -1,4 +1,4 @@
-load("//tools:defs.bzl", "go_library")
+load("//tools:defs.bzl", "go_library", "go_test")
 
 licenses(["notice"])
 
@@ -6,6 +6,7 @@ go_library(
     name = "verity",
     srcs = [
         "filesystem.go",
+        "save_restore.go",
         "verity.go",
     ],
     visibility = ["//pkg/sentry:internal"],
@@ -13,11 +14,38 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/context",
         "//pkg/fspath",
+        "//pkg/marshal/primitive",
+        "//pkg/merkletree",
+        "//pkg/refsvfs2",
+        "//pkg/sentry/arch",
         "//pkg/sentry/fs/lock",
+        "//pkg/sentry/kernel",
         "//pkg/sentry/kernel/auth",
         "//pkg/sentry/socket/unix/transport",
         "//pkg/sentry/vfs",
         "//pkg/sync",
         "//pkg/syserror",
+        "//pkg/usermem",
+    ],
+)
+
+go_test(
+    name = "verity_test",
+    srcs = [
+        "verity_test.go",
+    ],
+    library = ":verity",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/context",
+        "//pkg/fspath",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/fsimpl/testutil",
+        "//pkg/sentry/fsimpl/tmpfs",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+        "//pkg/usermem",
     ],
 )
diff --git a/pkg/sentry/fsimpl/verity/filesystem.go b/pkg/sentry/fsimpl/verity/filesystem.go
index 78c6074bd..81dfed266 100644
--- a/pkg/sentry/fsimpl/verity/filesystem.go
+++ b/pkg/sentry/fsimpl/verity/filesystem.go
@@ -15,9 +15,17 @@
 package verity
 
 import (
+	"bytes"
+	"fmt"
+	"io"
+	"strconv"
+	"strings"
+	"sync/atomic"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/merkletree"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -91,10 +99,498 @@ func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*de
 	putDentrySlice(*ds)
 }
 
-// resolveLocked resolves rp to an existing file.
-func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
-	// TODO(b/159261227): Implement resolveLocked.
-	return nil, nil
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// Dentries which may have a reference count of zero, and which therefore
+// should be dropped once traversal is complete, are appended to ds.
+//
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// !rp.Done().
+func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, error) {
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+
+	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+
+afterSymlink:
+	name := rp.Component()
+	if name == "." {
+		rp.Advance()
+		return d, nil
+	}
+	if name == ".." {
+		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
+			return nil, err
+		} else if isRoot || d.parent == nil {
+			rp.Advance()
+			return d, nil
+		}
+		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return d.parent, nil
+	}
+	child, err := fs.getChildLocked(ctx, d, name, ds)
+	if err != nil {
+		return nil, err
+	}
+	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance()
+	return child, nil
+}
+
+// verifyChild verifies the hash of child against the already verified hash of
+// the parent to ensure the child is expected.  verifyChild triggers a sentry
+// panic if unexpected modifications to the file system are detected. In
+// noCrashOnVerificationFailure mode it returns a syserror instead.
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+// TODO(b/166474175): Investigate all possible errors returned in this
+// function, and make sure we differentiate all errors that indicate unexpected
+// modifications to the file system from the ones that are not harmful.
+func (fs *filesystem) verifyChild(ctx context.Context, parent *dentry, child *dentry) (*dentry, error) {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	// Get the path to the child dentry. This is only used to provide path
+	// information in failure case.
+	childPath, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	fs.verityMu.RLock()
+	defer fs.verityMu.RUnlock()
+	// Read the offset of the child from the extended attributes of the
+	// corresponding Merkle tree file.
+	// This is the offset of the hash for child in its parent's Merkle tree
+	// file.
+	off, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  child.lowerMerkleVD,
+		Start: child.lowerMerkleVD,
+	}, &vfs.GetXattrOptions{
+		Name: merkleOffsetInParentXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the file or the xattr does not
+	// exist, it indicates unexpected modifications to the file system.
+	if err == syserror.ENOENT || err == syserror.ENODATA {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleOffsetInParentXattr, childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+	// The offset xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	offset, err := strconv.Atoi(off)
+	if err != nil {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleOffsetInParentXattr, childPath, err))
+	}
+
+	// Open parent Merkle tree file to read and verify child's hash.
+	parentMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerMerkleVD,
+		Start: parent.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+
+	// The parent Merkle tree file should have been created. If it's
+	// missing, it indicates an unexpected modification to the file system.
+	if err == syserror.ENOENT {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to open parent Merkle file for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// dataSize is the size of raw data for the Merkle tree. For a file,
+	// dataSize is the size of the whole file. For a directory, dataSize is
+	// the size of all its children's hashes.
+	dataSize, err := parentMerkleFD.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the file or the xattr does not
+	// exist, it indicates unexpected modifications to the file system.
+	if err == syserror.ENOENT || err == syserror.ENODATA {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for %s: %v", merkleSizeXattr, childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// The dataSize xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	parentSize, err := strconv.Atoi(dataSize)
+	if err != nil {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+	}
+
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  parentMerkleFD,
+		Ctx: ctx,
+	}
+
+	parentStat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  parent.lowerVD,
+		Start: parent.lowerVD,
+	}, &vfs.StatOptions{})
+	if err == syserror.ENOENT {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to get parent stat for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return nil, err
+	}
+
+	// Since we are verifying against a directory Merkle tree, buf should
+	// contain the hash of the children in the parent Merkle tree when
+	// Verify returns with success.
+	var buf bytes.Buffer
+	if _, err := merkletree.Verify(&merkletree.VerifyParams{
+		Out:                   &buf,
+		File:                  &fdReader,
+		Tree:                  &fdReader,
+		Size:                  int64(parentSize),
+		Name:                  parent.name,
+		Mode:                  uint32(parentStat.Mode),
+		UID:                   parentStat.UID,
+		GID:                   parentStat.GID,
+		ReadOffset:            int64(offset),
+		ReadSize:              int64(merkletree.DigestSize()),
+		Expected:              parent.hash,
+		DataAndTreeInSameFile: true,
+	}); err != nil && err != io.EOF {
+		return nil, alertIntegrityViolation(fmt.Sprintf("Verification for %s failed: %v", childPath, err))
+	}
+
+	// Cache child hash when it's verified the first time.
+	if len(child.hash) == 0 {
+		child.hash = buf.Bytes()
+	}
+	return child, nil
+}
+
+// verifyStat verifies the stat against the verified hash. The mode/uid/gid of
+// the file is cached after verified.
+func (fs *filesystem) verifyStat(ctx context.Context, d *dentry, stat linux.Statx) error {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	// Get the path to the child dentry. This is only used to provide path
+	// information in failure case.
+	childPath, err := vfsObj.PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+	if err != nil {
+		return err
+	}
+
+	fs.verityMu.RLock()
+	defer fs.verityMu.RUnlock()
+
+	fd, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  d.lowerMerkleVD,
+		Start: d.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err == syserror.ENOENT {
+		return alertIntegrityViolation(fmt.Sprintf("Failed to open merkle file for %s: %v", childPath, err))
+	}
+	if err != nil {
+		return err
+	}
+
+	merkleSize, err := fd.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	if err == syserror.ENODATA {
+		return alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s for merkle file of %s: %v", merkleSizeXattr, childPath, err))
+	}
+	if err != nil {
+		return err
+	}
+
+	size, err := strconv.Atoi(merkleSize)
+	if err != nil {
+		return alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s for %s to int: %v", merkleSizeXattr, childPath, err))
+	}
+
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  fd,
+		Ctx: ctx,
+	}
+
+	var buf bytes.Buffer
+	params := &merkletree.VerifyParams{
+		Out:        &buf,
+		Tree:       &fdReader,
+		Size:       int64(size),
+		Name:       d.name,
+		Mode:       uint32(stat.Mode),
+		UID:        stat.UID,
+		GID:        stat.GID,
+		ReadOffset: 0,
+		// Set read size to 0 so only the metadata is verified.
+		ReadSize:              0,
+		Expected:              d.hash,
+		DataAndTreeInSameFile: false,
+	}
+	if atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR {
+		params.DataAndTreeInSameFile = true
+	}
+
+	if _, err := merkletree.Verify(params); err != nil && err != io.EOF {
+		return alertIntegrityViolation(fmt.Sprintf("Verification stat for %s failed: %v", childPath, err))
+	}
+	d.mode = uint32(stat.Mode)
+	d.uid = stat.UID
+	d.gid = stat.GID
+	d.size = uint32(size)
+	return nil
+}
+
+// Preconditions: fs.renameMu must be locked. d.dirMu must be locked.
+func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
+	if child, ok := parent.children[name]; ok {
+		// If verity is enabled on child, we should check again whether
+		// the file and the corresponding Merkle tree are as expected,
+		// in order to catch deletion/renaming after the last time it's
+		// accessed.
+		if child.verityEnabled() {
+			vfsObj := fs.vfsfs.VirtualFilesystem()
+			// Get the path to the child dentry. This is only used
+			// to provide path information in failure case.
+			path, err := vfsObj.PathnameWithDeleted(ctx, child.fs.rootDentry.lowerVD, child.lowerVD)
+			if err != nil {
+				return nil, err
+			}
+
+			childVD, err := parent.getLowerAt(ctx, vfsObj, name)
+			if err == syserror.ENOENT {
+				// The file was previously accessed. If the
+				// file does not exist now, it indicates an
+				// unexpected modification to the file system.
+				return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", path))
+			}
+			if err != nil {
+				return nil, err
+			}
+			defer childVD.DecRef(ctx)
+
+			childMerkleVD, err := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+			// The Merkle tree file was previous accessed. If it
+			// does not exist now, it indicates an unexpected
+			// modification to the file system.
+			if err == syserror.ENOENT {
+				return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", path))
+			}
+			if err != nil {
+				return nil, err
+			}
+
+			defer childMerkleVD.DecRef(ctx)
+		}
+
+		// If enabling verification on files/directories is not allowed
+		// during runtime, all cached children are already verified. If
+		// runtime enable is allowed and the parent directory is
+		// enabled, we should verify the child hash here because it may
+		// be cached before enabled.
+		if fs.allowRuntimeEnable {
+			if parent.verityEnabled() {
+				if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+					return nil, err
+				}
+			}
+			if child.verityEnabled() {
+				vfsObj := fs.vfsfs.VirtualFilesystem()
+				mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+				stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+					Root:  child.lowerVD,
+					Start: child.lowerVD,
+				}, &vfs.StatOptions{
+					Mask: mask,
+				})
+				if err != nil {
+					return nil, err
+				}
+				if err := fs.verifyStat(ctx, child, stat); err != nil {
+					return nil, err
+				}
+			}
+		}
+		return child, nil
+	}
+	child, err := fs.lookupAndVerifyLocked(ctx, parent, name)
+	if err != nil {
+		return nil, err
+	}
+	if parent.children == nil {
+		parent.children = make(map[string]*dentry)
+	}
+	parent.children[name] = child
+	// child's refcount is initially 0, so it may be dropped after traversal.
+	*ds = appendDentry(*ds, child)
+	return child, nil
+}
+
+// Preconditions: fs.renameMu must be locked. parent.dirMu must be locked.
+func (fs *filesystem) lookupAndVerifyLocked(ctx context.Context, parent *dentry, name string) (*dentry, error) {
+	vfsObj := fs.vfsfs.VirtualFilesystem()
+
+	childVD, childErr := parent.getLowerAt(ctx, vfsObj, name)
+	// We will handle ENOENT separately, as it may indicate unexpected
+	// modifications to the file system, and may cause a sentry panic.
+	if childErr != nil && childErr != syserror.ENOENT {
+		return nil, childErr
+	}
+
+	// The dentry needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity child dentry is successfully created.
+	if childErr == nil {
+		defer childVD.DecRef(ctx)
+	}
+
+	childMerkleVD, childMerkleErr := parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+	// We will handle ENOENT separately, as it may indicate unexpected
+	// modifications to the file system, and may cause a sentry panic.
+	if childMerkleErr != nil && childMerkleErr != syserror.ENOENT {
+		return nil, childMerkleErr
+	}
+
+	// The dentry needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity child dentry is successfully created.
+	if childMerkleErr == nil {
+		defer childMerkleVD.DecRef(ctx)
+	}
+
+	// Get the path to the parent dentry. This is only used to provide path
+	// information in failure case.
+	parentPath, err := vfsObj.PathnameWithDeleted(ctx, parent.fs.rootDentry.lowerVD, parent.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	// TODO(b/166474175): Investigate all possible errors of childErr and
+	// childMerkleErr, and make sure we differentiate all errors that
+	// indicate unexpected modifications to the file system from the ones
+	// that are not harmful.
+	if childErr == syserror.ENOENT && childMerkleErr == nil {
+		// Failed to get child file/directory dentry. However the
+		// corresponding Merkle tree is found. This indicates an
+		// unexpected modification to the file system that
+		// removed/renamed the child.
+		return nil, alertIntegrityViolation(fmt.Sprintf("Target file %s is expected but missing", parentPath+"/"+name))
+	} else if childErr == nil && childMerkleErr == syserror.ENOENT {
+		// If in allowRuntimeEnable mode, and the Merkle tree file is
+		// not created yet, we create an empty Merkle tree file, so that
+		// if the file is enabled through ioctl, we have the Merkle tree
+		// file open and ready to use.
+		// This may cause empty and unused Merkle tree files in
+		// allowRuntimeEnable mode, if they are never enabled. This
+		// does not affect verification, as we rely on cached hash to
+		// decide whether to perform verification, not the existence of
+		// the Merkle tree file. Also, those Merkle tree files are
+		// always hidden and cannot be accessed by verity fs users.
+		if fs.allowRuntimeEnable {
+			childMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+				Root:  parent.lowerVD,
+				Start: parent.lowerVD,
+				Path:  fspath.Parse(merklePrefix + name),
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT,
+				Mode:  0644,
+			})
+			if err != nil {
+				return nil, err
+			}
+			childMerkleFD.DecRef(ctx)
+			childMerkleVD, err = parent.getLowerAt(ctx, vfsObj, merklePrefix+name)
+			if err != nil {
+				return nil, err
+			}
+		} else {
+			// If runtime enable is not allowed. This indicates an
+			// unexpected modification to the file system that
+			// removed/renamed the Merkle tree file.
+			return nil, alertIntegrityViolation(fmt.Sprintf("Expected Merkle file for target %s but none found", parentPath+"/"+name))
+		}
+	} else if childErr == syserror.ENOENT && childMerkleErr == syserror.ENOENT {
+		// Both the child and the corresponding Merkle tree are missing.
+		// This could be an unexpected modification or due to incorrect
+		// parameter.
+		// TODO(b/167752508): Investigate possible ways to differentiate
+		// cases that both files are deleted from cases that they never
+		// exist in the file system.
+		return nil, alertIntegrityViolation(fmt.Sprintf("Failed to find file %s", parentPath+"/"+name))
+	}
+
+	mask := uint32(linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID)
+	stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  childVD,
+		Start: childVD,
+	}, &vfs.StatOptions{
+		Mask: mask,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	child := fs.newDentry()
+	child.lowerVD = childVD
+	child.lowerMerkleVD = childMerkleVD
+
+	// Increase the reference for both childVD and childMerkleVD as they are
+	// held by child. If this function fails and the child is destroyed, the
+	// references will be decreased in destroyLocked.
+	childVD.IncRef()
+	childMerkleVD.IncRef()
+
+	parent.IncRef()
+	child.parent = parent
+	child.name = name
+
+	child.mode = uint32(stat.Mode)
+	child.uid = stat.UID
+	child.gid = stat.GID
+
+	// Verify child hash. This should always be performed unless in
+	// allowRuntimeEnable mode and the parent directory hasn't been enabled
+	// yet.
+	if parent.verityEnabled() {
+		if _, err := fs.verifyChild(ctx, parent, child); err != nil {
+			child.destroyLocked(ctx)
+			return nil, err
+		}
+	}
+	if child.verityEnabled() {
+		if err := fs.verifyStat(ctx, child, stat); err != nil {
+			child.destroyLocked(ctx)
+			return nil, err
+		}
+	}
+
+	return child, nil
 }
 
 // walkParentDirLocked resolves all but the last path component of rp to an
@@ -104,8 +600,39 @@ func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath,
 //
 // Preconditions: fs.renameMu must be locked. !rp.Done().
 func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
-	// TODO(b/159261227): Implement walkParentDirLocked.
-	return nil, nil
+	for !rp.Final() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// Preconditions: fs.renameMu must be locked.
+func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		d.dirMu.Lock()
+		next, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds)
+		d.dirMu.Unlock()
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
 }
 
 // AccessAt implements vfs.Filesystem.Impl.AccessAt.
@@ -179,8 +706,183 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	//TODO(b/159261227): Implement OpenAt.
-	return nil, nil
+	// Verity fs is read-only.
+	if opts.Flags&(linux.O_WRONLY|linux.O_CREAT) != 0 {
+		return nil, syserror.EROFS
+	}
+
+	var ds *[]*dentry
+	fs.renameMu.RLock()
+	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
+
+	start := rp.Start().Impl().(*dentry)
+	if rp.Done() {
+		return start.openLocked(ctx, rp, &opts)
+	}
+
+afterTrailingSymlink:
+	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
+	if err != nil {
+		return nil, err
+	}
+
+	// Check for search permission in the parent directory.
+	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
+		return nil, err
+	}
+
+	// Open existing child or follow symlink.
+	parent.dirMu.Lock()
+	child, err := fs.stepLocked(ctx, rp, parent, false /*mayFollowSymlinks*/, &ds)
+	parent.dirMu.Unlock()
+	if err != nil {
+		return nil, err
+	}
+	if child.isSymlink() && rp.ShouldFollowSymlink() {
+		target, err := child.readlink(ctx)
+		if err != nil {
+			return nil, err
+		}
+		if err := rp.HandleSymlink(target); err != nil {
+			return nil, err
+		}
+		start = parent
+		goto afterTrailingSymlink
+	}
+	return child.openLocked(ctx, rp, &opts)
+}
+
+// Preconditions: fs.renameMu must be locked.
+func (d *dentry) openLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
+	// Users should not open the Merkle tree files. Those are for verity fs
+	// use only.
+	if strings.Contains(d.name, merklePrefix) {
+		return nil, syserror.EPERM
+	}
+	ats := vfs.AccessTypesForOpenFlags(opts)
+	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
+		return nil, err
+	}
+
+	// Verity fs is read-only.
+	if ats&vfs.MayWrite != 0 {
+		return nil, syserror.EROFS
+	}
+
+	// Get the path to the target file. This is only used to provide path
+	// information in failure case.
+	path, err := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.lowerVD)
+	if err != nil {
+		return nil, err
+	}
+
+	// Open the file in the underlying file system.
+	lowerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+	}, opts)
+
+	// The file should exist, as we succeeded in finding its dentry. If it's
+	// missing, it indicates an unexpected modification to the file system.
+	if err != nil {
+		if err == syserror.ENOENT {
+			return nil, alertIntegrityViolation(fmt.Sprintf("File %s expected but not found", path))
+		}
+		return nil, err
+	}
+
+	// lowerFD needs to be cleaned up if any error occurs. IncRef will be
+	// called if a verity FD is successfully created.
+	defer lowerFD.DecRef(ctx)
+
+	// Open the Merkle tree file corresponding to the current file/directory
+	// to be used later for verifying Read/Walk.
+	merkleReader, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerMerkleVD,
+		Start: d.lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+
+	// The Merkle tree file should exist, as we succeeded in finding its
+	// dentry. If it's missing, it indicates an unexpected modification to
+	// the file system.
+	if err != nil {
+		if err == syserror.ENOENT {
+			return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+		}
+		return nil, err
+	}
+
+	// merkleReader needs to be cleaned up if any error occurs. IncRef will
+	// be called if a verity FD is successfully created.
+	defer merkleReader.DecRef(ctx)
+
+	lowerFlags := lowerFD.StatusFlags()
+	lowerFDOpts := lowerFD.Options()
+	var merkleWriter *vfs.FileDescription
+	var parentMerkleWriter *vfs.FileDescription
+
+	// Only open the Merkle tree files for write if in allowRuntimeEnable
+	// mode.
+	if d.fs.allowRuntimeEnable {
+		merkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+			Root:  d.lowerMerkleVD,
+			Start: d.lowerMerkleVD,
+		}, &vfs.OpenOptions{
+			Flags: linux.O_WRONLY | linux.O_APPEND,
+		})
+		if err != nil {
+			if err == syserror.ENOENT {
+				return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", path))
+			}
+			return nil, err
+		}
+		// merkleWriter is cleaned up if any error occurs. IncRef will
+		// be called if a verity FD is created successfully.
+		defer merkleWriter.DecRef(ctx)
+
+		if d.parent != nil {
+			parentMerkleWriter, err = rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
+				Root:  d.parent.lowerMerkleVD,
+				Start: d.parent.lowerMerkleVD,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_WRONLY | linux.O_APPEND,
+			})
+			if err != nil {
+				if err == syserror.ENOENT {
+					parentPath, _ := d.fs.vfsfs.VirtualFilesystem().PathnameWithDeleted(ctx, d.fs.rootDentry.lowerVD, d.parent.lowerVD)
+					return nil, alertIntegrityViolation(fmt.Sprintf("Merkle file for %s expected but not found", parentPath))
+				}
+				return nil, err
+			}
+			// parentMerkleWriter is cleaned up if any error occurs. IncRef
+			// will be called if a verity FD is created successfully.
+			defer parentMerkleWriter.DecRef(ctx)
+		}
+	}
+
+	fd := &fileDescription{
+		d:                  d,
+		lowerFD:            lowerFD,
+		merkleReader:       merkleReader,
+		merkleWriter:       merkleWriter,
+		parentMerkleWriter: parentMerkleWriter,
+		isDir:              d.isDir(),
+	}
+
+	if err := fd.vfsfd.Init(fd, lowerFlags, rp.Mount(), &d.vfsd, &lowerFDOpts); err != nil {
+		return nil, err
+	}
+	lowerFD.IncRef()
+	merkleReader.IncRef()
+	if merkleWriter != nil {
+		merkleWriter.IncRef()
+	}
+	if parentMerkleWriter != nil {
+		parentMerkleWriter.IncRef()
+	}
+	return &fd.vfsfd, err
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
@@ -218,6 +920,8 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 }
 
 // StatAt implements vfs.FilesystemImpl.StatAt.
+// TODO(b/170157489): Investigate whether stats other than Mode/UID/GID should
+// be verified.
 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -235,6 +939,11 @@ func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vf
 	if err != nil {
 		return linux.Statx{}, err
 	}
+	if d.verityEnabled() {
+		if err := fs.verifyStat(ctx, d, stat); err != nil {
+			return linux.Statx{}, err
+		}
+	}
 	return stat, nil
 }
 
@@ -256,7 +965,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
-// BoundEndpointAt implements FilesystemImpl.BoundEndpointAt.
+// BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
@@ -267,8 +976,8 @@ func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath
 	return nil, syserror.ECONNREFUSED
 }
 
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
+// ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
+func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -277,14 +986,14 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath, si
 		return nil, err
 	}
 	lowerVD := d.lowerVD
-	return fs.vfsfs.VirtualFilesystem().ListxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+	return fs.vfsfs.VirtualFilesystem().ListXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  lowerVD,
 		Start: lowerVD,
 	}, size)
 }
 
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetxattrOptions) (string, error) {
+// GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
+func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
 	var ds *[]*dentry
 	fs.renameMu.RLock()
 	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
@@ -293,20 +1002,20 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 		return "", err
 	}
 	lowerVD := d.lowerVD
-	return fs.vfsfs.VirtualFilesystem().GetxattrAt(ctx, d.fs.creds, &vfs.PathOperation{
+	return fs.vfsfs.VirtualFilesystem().GetXattrAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  lowerVD,
 		Start: lowerVD,
 	}, &opts)
 }
 
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+// SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
+func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
 	// Verity file system is read-only.
 	return syserror.EROFS
 }
 
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+// RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
+func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	// Verity file system is read-only.
 	return syserror.EROFS
 }
diff --git a/pkg/sentry/fsimpl/verity/save_restore.go b/pkg/sentry/fsimpl/verity/save_restore.go
new file mode 100644
index 000000000..4a161163c
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/save_restore.go
@@ -0,0 +1,27 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+)
+
+func (d *dentry) afterLoad() {
+	if refsvfs2.LeakCheckEnabled() && atomic.LoadInt64(&d.refs) != -1 {
+		refsvfs2.Register(d, "verity.dentry")
+	}
+}
diff --git a/pkg/sentry/fsimpl/verity/verity.go b/pkg/sentry/fsimpl/verity/verity.go
index cb29d33a5..e2cbb206f 100644
--- a/pkg/sentry/fsimpl/verity/verity.go
+++ b/pkg/sentry/fsimpl/verity/verity.go
@@ -22,29 +22,62 @@
 package verity
 
 import (
+	"fmt"
+	"math"
+	"strconv"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/marshal/primitive"
+	"gvisor.dev/gvisor/pkg/merkletree"
+	"gvisor.dev/gvisor/pkg/refsvfs2"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
 	fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
 )
 
 // Name is the default filesystem name.
 const Name = "verity"
 
-// testOnlyDebugging allows verity file system to return error instead of
-// crashing the application when a malicious action is detected. This should
-// only be set for tests.
-var testOnlyDebugging bool
+// merklePrefix is the prefix of the Merkle tree files. For example, the Merkle
+// tree file for "/foo" is "/.merkle.verity.foo".
+const merklePrefix = ".merkle.verity."
+
+// merkleoffsetInParentXattr is the extended attribute name specifying the
+// offset of child hash in its parent's Merkle tree.
+const merkleOffsetInParentXattr = "user.merkle.offset"
+
+// merkleSizeXattr is the extended attribute name specifying the size of data
+// hashed by the corresponding Merkle tree. For a file, it's the size of the
+// whole file. For a directory, it's the size of all its children's hashes.
+const merkleSizeXattr = "user.merkle.size"
+
+// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
+// extended attributes. The maximum value of a 32 bit integer is 10 digits.
+const sizeOfStringInt32 = 10
+
+// noCrashOnVerificationFailure indicates whether the sandbox should panic
+// whenever verification fails. If true, an error is returned instead of
+// panicking. This should only be set for tests.
+// TOOD(b/165661693): Decide whether to panic or return error based on this
+// flag.
+var noCrashOnVerificationFailure bool
 
 // FilesystemType implements vfs.FilesystemType.
+//
+// +stateify savable
 type FilesystemType struct{}
 
 // filesystem implements vfs.FilesystemImpl.
+//
+// +stateify savable
 type filesystem struct {
 	vfsfs vfs.Filesystem
 
@@ -69,11 +102,24 @@ type filesystem struct {
 	// renameMu synchronizes renaming with non-renaming operations in order
 	// to ensure consistent lock ordering between dentry.dirMu in different
 	// dentries.
-	renameMu sync.RWMutex
+	renameMu sync.RWMutex `state:"nosave"`
+
+	// verityMu synchronizes enabling verity files, protects files or
+	// directories from being enabled by different threads simultaneously.
+	// It also ensures that verity does not access files that are being
+	// enabled.
+	//
+	// Also, the directory Merkle trees depends on the generated trees of
+	// its children. So they shouldn't be enabled the same time. This lock
+	// is for the whole file system to ensure that no more than one file is
+	// enabled the same time.
+	verityMu sync.RWMutex
 }
 
 // InternalFilesystemOptions may be passed as
 // vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
+//
+// +stateify savable
 type InternalFilesystemOptions struct {
 	// RootMerkleFileName is the name of the verity root Merkle tree file.
 	RootMerkleFileName string
@@ -93,10 +139,10 @@ type InternalFilesystemOptions struct {
 	// system wrapped by verity file system.
 	LowerGetFSOptions vfs.GetFilesystemOptions
 
-	// TestOnlyDebugging allows verity file system to return error instead
-	// of crashing the application when a malicious action is detected. This
-	// should only be set for tests.
-	TestOnlyDebugging bool
+	// NoCrashOnVerificationFailure indicates whether the sandbox should
+	// panic whenever verification fails. If true, an error is returned
+	// instead of panicking. This should only be set for tests.
+	NoCrashOnVerificationFailure bool
 }
 
 // Name implements vfs.FilesystemType.Name.
@@ -104,10 +150,129 @@ func (FilesystemType) Name() string {
 	return Name
 }
 
+// Release implements vfs.FilesystemType.Release.
+func (FilesystemType) Release(ctx context.Context) {}
+
+// alertIntegrityViolation alerts a violation of integrity, which usually means
+// unexpected modification to the file system is detected. In
+// noCrashOnVerificationFailure mode, it returns EIO, otherwise it panic.
+func alertIntegrityViolation(msg string) error {
+	if noCrashOnVerificationFailure {
+		return syserror.EIO
+	}
+	panic(msg)
+}
+
 // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	//TODO(b/159261227): Implement GetFilesystem.
-	return nil, nil, nil
+	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
+	if !ok {
+		ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
+		return nil, nil, syserror.EINVAL
+	}
+	noCrashOnVerificationFailure = iopts.NoCrashOnVerificationFailure
+
+	// Mount the lower file system. The lower file system is wrapped inside
+	// verity, and should not be exposed or connected.
+	mopts := &vfs.MountOptions{
+		GetFilesystemOptions: iopts.LowerGetFSOptions,
+		InternalMount:        true,
+	}
+	mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mopts)
+	if err != nil {
+		return nil, nil, err
+	}
+
+	fs := &filesystem{
+		creds:              creds.Fork(),
+		lowerMount:         mnt,
+		allowRuntimeEnable: iopts.AllowRuntimeEnable,
+	}
+	fs.vfsfs.Init(vfsObj, &fstype, fs)
+
+	// Construct the root dentry.
+	d := fs.newDentry()
+	d.refs = 1
+	lowerVD := vfs.MakeVirtualDentry(mnt, mnt.Root())
+	lowerVD.IncRef()
+	d.lowerVD = lowerVD
+
+	rootMerkleName := merklePrefix + iopts.RootMerkleFileName
+
+	lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+		Path:  fspath.Parse(rootMerkleName),
+	}, &vfs.GetDentryOptions{})
+
+	// If runtime enable is allowed, the root merkle tree may be absent. We
+	// should create the tree file.
+	if err == syserror.ENOENT && fs.allowRuntimeEnable {
+		lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.OpenOptions{
+			Flags: linux.O_RDWR | linux.O_CREAT,
+			Mode:  0644,
+		})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+		lowerMerkleFD.DecRef(ctx)
+		lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
+			Root:  lowerVD,
+			Start: lowerVD,
+			Path:  fspath.Parse(rootMerkleName),
+		}, &vfs.GetDentryOptions{})
+		if err != nil {
+			fs.vfsfs.DecRef(ctx)
+			d.DecRef(ctx)
+			return nil, nil, err
+		}
+	} else if err != nil {
+		// Failed to get dentry for the root Merkle file. This
+		// indicates an unexpected modification that removed/renamed
+		// the root Merkle file, or it's never generated.
+		fs.vfsfs.DecRef(ctx)
+		d.DecRef(ctx)
+		return nil, nil, alertIntegrityViolation("Failed to find root Merkle file")
+	}
+	d.lowerMerkleVD = lowerMerkleVD
+
+	// Get metadata from the underlying file system.
+	const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.StatOptions{
+		Mask: statMask,
+	})
+	if err != nil {
+		fs.vfsfs.DecRef(ctx)
+		d.DecRef(ctx)
+		return nil, nil, err
+	}
+
+	d.mode = uint32(stat.Mode)
+	d.uid = stat.UID
+	d.gid = stat.GID
+	d.hash = make([]byte, len(iopts.RootHash))
+
+	if !fs.allowRuntimeEnable {
+		if err := fs.verifyStat(ctx, d, stat); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	copy(d.hash, iopts.RootHash)
+	d.vfsd.Init(d)
+
+	fs.rootDentry = d
+
+	return &fs.vfsfs, &d.vfsd, nil
 }
 
 // Release implements vfs.FilesystemImpl.Release.
@@ -116,6 +281,8 @@ func (fs *filesystem) Release(ctx context.Context) {
 }
 
 // dentry implements vfs.DentryImpl.
+//
+// +stateify savable
 type dentry struct {
 	vfsd vfs.Dentry
 
@@ -124,11 +291,12 @@ type dentry struct {
 	// fs is the owning filesystem. fs is immutable.
 	fs *filesystem
 
-	// mode, uid and gid are the file mode, owner, and group of the file in
-	// the underlying file system.
+	// mode, uid, gid and size are the file mode, owner, group, and size of
+	// the file in the underlying file system.
 	mode uint32
 	uid  uint32
 	gid  uint32
+	size uint32
 
 	// parent is the dentry corresponding to this dentry's parent directory.
 	// name is this dentry's name in parent. If this dentry is a filesystem
@@ -142,7 +310,7 @@ type dentry struct {
 	// and dirents (if not nil) is a cache of dirents as returned by
 	// directoryFDs representing this directory. children is protected by
 	// dirMu.
-	dirMu    sync.Mutex
+	dirMu    sync.Mutex `state:"nosave"`
 	children map[string]*dentry
 
 	// lowerVD is the VirtualDentry in the underlying file system.
@@ -152,8 +320,8 @@ type dentry struct {
 	// in the underlying file system.
 	lowerMerkleVD vfs.VirtualDentry
 
-	// rootHash is the rootHash for the current file or directory.
-	rootHash []byte
+	// hash is the calculated hash for the current file or directory.
+	hash []byte
 }
 
 // newDentry creates a new dentry representing the given verity file. The
@@ -166,6 +334,9 @@ func (fs *filesystem) newDentry() *dentry {
 		fs: fs,
 	}
 	d.vfsd.Init(d)
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Register(d, "verity.dentry")
+	}
 	return d
 }
 
@@ -228,6 +399,9 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 	if d.lowerVD.Ok() {
 		d.lowerVD.DecRef(ctx)
 	}
+	if refsvfs2.LeakCheckEnabled() {
+		refsvfs2.Unregister(d, "verity.dentry")
+	}
 
 	if d.lowerMerkleVD.Ok() {
 		d.lowerMerkleVD.DecRef(ctx)
@@ -247,6 +421,11 @@ func (d *dentry) destroyLocked(ctx context.Context) {
 	}
 }
 
+// LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
+func (d *dentry) LeakMessage() string {
+	return fmt.Sprintf("[verity.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
+}
+
 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
 	//TODO(b/159261227): Implement InotifyWithParent.
@@ -275,6 +454,24 @@ func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes)
 	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
 }
 
+// verityEnabled checks whether the file is enabled with verity features. It
+// should always be true if runtime enable is not allowed. In runtime enable
+// mode, it returns true if the target has been enabled with
+// ioctl(FS_IOC_ENABLE_VERITY).
+func (d *dentry) verityEnabled() bool {
+	return !d.fs.allowRuntimeEnable || len(d.hash) != 0
+}
+
+// getLowerAt returns the dentry in the underlying file system, which is
+// represented by filename relative to d.
+func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) {
+	return vfsObj.GetDentryAt(ctx, d.fs.creds, &vfs.PathOperation{
+		Root:  d.lowerVD,
+		Start: d.lowerVD,
+		Path:  fspath.Parse(filename),
+	}, &vfs.GetDentryOptions{})
+}
+
 func (d *dentry) readlink(ctx context.Context) (string, error) {
 	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
 		Root:  d.lowerVD,
@@ -286,6 +483,8 @@ func (d *dentry) readlink(ctx context.Context) (string, error) {
 // FileDescription is a wrapper of the underlying lowerFD, with support to build
 // Merkle trees through the Linux fs-verity API to verify contents read from
 // lowerFD.
+//
+// +stateify savable
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
@@ -314,6 +513,10 @@ type fileDescription struct {
 	// directory that contains the current file/directory. This is only used
 	// if allowRuntimeEnable is set to true.
 	parentMerkleWriter *vfs.FileDescription
+
+	// off is the file offset. off is protected by mu.
+	mu  sync.Mutex `state:"nosave"`
+	off int64
 }
 
 // Release implements vfs.FileDescriptionImpl.Release.
@@ -335,6 +538,11 @@ func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linu
 	if err != nil {
 		return linux.Statx{}, err
 	}
+	if fd.d.verityEnabled() {
+		if err := fd.d.fs.verifyStat(ctx, fd.d, stat); err != nil {
+			return linux.Statx{}, err
+		}
+	}
 	return stat, nil
 }
 
@@ -344,12 +552,325 @@ func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions)
 	return syserror.EPERM
 }
 
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.mu.Lock()
+	defer fd.mu.Unlock()
+	n := int64(0)
+	switch whence {
+	case linux.SEEK_SET:
+		// use offset as specified
+	case linux.SEEK_CUR:
+		n = fd.off
+	case linux.SEEK_END:
+		n = int64(fd.d.size)
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset > math.MaxInt64-n {
+		return 0, syserror.EINVAL
+	}
+	offset += n
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset
+	return offset, nil
+}
+
+// generateMerkle generates a Merkle tree file for fd. If fd points to a file
+// /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The hash
+// of the generated Merkle tree and the data size is returned.  If fd points to
+// a regular file, the data is the content of the file. If fd points to a
+// directory, the data is all hahes of its children, written to the Merkle tree
+// file.
+func (fd *fileDescription) generateMerkle(ctx context.Context) ([]byte, uint64, error) {
+	fdReader := vfs.FileReadWriteSeeker{
+		FD:  fd.lowerFD,
+		Ctx: ctx,
+	}
+	merkleReader := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleReader,
+		Ctx: ctx,
+	}
+	merkleWriter := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleWriter,
+		Ctx: ctx,
+	}
+	params := &merkletree.GenerateParams{
+		TreeReader: &merkleReader,
+		TreeWriter: &merkleWriter,
+	}
+
+	switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
+	case linux.S_IFREG:
+		// For a regular file, generate a Merkle tree based on its
+		// content.
+		var err error
+		stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.File = &fdReader
+		params.Size = int64(stat.Size)
+		params.Name = fd.d.name
+		params.Mode = uint32(stat.Mode)
+		params.UID = stat.UID
+		params.GID = stat.GID
+		params.DataAndTreeInSameFile = false
+	case linux.S_IFDIR:
+		// For a directory, generate a Merkle tree based on the hashes
+		// of its children that has already been written to the Merkle
+		// tree file.
+		merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.Size = int64(merkleStat.Size)
+
+		stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return nil, 0, err
+		}
+
+		params.File = &merkleReader
+		params.Name = fd.d.name
+		params.Mode = uint32(stat.Mode)
+		params.UID = stat.UID
+		params.GID = stat.GID
+		params.DataAndTreeInSameFile = true
+	default:
+		// TODO(b/167728857): Investigate whether and how we should
+		// enable other types of file.
+		return nil, 0, syserror.EINVAL
+	}
+	hash, err := merkletree.Generate(params)
+	return hash, uint64(params.Size), err
+}
+
+// enableVerity enables verity features on fd by generating a Merkle tree file
+// and stores its hash in its parent directory's Merkle tree.
+func (fd *fileDescription) enableVerity(ctx context.Context, uio usermem.IO) (uintptr, error) {
+	if !fd.d.fs.allowRuntimeEnable {
+		return 0, syserror.EPERM
+	}
+
+	fd.d.fs.verityMu.Lock()
+	defer fd.d.fs.verityMu.Unlock()
+
+	// In allowRuntimeEnable mode, the underlying fd and read/write fd for
+	// the Merkle tree file should have all been initialized. For any file
+	// or directory other than the root, the parent Merkle tree file should
+	// have also been initialized.
+	if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) {
+		return 0, alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
+	}
+
+	hash, dataSize, err := fd.generateMerkle(ctx)
+	if err != nil {
+		return 0, err
+	}
+
+	if fd.parentMerkleWriter != nil {
+		stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
+		if err != nil {
+			return 0, err
+		}
+
+		// Write the hash of fd to the parent directory's Merkle tree
+		// file, as it should be part of the parent Merkle tree data.
+		// parentMerkleWriter is open with O_APPEND, so it should write
+		// directly to the end of the file.
+		if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil {
+			return 0, err
+		}
+
+		// Record the offset of the hash of fd in parent directory's
+		// Merkle tree file.
+		if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+			Name:  merkleOffsetInParentXattr,
+			Value: strconv.Itoa(int(stat.Size)),
+		}); err != nil {
+			return 0, err
+		}
+	}
+
+	// Record the size of the data being hashed for fd.
+	if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
+		Name:  merkleSizeXattr,
+		Value: strconv.Itoa(int(dataSize)),
+	}); err != nil {
+		return 0, err
+	}
+	fd.d.hash = append(fd.d.hash, hash...)
+	return 0, nil
+}
+
+// measureVerity returns the hash of fd, saved in verityDigest.
+func (fd *fileDescription) measureVerity(ctx context.Context, uio usermem.IO, verityDigest usermem.Addr) (uintptr, error) {
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		return 0, syserror.EINVAL
+	}
+	var metadata linux.DigestMetadata
+
+	// If allowRuntimeEnable is true, an empty fd.d.hash indicates that
+	// verity is not enabled for the file. If allowRuntimeEnable is false,
+	// this is an integrity violation because all files should have verity
+	// enabled, in which case fd.d.hash should be set.
+	if len(fd.d.hash) == 0 {
+		if fd.d.fs.allowRuntimeEnable {
+			return 0, syserror.ENODATA
+		}
+		return 0, alertIntegrityViolation("Ioctl measureVerity: no hash found")
+	}
+
+	// The first part of VerityDigest is the metadata.
+	if _, err := metadata.CopyIn(t, verityDigest); err != nil {
+		return 0, err
+	}
+	if metadata.DigestSize < uint16(len(fd.d.hash)) {
+		return 0, syserror.EOVERFLOW
+	}
+
+	// Populate the output digest size, since DigestSize is both input and
+	// output.
+	metadata.DigestSize = uint16(len(fd.d.hash))
+
+	// First copy the metadata.
+	if _, err := metadata.CopyOut(t, verityDigest); err != nil {
+		return 0, err
+	}
+
+	// Now copy the root hash bytes to the memory after metadata.
+	_, err := t.CopyOutBytes(usermem.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash)
+	return 0, err
+}
+
+func (fd *fileDescription) verityFlags(ctx context.Context, uio usermem.IO, flags usermem.Addr) (uintptr, error) {
+	f := int32(0)
+
+	// All enabled files should store a hash. This flag is not settable via
+	// FS_IOC_SETFLAGS.
+	if len(fd.d.hash) != 0 {
+		f |= linux.FS_VERITY_FL
+	}
+
+	t := kernel.TaskFromContext(ctx)
+	if t == nil {
+		return 0, syserror.EINVAL
+	}
+	_, err := primitive.CopyInt32Out(t, flags, f)
+	return 0, err
+}
+
+// Ioctl implements vfs.FileDescriptionImpl.Ioctl.
+func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	switch cmd := args[1].Uint(); cmd {
+	case linux.FS_IOC_ENABLE_VERITY:
+		return fd.enableVerity(ctx, uio)
+	case linux.FS_IOC_MEASURE_VERITY:
+		return fd.measureVerity(ctx, uio, args[2].Pointer())
+	case linux.FS_IOC_GETFLAGS:
+		return fd.verityFlags(ctx, uio, args[2].Pointer())
+	default:
+		// TODO(b/169682228): Investigate which ioctl commands should
+		// be allowed.
+		return 0, syserror.ENOSYS
+	}
+}
+
+// Read implements vfs.FileDescriptionImpl.Read.
+func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	// Implement Read with PRead by setting offset.
+	fd.mu.Lock()
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n
+	fd.mu.Unlock()
+	return n, err
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	// No need to verify if the file is not enabled yet in
+	// allowRuntimeEnable mode.
+	if !fd.d.verityEnabled() {
+		return fd.lowerFD.PRead(ctx, dst, offset, opts)
+	}
+
+	fd.d.fs.verityMu.RLock()
+	defer fd.d.fs.verityMu.RUnlock()
+	// dataSize is the size of the whole file.
+	dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
+		Name: merkleSizeXattr,
+		Size: sizeOfStringInt32,
+	})
+
+	// The Merkle tree file for the child should have been created and
+	// contains the expected xattrs. If the xattr does not exist, it
+	// indicates unexpected modifications to the file system.
+	if err == syserror.ENODATA {
+		return 0, alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
+	}
+	if err != nil {
+		return 0, err
+	}
+
+	// The dataSize xattr should be an integer. If it's not, it indicates
+	// unexpected modifications to the file system.
+	size, err := strconv.Atoi(dataSize)
+	if err != nil {
+		return 0, alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
+	}
+
+	dataReader := vfs.FileReadWriteSeeker{
+		FD:  fd.lowerFD,
+		Ctx: ctx,
+	}
+
+	merkleReader := vfs.FileReadWriteSeeker{
+		FD:  fd.merkleReader,
+		Ctx: ctx,
+	}
+
+	n, err := merkletree.Verify(&merkletree.VerifyParams{
+		Out:                   dst.Writer(ctx),
+		File:                  &dataReader,
+		Tree:                  &merkleReader,
+		Size:                  int64(size),
+		Name:                  fd.d.name,
+		Mode:                  fd.d.mode,
+		UID:                   fd.d.uid,
+		GID:                   fd.d.gid,
+		ReadOffset:            offset,
+		ReadSize:              dst.NumBytes(),
+		Expected:              fd.d.hash,
+		DataAndTreeInSameFile: false,
+	})
+	if err != nil {
+		return 0, alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
+	}
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite.
+func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EROFS
+}
+
+// Write implements vfs.FileDescriptionImpl.Write.
+func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	return 0, syserror.EROFS
+}
+
 // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
 func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, start, length uint64, whence int16, block fslock.Blocker) error {
-	return fd.Locks().LockPOSIX(ctx, &fd.vfsfd, uid, t, start, length, whence, block)
+	return fd.lowerFD.LockPOSIX(ctx, uid, t, start, length, whence, block)
 }
 
 // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
 func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, start, length uint64, whence int16) error {
-	return fd.Locks().UnlockPOSIX(ctx, &fd.vfsfd, uid, start, length, whence)
+	return fd.lowerFD.UnlockPOSIX(ctx, uid, start, length, whence)
 }
diff --git a/pkg/sentry/fsimpl/verity/verity_test.go b/pkg/sentry/fsimpl/verity/verity_test.go
new file mode 100644
index 000000000..c647cbfd3
--- /dev/null
+++ b/pkg/sentry/fsimpl/verity/verity_test.go
@@ -0,0 +1,700 @@
+// Copyright 2020 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package verity
+
+import (
+	"fmt"
+	"io"
+	"math/rand"
+	"testing"
+	"time"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/context"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/arch"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/testutil"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+	"gvisor.dev/gvisor/pkg/usermem"
+)
+
+// rootMerkleFilename is the name of the root Merkle tree file.
+const rootMerkleFilename = "root.verity"
+
+// maxDataSize is the maximum data size written to the file for test.
+const maxDataSize = 100000
+
+// newVerityRoot creates a new verity mount, and returns the root. The
+// underlying file system is tmpfs. If the error is not nil, then cleanup
+// should be called when the root is no longer needed.
+func newVerityRoot(t *testing.T) (*vfs.VirtualFilesystem, vfs.VirtualDentry, *kernel.Task, error) {
+	k, err := testutil.Boot()
+	if err != nil {
+		t.Fatalf("testutil.Boot: %v", err)
+	}
+
+	ctx := k.SupervisorContext()
+
+	rand.Seed(time.Now().UnixNano())
+	vfsObj := &vfs.VirtualFilesystem{}
+	if err := vfsObj.Init(ctx); err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("VFS init: %v", err)
+	}
+
+	vfsObj.MustRegisterFilesystemType("verity", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+
+	mntns, err := vfsObj.NewMountNamespace(ctx, auth.CredentialsFromContext(ctx), "", "verity", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: InternalFilesystemOptions{
+				RootMerkleFileName:           rootMerkleFilename,
+				LowerName:                    "tmpfs",
+				AllowRuntimeEnable:           true,
+				NoCrashOnVerificationFailure: true,
+			},
+		},
+	})
+	if err != nil {
+		return nil, vfs.VirtualDentry{}, nil, fmt.Errorf("NewMountNamespace: %v", err)
+	}
+	root := mntns.Root()
+	root.IncRef()
+
+	// Use lowerRoot in the task as we modify the lower file system
+	// directly in many tests.
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+	tc := k.NewThreadGroup(nil, k.RootPIDNamespace(), kernel.NewSignalHandlers(), linux.SIGCHLD, k.GlobalInit().Limits())
+	task, err := testutil.CreateTask(ctx, "name", tc, mntns, lowerRoot, lowerRoot)
+	if err != nil {
+		t.Fatalf("testutil.CreateTask: %v", err)
+	}
+
+	t.Helper()
+	t.Cleanup(func() {
+		root.DecRef(ctx)
+		mntns.DecRef(ctx)
+	})
+	return vfsObj, root, task, nil
+}
+
+// newFileFD creates a new file in the verity mount, and returns the FD. The FD
+// points to a file that has random data generated.
+func newFileFD(ctx context.Context, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, filePath string, mode linux.FileMode) (*vfs.FileDescription, int, error) {
+	creds := auth.CredentialsFromContext(ctx)
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+
+	// Create the file in the underlying file system.
+	lowerFD, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(filePath),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  linux.ModeRegular | mode,
+	})
+	if err != nil {
+		return nil, 0, err
+	}
+
+	// Generate random data to be written to the file.
+	dataSize := rand.Intn(maxDataSize) + 1
+	data := make([]byte, dataSize)
+	rand.Read(data)
+
+	// Write directly to the underlying FD, since verity FD is read-only.
+	n, err := lowerFD.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		return nil, 0, err
+	}
+
+	if n != int64(len(data)) {
+		return nil, 0, fmt.Errorf("lowerFD.Write got write length %d, want %d", n, len(data))
+	}
+
+	lowerFD.DecRef(ctx)
+
+	// Now open the verity file descriptor.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filePath),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular | mode,
+	})
+	return fd, dataSize, err
+}
+
+// corruptRandomBit randomly flips a bit in the file represented by fd.
+func corruptRandomBit(ctx context.Context, fd *vfs.FileDescription, size int) error {
+	// Flip a random bit in the underlying file.
+	randomPos := int64(rand.Intn(size))
+	byteToModify := make([]byte, 1)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.ReadOptions{}); err != nil {
+		return fmt.Errorf("lowerFD.PRead: %v", err)
+	}
+	byteToModify[0] ^= 1
+	if _, err := fd.PWrite(ctx, usermem.BytesIOSequence(byteToModify), randomPos, vfs.WriteOptions{}); err != nil {
+		return fmt.Errorf("lowerFD.PWrite: %v", err)
+	}
+	return nil
+}
+
+// TestOpen ensures that when a file is created, the corresponding Merkle tree
+// file and the root Merkle tree file exist.
+func TestOpen(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	if _, _, err := newFileFD(ctx, vfsObj, root, filename, 0644); err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Ensure that the corresponding Merkle tree file is created.
+	lowerRoot := root.Dentry().Impl().(*dentry).lowerVD
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(merklePrefix + filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}); err != nil {
+		t.Errorf("OpenAt Merkle tree file %s: %v", merklePrefix+filename, err)
+	}
+
+	// Ensure the root merkle tree file is created.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerRoot,
+		Start: lowerRoot,
+		Path:  fspath.Parse(merklePrefix + rootMerkleFilename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	}); err != nil {
+		t.Errorf("OpenAt root Merkle tree file %s: %v", merklePrefix+rootMerkleFilename, err)
+	}
+}
+
+// TestPReadUnmodifiedFileSucceeds ensures that pread from an untouched verity
+// file succeeds after enabling verity for it.
+func TestPReadUnmodifiedFileSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirm a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	buf := make([]byte, size)
+	n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.PRead: %v", err)
+	}
+
+	if n != int64(size) {
+		t.Errorf("fd.PRead got read length %d, want %d", n, size)
+	}
+}
+
+// TestReadUnmodifiedFileSucceeds ensures that read from an untouched verity
+// file succeeds after enabling verity for it.
+func TestReadUnmodifiedFileSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirm a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	buf := make([]byte, size)
+	n, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.Read: %v", err)
+	}
+
+	if n != int64(size) {
+		t.Errorf("fd.PRead got read length %d, want %d", n, size)
+	}
+}
+
+// TestReopenUnmodifiedFileSucceeds ensures that reopen an untouched verity file
+// succeeds after enabling verity for it.
+func TestReopenUnmodifiedFileSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirms a normal read succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Ensure reopening the verity enabled file succeeds.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular,
+	}); err != nil {
+		t.Errorf("reopen enabled file failed: %v", err)
+	}
+}
+
+// TestPReadModifiedFileFails ensures that read from a modified verity file
+// fails.
+func TestPReadModifiedFileFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerFD that's read/writable.
+	lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+	lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from the modified file fails.
+	buf := make([]byte, size)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+		t.Fatalf("fd.PRead succeeded, expected failure")
+	}
+}
+
+// TestReadModifiedFileFails ensures that read from a modified verity file
+// fails.
+func TestReadModifiedFileFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerFD that's read/writable.
+	lowerVD := fd.Impl().(*fileDescription).d.lowerVD
+
+	lowerFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerVD,
+		Start: lowerVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if err := corruptRandomBit(ctx, lowerFD, size); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from the modified file fails.
+	buf := make([]byte, size)
+	if _, err := fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{}); err == nil {
+		t.Fatalf("fd.Read succeeded, expected failure")
+	}
+}
+
+// TestModifiedMerkleFails ensures that read from a verity file fails if the
+// corresponding Merkle tree file is modified.
+func TestModifiedMerkleFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, size, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerMerkleFD that's read/writable.
+	lowerMerkleVD := fd.Impl().(*fileDescription).d.lowerMerkleVD
+
+	lowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  lowerMerkleVD,
+		Start: lowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	// Flip a random bit in the Merkle tree file.
+	stat, err := lowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat: %v", err)
+	}
+	merkleSize := int(stat.Size)
+	if err := corruptRandomBit(ctx, lowerMerkleFD, merkleSize); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	// Confirm that read from a file with modified Merkle tree fails.
+	buf := make([]byte, size)
+	if _, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), 0 /* offset */, vfs.ReadOptions{}); err == nil {
+		fmt.Println(buf)
+		t.Fatalf("fd.PRead succeeded with modified Merkle file")
+	}
+}
+
+// TestModifiedParentMerkleFails ensures that open a verity enabled file in a
+// verity enabled directory fails if the hashes related to the target file in
+// the parent Merkle tree file is modified.
+func TestModifiedParentMerkleFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Enable verity on the parent directory.
+	parentFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	if _, err := parentFD.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("Ioctl: %v", err)
+	}
+
+	// Open a new lowerMerkleFD that's read/writable.
+	parentLowerMerkleVD := fd.Impl().(*fileDescription).d.parent.lowerMerkleVD
+
+	parentLowerMerkleFD, err := vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  parentLowerMerkleVD,
+		Start: parentLowerMerkleVD,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR,
+	})
+	if err != nil {
+		t.Fatalf("OpenAt: %v", err)
+	}
+
+	// Flip a random bit in the parent Merkle tree file.
+	// This parent directory contains only one child, so any random
+	// modification in the parent Merkle tree should cause verification
+	// failure when opening the child file.
+	stat, err := parentLowerMerkleFD.Stat(ctx, vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat: %v", err)
+	}
+	parentMerkleSize := int(stat.Size)
+	if err := corruptRandomBit(ctx, parentLowerMerkleFD, parentMerkleSize); err != nil {
+		t.Fatalf("corruptRandomBit: %v", err)
+	}
+
+	parentLowerMerkleFD.DecRef(ctx)
+
+	// Ensure reopening the verity enabled file fails.
+	if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(filename),
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDONLY,
+		Mode:  linux.ModeRegular,
+	}); err == nil {
+		t.Errorf("OpenAt file with modified parent Merkle succeeded")
+	}
+}
+
+// TestUnmodifiedStatSucceeds ensures that stat of an untouched verity file
+// succeeds after enabling verity for it.
+func TestUnmodifiedStatSucceeds(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file and confirms stat succeeds.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("fd.Ioctl: %v", err)
+	}
+
+	if _, err := fd.Stat(ctx, vfs.StatOptions{}); err != nil {
+		t.Errorf("fd.Stat: %v", err)
+	}
+}
+
+// TestModifiedStatFails checks that getting stat for a file with modified stat
+// should fail.
+func TestModifiedStatFails(t *testing.T) {
+	vfsObj, root, ctx, err := newVerityRoot(t)
+	if err != nil {
+		t.Fatalf("newVerityRoot: %v", err)
+	}
+
+	filename := "verity-test-file"
+	fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+	if err != nil {
+		t.Fatalf("newFileFD: %v", err)
+	}
+
+	// Enable verity on the file.
+	var args arch.SyscallArguments
+	args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+	if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+		t.Fatalf("fd.Ioctl: %v", err)
+	}
+
+	lowerFD := fd.Impl().(*fileDescription).lowerFD
+	// Change the stat of the underlying file, and check that stat fails.
+	if err := lowerFD.SetStat(ctx, vfs.SetStatOptions{
+		Stat: linux.Statx{
+			Mask: uint32(linux.STATX_MODE),
+			Mode: 0777,
+		},
+	}); err != nil {
+		t.Fatalf("lowerFD.SetStat: %v", err)
+	}
+
+	if _, err := fd.Stat(ctx, vfs.StatOptions{}); err == nil {
+		t.Errorf("fd.Stat succeeded when it should fail")
+	}
+}
+
+// TestOpenDeletedOrRenamedFileFails ensures that opening a deleted/renamed
+// verity enabled file or the corresponding Merkle tree file fails with the
+// verify error.
+func TestOpenDeletedFileFails(t *testing.T) {
+	testCases := []struct {
+		// Tests removing files is remove is true. Otherwise tests
+		// renaming files.
+		remove bool
+		// The original file is removed/renamed if changeFile is true.
+		changeFile bool
+		// The Merkle tree file is removed/renamed if changeMerkleFile
+		// is true.
+		changeMerkleFile bool
+	}{
+		{
+			remove:           true,
+			changeFile:       true,
+			changeMerkleFile: false,
+		},
+		{
+			remove:           true,
+			changeFile:       false,
+			changeMerkleFile: true,
+		},
+		{
+			remove:           false,
+			changeFile:       true,
+			changeMerkleFile: false,
+		},
+		{
+			remove:           false,
+			changeFile:       true,
+			changeMerkleFile: false,
+		},
+	}
+	for _, tc := range testCases {
+		t.Run(fmt.Sprintf("remove:%t", tc.remove), func(t *testing.T) {
+			vfsObj, root, ctx, err := newVerityRoot(t)
+			if err != nil {
+				t.Fatalf("newVerityRoot: %v", err)
+			}
+
+			filename := "verity-test-file"
+			fd, _, err := newFileFD(ctx, vfsObj, root, filename, 0644)
+			if err != nil {
+				t.Fatalf("newFileFD: %v", err)
+			}
+
+			// Enable verity on the file.
+			var args arch.SyscallArguments
+			args[1] = arch.SyscallArgument{Value: linux.FS_IOC_ENABLE_VERITY}
+			if _, err := fd.Ioctl(ctx, nil /* uio */, args); err != nil {
+				t.Fatalf("Ioctl: %v", err)
+			}
+
+			rootLowerVD := root.Dentry().Impl().(*dentry).lowerVD
+			if tc.remove {
+				if tc.changeFile {
+					if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(filename),
+					}); err != nil {
+						t.Fatalf("UnlinkAt: %v", err)
+					}
+				}
+				if tc.changeMerkleFile {
+					if err := vfsObj.UnlinkAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(merklePrefix + filename),
+					}); err != nil {
+						t.Fatalf("UnlinkAt: %v", err)
+					}
+				}
+			} else {
+				newFilename := "renamed-test-file"
+				if tc.changeFile {
+					if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(filename),
+					}, &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(newFilename),
+					}, &vfs.RenameOptions{}); err != nil {
+						t.Fatalf("RenameAt: %v", err)
+					}
+				}
+				if tc.changeMerkleFile {
+					if err := vfsObj.RenameAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(merklePrefix + filename),
+					}, &vfs.PathOperation{
+						Root:  rootLowerVD,
+						Start: rootLowerVD,
+						Path:  fspath.Parse(merklePrefix + newFilename),
+					}, &vfs.RenameOptions{}); err != nil {
+						t.Fatalf("UnlinkAt: %v", err)
+					}
+				}
+			}
+
+			// Ensure reopening the verity enabled file fails.
+			if _, err = vfsObj.OpenAt(ctx, auth.CredentialsFromContext(ctx), &vfs.PathOperation{
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(filename),
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDONLY,
+				Mode:  linux.ModeRegular,
+			}); err != syserror.EIO {
+				t.Errorf("got OpenAt error: %v, expected EIO", err)
+			}
+		})
+	}
+}