From c0f89eba6ebdec08460bd796fc62d6aef674d141 Mon Sep 17 00:00:00 2001
From: Adin Scannell <ascannell@google.com>
Date: Thu, 21 Nov 2019 11:29:49 -0800
Subject: Import and structure cleanup.

PiperOrigin-RevId: 281795269
---
 pkg/sentry/fsimpl/memfs/BUILD | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 04d667273..952b20c51 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -1,10 +1,9 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
 go_template_instance(
     name = "dentry_list",
     out = "dentry_list.go",
-- 
cgit v1.2.3


From 128948d6ae94009c6ad13a0bd96e03e45a560477 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 25 Nov 2019 15:20:25 -0800
Subject: Implement basic umounting for vfs2.

This is required to test filesystems with a non-trivial implementation of
FilesystemImpl.Release(). Propagation isn't handled yet, and umount isn't yet
plumbed out to VirtualFilesystem.UmountAt(), but otherwise the implementation
of umount is believed to be correct.

- Move entering mountTable.seq writer critical sections to callers of
  mountTable.{insert,remove}Seqed. This is required since umount(2) must ensure
  that no new references are taken on the candidate mount after checking that
  it isn't busy, which is only possible by entering a vfs.mountTable.seq writer
  critical section before the check and remaining in it until after
  VFS.umountRecursiveLocked() is complete. (Linux does the same thing:
  fs/namespace.c:do_umount() => lock_mount_hash(),
  fs/pnode.c:propagate_mount_busy(), umount_tree(), unlock_mount_hash().)

- It's not possible for dentry deletion to umount while only holding
  VFS.mountMu for reading, but it's also very unappealing to hold VFS.mountMu
  exclusively around e.g. gofer unlink RPCs. Introduce dentry.mu to avoid these
  problems. This means that VFS.mountMu is never acquired for reading, so
  change it to a sync.Mutex.

PiperOrigin-RevId: 282444343
---
 pkg/sentry/fsimpl/memfs/BUILD             |   4 +-
 pkg/sentry/fsimpl/memfs/benchmark_test.go |  22 ++-
 pkg/sentry/vfs/README.md                  |   4 +-
 pkg/sentry/vfs/dentry.go                  | 128 +++++++++---
 pkg/sentry/vfs/mount.go                   | 319 +++++++++++++++++++++---------
 pkg/sentry/vfs/mount_test.go              |  34 ++--
 pkg/sentry/vfs/mount_unsafe.go            |  60 +++---
 pkg/sentry/vfs/resolving_path.go          |   8 +-
 pkg/sentry/vfs/syscalls.go                |   2 +
 pkg/sentry/vfs/vfs.go                     |  11 +-
 10 files changed, 423 insertions(+), 169 deletions(-)

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 952b20c51..bc5c0b591 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -1,9 +1,10 @@
 load("//tools/go_stateify:defs.bzl", "go_library")
 load("@io_bazel_rules_go//go:def.bzl", "go_test")
-load("//tools/go_generics:defs.bzl", "go_template_instance")
 
 package(licenses = ["notice"])
 
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
 go_template_instance(
     name = "dentry_list",
     out = "dentry_list.go",
@@ -48,6 +49,7 @@ go_test(
     deps = [
         ":memfs",
         "//pkg/abi/linux",
+        "//pkg/refs",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fs",
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index a94b17db6..23a846c08 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fs"
@@ -160,6 +161,8 @@ func BenchmarkVFS1TmpfsStat(b *testing.B) {
 					b.Fatalf("stat(%q) failed: %v", filePath, err)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
@@ -177,6 +180,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
+			defer mntns.DecRef(vfsObj)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
@@ -186,7 +190,6 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			defer root.DecRef()
 			vd := root
 			vd.IncRef()
-			defer vd.DecRef()
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
@@ -219,6 +222,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
 				Mode:  0644,
 			})
+			vd.DecRef()
+			vd = vfs.VirtualDentry{}
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
@@ -243,6 +248,8 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
@@ -343,6 +350,8 @@ func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
 					b.Fatalf("stat(%q) failed: %v", filePath, err)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
@@ -360,6 +369,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
+			defer mntns.DecRef(vfsObj)
 
 			var filePathBuilder strings.Builder
 			filePathBuilder.WriteByte('/')
@@ -395,7 +405,6 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			if err != nil {
 				b.Fatalf("failed to walk to mount root: %v", err)
 			}
-			defer vd.DecRef()
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
@@ -435,6 +444,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
 				Mode:  0644,
 			})
+			vd.DecRef()
 			if err != nil {
 				b.Fatalf("failed to create file %q: %v", filename, err)
 			}
@@ -459,6 +469,14 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
 				}
 			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
 		})
 	}
 }
+
+func init() {
+	// Turn off reference leak checking for a fair comparison between vfs1 and
+	// vfs2.
+	refs.SetLeakMode(refs.NoLeakChecking)
+}
diff --git a/pkg/sentry/vfs/README.md b/pkg/sentry/vfs/README.md
index 7847854bc..9aa133bcb 100644
--- a/pkg/sentry/vfs/README.md
+++ b/pkg/sentry/vfs/README.md
@@ -39,8 +39,8 @@ Mount references are held by:
 -   Mount: Each referenced Mount holds a reference on its parent, which is the
     mount containing its mount point.
 
--   VirtualFilesystem: A reference is held on all Mounts that are attached
-    (reachable by Mount traversal).
+-   VirtualFilesystem: A reference is held on each Mount that has not been
+    umounted.
 
 MountNamespace and FileDescription references are held by users of VFS. The
 expectation is that each `kernel.Task` holds a reference on its corresponding
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 45912fc58..09ed5a70e 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -16,6 +16,7 @@ package vfs
 
 import (
 	"fmt"
+	"sync"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -50,7 +51,7 @@ import (
 // and not inodes. Furthermore, when parties outside the scope of VFS can
 // rename inodes on such filesystems, VFS generally cannot "follow" the rename,
 // both due to synchronization issues and because it may not even be able to
-// name the destination path; this implies that it would in fact be *incorrect*
+// name the destination path; this implies that it would in fact be incorrect
 // for Dentries to be associated with inodes on such filesystems. Consequently,
 // operations that are inode operations in Linux are FilesystemImpl methods
 // and/or FileDescriptionImpl methods in gVisor's VFS. Filesystems that do
@@ -84,6 +85,9 @@ type Dentry struct {
 	// mounts is accessed using atomic memory operations.
 	mounts uint32
 
+	// mu synchronizes disowning and mounting over this Dentry.
+	mu sync.Mutex
+
 	// children are child Dentries.
 	children map[string]*Dentry
 
@@ -228,36 +232,48 @@ func (vfs *VirtualFilesystem) PrepareDeleteDentry(mntns *MountNamespace, d *Dent
 			panic("d is already disowned")
 		}
 	}
-	vfs.mountMu.RLock()
-	if _, ok := mntns.mountpoints[d]; ok {
-		vfs.mountMu.RUnlock()
+	vfs.mountMu.Lock()
+	if mntns.mountpoints[d] != 0 {
+		vfs.mountMu.Unlock()
 		return syserror.EBUSY
 	}
-	// Return with vfs.mountMu locked, which will be unlocked by
-	// AbortDeleteDentry or CommitDeleteDentry.
+	d.mu.Lock()
+	vfs.mountMu.Unlock()
+	// Return with d.mu locked to block attempts to mount over it; it will be
+	// unlocked by AbortDeleteDentry or CommitDeleteDentry.
 	return nil
 }
 
 // AbortDeleteDentry must be called after PrepareDeleteDentry if the deletion
 // fails.
-func (vfs *VirtualFilesystem) AbortDeleteDentry() {
-	vfs.mountMu.RUnlock()
+func (vfs *VirtualFilesystem) AbortDeleteDentry(d *Dentry) {
+	d.mu.Unlock()
 }
 
 // CommitDeleteDentry must be called after the file represented by d is
 // deleted, and causes d to become disowned.
 //
+// CommitDeleteDentry is a mutator of d and d.Parent().
+//
 // Preconditions: PrepareDeleteDentry was previously called on d.
 func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
-	delete(d.parent.children, d.name)
+	if d.parent != nil {
+		delete(d.parent.children, d.name)
+	}
 	d.setDisowned()
-	// TODO: lazily unmount mounts at d
-	vfs.mountMu.RUnlock()
+	d.mu.Unlock()
+	if d.isMounted() {
+		vfs.forgetDisownedMountpoint(d)
+	}
 }
 
 // DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
 // appropriate for in-memory filesystems that don't need to ensure that some
 // external state change succeeds before committing the deletion.
+//
+// DeleteDentry is a mutator of d and d.Parent().
+//
+// Preconditions: d is a child Dentry.
 func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
 	if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
 		return err
@@ -266,6 +282,27 @@ func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) err
 	return nil
 }
 
+// ForceDeleteDentry causes d to become disowned. It should only be used in
+// cases where VFS has no ability to stop the deletion (e.g. d represents the
+// local state of a file on a remote filesystem on which the file has already
+// been deleted).
+//
+// ForceDeleteDentry is a mutator of d and d.Parent().
+//
+// Preconditions: d is a child Dentry.
+func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
+	if checkInvariants {
+		if d.parent == nil {
+			panic("d is independent")
+		}
+		if d.IsDisowned() {
+			panic("d is already disowned")
+		}
+	}
+	d.mu.Lock()
+	vfs.CommitDeleteDentry(d)
+}
+
 // PrepareRenameDentry must be called before attempting to rename the file
 // represented by from. If to is not nil, it represents the file that will be
 // replaced or exchanged by the rename. If PrepareRenameDentry succeeds, the
@@ -291,18 +328,21 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t
 			}
 		}
 	}
-	vfs.mountMu.RLock()
-	if _, ok := mntns.mountpoints[from]; ok {
-		vfs.mountMu.RUnlock()
+	vfs.mountMu.Lock()
+	if mntns.mountpoints[from] != 0 {
+		vfs.mountMu.Unlock()
 		return syserror.EBUSY
 	}
 	if to != nil {
-		if _, ok := mntns.mountpoints[to]; ok {
-			vfs.mountMu.RUnlock()
+		if mntns.mountpoints[to] != 0 {
+			vfs.mountMu.Unlock()
 			return syserror.EBUSY
 		}
+		to.mu.Lock()
 	}
-	// Return with vfs.mountMu locked, which will be unlocked by
+	from.mu.Lock()
+	vfs.mountMu.Unlock()
+	// Return with from.mu (and to.mu, if to != nil) locked, to be unlocked by
 	// AbortRenameDentry, CommitRenameReplaceDentry, or
 	// CommitRenameExchangeDentry.
 	return nil
@@ -310,38 +350,76 @@ func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, t
 
 // AbortRenameDentry must be called after PrepareRenameDentry if the rename
 // fails.
-func (vfs *VirtualFilesystem) AbortRenameDentry() {
-	vfs.mountMu.RUnlock()
+func (vfs *VirtualFilesystem) AbortRenameDentry(from, to *Dentry) {
+	from.mu.Unlock()
+	if to != nil {
+		to.mu.Unlock()
+	}
 }
 
 // CommitRenameReplaceDentry must be called after the file represented by from
 // is renamed without RENAME_EXCHANGE. If to is not nil, it represents the file
 // that was replaced by from.
 //
+// CommitRenameReplaceDentry is a mutator of from, to, from.Parent(), and
+// to.Parent().
+//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
 // newParent.Child(newName) == to.
 func (vfs *VirtualFilesystem) CommitRenameReplaceDentry(from, newParent *Dentry, newName string, to *Dentry) {
-	if to != nil {
-		to.setDisowned()
-		// TODO: lazily unmount mounts at d
-	}
 	if newParent.children == nil {
 		newParent.children = make(map[string]*Dentry)
 	}
 	newParent.children[newName] = from
 	from.parent = newParent
 	from.name = newName
-	vfs.mountMu.RUnlock()
+	from.mu.Unlock()
+	if to != nil {
+		to.setDisowned()
+		to.mu.Unlock()
+		if to.isMounted() {
+			vfs.forgetDisownedMountpoint(to)
+		}
+	}
 }
 
 // CommitRenameExchangeDentry must be called after the files represented by
 // from and to are exchanged by rename(RENAME_EXCHANGE).
 //
+// CommitRenameExchangeDentry is a mutator of from, to, from.Parent(), and
+// to.Parent().
+//
 // Preconditions: PrepareRenameDentry was previously called on from and to.
 func (vfs *VirtualFilesystem) CommitRenameExchangeDentry(from, to *Dentry) {
 	from.parent, to.parent = to.parent, from.parent
 	from.name, to.name = to.name, from.name
 	from.parent.children[from.name] = from
 	to.parent.children[to.name] = to
-	vfs.mountMu.RUnlock()
+	from.mu.Unlock()
+	to.mu.Unlock()
+}
+
+// forgetDisownedMountpoint is called when a mount point is deleted to umount
+// all mounts using it in all other mount namespaces.
+//
+// forgetDisownedMountpoint is analogous to Linux's
+// fs/namespace.c:__detach_mounts().
+func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) {
+	var (
+		vdsToDecRef    []VirtualDentry
+		mountsToDecRef []*Mount
+	)
+	vfs.mountMu.Lock()
+	vfs.mounts.seq.BeginWrite()
+	for mnt := range vfs.mountpoints[d] {
+		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{}, vdsToDecRef, mountsToDecRef)
+	}
+	vfs.mounts.seq.EndWrite()
+	vfs.mountMu.Unlock()
+	for _, vd := range vdsToDecRef {
+		vd.DecRef()
+	}
+	for _, mnt := range mountsToDecRef {
+		mnt.decRef()
+	}
 }
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 11702f720..198fb8067 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -38,16 +38,12 @@ import (
 // Mount is analogous to Linux's struct mount. (gVisor does not distinguish
 // between struct mount and struct vfsmount.)
 type Mount struct {
-	// The lower 63 bits of refs are a reference count. The MSB of refs is set
-	// if the Mount has been eagerly unmounted, as by umount(2) without the
-	// MNT_DETACH flag. refs is accessed using atomic memory operations.
-	refs int64
-
-	// The lower 63 bits of writers is the number of calls to
-	// Mount.CheckBeginWrite() that have not yet been paired with a call to
-	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
-	// writers is accessed using atomic memory operations.
-	writers int64
+	// vfs, fs, and root are immutable. References are held on fs and root.
+	//
+	// Invariant: root belongs to fs.
+	vfs  *VirtualFilesystem
+	fs   *Filesystem
+	root *Dentry
 
 	// key is protected by VirtualFilesystem.mountMu and
 	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
@@ -57,13 +53,29 @@ type Mount struct {
 	// key.parent.fs.
 	key mountKey
 
-	// fs, root, and ns are immutable. References are held on fs and root (but
-	// not ns).
-	//
-	// Invariant: root belongs to fs.
-	fs   *Filesystem
-	root *Dentry
-	ns   *MountNamespace
+	// ns is the namespace in which this Mount was mounted. ns is protected by
+	// VirtualFilesystem.mountMu.
+	ns *MountNamespace
+
+	// The lower 63 bits of refs are a reference count. The MSB of refs is set
+	// if the Mount has been eagerly umounted, as by umount(2) without the
+	// MNT_DETACH flag. refs is accessed using atomic memory operations.
+	refs int64
+
+	// children is the set of all Mounts for which Mount.key.parent is this
+	// Mount. children is protected by VirtualFilesystem.mountMu.
+	children map[*Mount]struct{}
+
+	// umounted is true if VFS.umountRecursiveLocked() has been called on this
+	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
+	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
+	umounted bool
+
+	// The lower 63 bits of writers is the number of calls to
+	// Mount.CheckBeginWrite() that have not yet been paired with a call to
+	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
+	// writers is accessed using atomic memory operations.
+	writers int64
 }
 
 // A MountNamespace is a collection of Mounts.
@@ -73,13 +85,16 @@ type Mount struct {
 //
 // MountNamespace is analogous to Linux's struct mnt_namespace.
 type MountNamespace struct {
-	refs int64 // accessed using atomic memory operations
-
 	// root is the MountNamespace's root mount. root is immutable.
 	root *Mount
 
-	// mountpoints contains all Dentries which are mount points in this
-	// namespace. mountpoints is protected by VirtualFilesystem.mountMu.
+	// refs is the reference count. refs is accessed using atomic memory
+	// operations.
+	refs int64
+
+	// mountpoints maps all Dentries which are mount points in this namespace
+	// to the number of Mounts for which they are mount points. mountpoints is
+	// protected by VirtualFilesystem.mountMu.
 	//
 	// mountpoints is used to determine if a Dentry can be moved or removed
 	// (which requires that the Dentry is not a mount point in the calling
@@ -89,7 +104,7 @@ type MountNamespace struct {
 	// MountNamespace; this is required to ensure that
 	// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
 	// correctly on unreferenced MountNamespaces.
-	mountpoints map[*Dentry]struct{}
+	mountpoints map[*Dentry]uint32
 }
 
 // NewMountNamespace returns a new mount namespace with a root filesystem
@@ -106,9 +121,10 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	}
 	mntns := &MountNamespace{
 		refs:        1,
-		mountpoints: make(map[*Dentry]struct{}),
+		mountpoints: make(map[*Dentry]uint32),
 	}
 	mntns.root = &Mount{
+		vfs:  vfs,
 		fs:   fs,
 		root: root,
 		ns:   mntns,
@@ -136,8 +152,10 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 		return err
 	}
 	vfs.mountMu.Lock()
+	vd.dentry.mu.Lock()
 	for {
 		if vd.dentry.IsDisowned() {
+			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
 			root.decRef(fs)
@@ -153,36 +171,208 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 		if nextmnt == nil {
 			break
 		}
-		nextmnt.incRef()
+		// It's possible that nextmnt has been umounted but not disconnected,
+		// in which case vfs no longer holds a reference on it, and the last
+		// reference may be concurrently dropped even though we're holding
+		// vfs.mountMu.
+		if !nextmnt.tryIncMountedRef() {
+			break
+		}
+		// This can't fail since we're holding vfs.mountMu.
 		nextmnt.root.incRef(nextmnt.fs)
+		vd.dentry.mu.Unlock()
 		vd.DecRef()
 		vd = VirtualDentry{
 			mount:  nextmnt,
 			dentry: nextmnt.root,
 		}
+		vd.dentry.mu.Lock()
 	}
 	// TODO: Linux requires that either both the mount point and the mount root
 	// are directories, or neither are, and returns ENOTDIR if this is not the
 	// case.
 	mntns := vd.mount.ns
 	mnt := &Mount{
+		vfs:  vfs,
 		fs:   fs,
 		root: root,
 		ns:   mntns,
 		refs: 1,
 	}
-	mnt.storeKey(vd.mount, vd.dentry)
+	vfs.mounts.seq.BeginWrite()
+	vfs.connectLocked(mnt, vd, mntns)
+	vfs.mounts.seq.EndWrite()
+	vd.dentry.mu.Unlock()
+	vfs.mountMu.Unlock()
+	return nil
+}
+
+type umountRecursiveOptions struct {
+	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
+	// on umounted mounts fail.
+	//
+	// eager is analogous to Linux's UMOUNT_SYNC.
+	eager bool
+
+	// If disconnectHierarchy is true, Mounts that are umounted hierarchically
+	// should be disconnected from their parents. (Mounts whose parents are not
+	// umounted, which in most cases means the Mount passed to the initial call
+	// to umountRecursiveLocked, are unconditionally disconnected for
+	// consistency with Linux.)
+	//
+	// disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED.
+	disconnectHierarchy bool
+}
+
+// umountRecursiveLocked marks mnt and its descendants as umounted. It does not
+// release mount or dentry references; instead, it appends VirtualDentries and
+// Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
+// respectively, and returns updated slices. (This is necessary because
+// filesystem locks possibly taken by DentryImpl.DecRef() may precede
+// vfs.mountMu in the lock order, and Mount.decRef() may lock vfs.mountMu.)
+//
+// umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section.
+func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) {
+	if !mnt.umounted {
+		mnt.umounted = true
+		mountsToDecRef = append(mountsToDecRef, mnt)
+		if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) {
+			vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt))
+		}
+	}
+	if opts.eager {
+		for {
+			refs := atomic.LoadInt64(&mnt.refs)
+			if refs < 0 {
+				break
+			}
+			if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) {
+				break
+			}
+		}
+	}
+	for child := range mnt.children {
+		vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef)
+	}
+	return vdsToDecRef, mountsToDecRef
+}
+
+// connectLocked makes vd the mount parent/point for mnt. It consumes
+// references held by vd.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. vd.dentry.mu must be locked. mnt.parent() == nil.
+func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
+	mnt.storeKey(vd)
+	if vd.mount.children == nil {
+		vd.mount.children = make(map[*Mount]struct{})
+	}
+	vd.mount.children[mnt] = struct{}{}
 	atomic.AddUint32(&vd.dentry.mounts, 1)
-	mntns.mountpoints[vd.dentry] = struct{}{}
+	mntns.mountpoints[vd.dentry]++
+	vfs.mounts.insertSeqed(mnt)
 	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
 	if !ok {
 		vfsmpmounts = make(map[*Mount]struct{})
 		vfs.mountpoints[vd.dentry] = vfsmpmounts
 	}
 	vfsmpmounts[mnt] = struct{}{}
-	vfs.mounts.Insert(mnt)
-	vfs.mountMu.Unlock()
-	return nil
+}
+
+// disconnectLocked makes mnt have no mount parent/point and returns its old
+// mount parent/point with a reference held.
+//
+// Preconditions: vfs.mountMu must be locked. vfs.mounts.seq must be in a
+// writer critical section. mnt.parent() != nil.
+func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
+	vd := mnt.loadKey()
+	mnt.storeKey(VirtualDentry{})
+	delete(vd.mount.children, mnt)
+	atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1
+	mnt.ns.mountpoints[vd.dentry]--
+	if mnt.ns.mountpoints[vd.dentry] == 0 {
+		delete(mnt.ns.mountpoints, vd.dentry)
+	}
+	vfs.mounts.removeSeqed(mnt)
+	vfsmpmounts := vfs.mountpoints[vd.dentry]
+	delete(vfsmpmounts, mnt)
+	if len(vfsmpmounts) == 0 {
+		delete(vfs.mountpoints, vd.dentry)
+	}
+	return vd
+}
+
+// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
+// reference count is already zero, or mnt has been eagerly umounted,
+// tryIncMountedRef does nothing and returns false.
+//
+// tryIncMountedRef does not require that a reference is held on mnt.
+func (mnt *Mount) tryIncMountedRef() bool {
+	for {
+		refs := atomic.LoadInt64(&mnt.refs)
+		if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+func (mnt *Mount) incRef() {
+	// In general, negative values for mnt.refs are valid because the MSB is
+	// the eager-unmount bit.
+	atomic.AddInt64(&mnt.refs, 1)
+}
+
+func (mnt *Mount) decRef() {
+	refs := atomic.AddInt64(&mnt.refs, -1)
+	if refs&^math.MinInt64 == 0 { // mask out MSB
+		var vd VirtualDentry
+		if mnt.parent() != nil {
+			mnt.vfs.mountMu.Lock()
+			mnt.vfs.mounts.seq.BeginWrite()
+			vd = mnt.vfs.disconnectLocked(mnt)
+			mnt.vfs.mounts.seq.EndWrite()
+			mnt.vfs.mountMu.Unlock()
+		}
+		mnt.root.decRef(mnt.fs)
+		mnt.fs.decRef()
+		if vd.Ok() {
+			vd.DecRef()
+		}
+	}
+}
+
+// IncRef increments mntns' reference count.
+func (mntns *MountNamespace) IncRef() {
+	if atomic.AddInt64(&mntns.refs, 1) <= 1 {
+		panic("MountNamespace.IncRef() called without holding a reference")
+	}
+}
+
+// DecRef decrements mntns' reference count.
+func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) {
+	if refs := atomic.AddInt64(&mntns.refs, -1); refs == 0 {
+		vfs.mountMu.Lock()
+		vfs.mounts.seq.BeginWrite()
+		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
+			disconnectHierarchy: true,
+		}, nil, nil)
+		vfs.mounts.seq.EndWrite()
+		vfs.mountMu.Unlock()
+		for _, vd := range vdsToDecRef {
+			vd.DecRef()
+		}
+		for _, mnt := range mountsToDecRef {
+			mnt.decRef()
+		}
+	} else if refs < 0 {
+		panic("MountNamespace.DecRef() called without holding a reference")
+	}
 }
 
 // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
@@ -231,12 +421,12 @@ retryFirst:
 }
 
 // getMountpointAt returns the mount point for the stack of Mounts including
-// mnt. It takes a reference on the returned Mount and Dentry. If no such mount
+// mnt. It takes a reference on the returned VirtualDentry. If no such mount
 // point exists (i.e. mnt is a root mount), getMountpointAt returns (nil, nil).
 //
 // Preconditions: References are held on mnt and root. vfsroot is not (mnt,
 // mnt.root).
-func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) (*Mount, *Dentry) {
+func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
 	// The first mount is special-cased:
 	//
 	// - The caller must have already checked mnt against vfsroot.
@@ -246,12 +436,12 @@ func (vfs *VirtualFilesystem) getMountpointAt(mnt *Mount, vfsroot VirtualDentry)
 	// - We don't drop the caller's reference on mnt.
 retryFirst:
 	epoch := vfs.mounts.seq.BeginRead()
-	parent, point := mnt.loadKey()
+	parent, point := mnt.parent(), mnt.point()
 	if !vfs.mounts.seq.ReadOk(epoch) {
 		goto retryFirst
 	}
 	if parent == nil {
-		return nil, nil
+		return VirtualDentry{}
 	}
 	if !parent.tryIncMountedRef() {
 		// Raced with umount.
@@ -263,6 +453,11 @@ retryFirst:
 		parent.decRef()
 		goto retryFirst
 	}
+	if !vfs.mounts.seq.ReadOk(epoch) {
+		point.decRef(parent.fs)
+		parent.decRef()
+		goto retryFirst
+	}
 	mnt = parent
 	d := point
 	for {
@@ -274,7 +469,7 @@ retryFirst:
 		}
 	retryNotFirst:
 		epoch := vfs.mounts.seq.BeginRead()
-		parent, point := mnt.loadKey()
+		parent, point := mnt.parent(), mnt.point()
 		if !vfs.mounts.seq.ReadOk(epoch) {
 			goto retryNotFirst
 		}
@@ -301,43 +496,7 @@ retryFirst:
 		mnt = parent
 		d = point
 	}
-	return mnt, d
-}
-
-// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
-// reference count is already zero, or has been eagerly unmounted,
-// tryIncMountedRef does nothing and returns false.
-//
-// tryIncMountedRef does not require that a reference is held on mnt.
-func (mnt *Mount) tryIncMountedRef() bool {
-	for {
-		refs := atomic.LoadInt64(&mnt.refs)
-		if refs <= 0 { // refs < 0 => MSB set => eagerly unmounted
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
-func (mnt *Mount) incRef() {
-	// In general, negative values for mnt.refs are valid because the MSB is
-	// the eager-unmount bit.
-	atomic.AddInt64(&mnt.refs, 1)
-}
-
-func (mnt *Mount) decRef() {
-	refs := atomic.AddInt64(&mnt.refs, -1)
-	if refs&^math.MinInt64 == 0 { // mask out MSB
-		parent, point := mnt.loadKey()
-		if point != nil {
-			point.decRef(parent.fs)
-			parent.decRef()
-		}
-		mnt.root.decRef(mnt.fs)
-		mnt.fs.decRef()
-	}
+	return VirtualDentry{mnt, d}
 }
 
 // CheckBeginWrite increments the counter of in-progress write operations on
@@ -360,7 +519,7 @@ func (mnt *Mount) EndWrite() {
 	atomic.AddInt64(&mnt.writers, -1)
 }
 
-// Preconditions: VirtualFilesystem.mountMu must be locked for writing.
+// Preconditions: VirtualFilesystem.mountMu must be locked.
 func (mnt *Mount) setReadOnlyLocked(ro bool) error {
 	if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro {
 		return nil
@@ -383,22 +542,6 @@ func (mnt *Mount) Filesystem() *Filesystem {
 	return mnt.fs
 }
 
-// IncRef increments mntns' reference count.
-func (mntns *MountNamespace) IncRef() {
-	if atomic.AddInt64(&mntns.refs, 1) <= 1 {
-		panic("MountNamespace.IncRef() called without holding a reference")
-	}
-}
-
-// DecRef decrements mntns' reference count.
-func (mntns *MountNamespace) DecRef() {
-	if refs := atomic.AddInt64(&mntns.refs, 0); refs == 0 {
-		// TODO: unmount mntns.root
-	} else if refs < 0 {
-		panic("MountNamespace.DecRef() called without holding a reference")
-	}
-}
-
 // Root returns mntns' root. A reference is taken on the returned
 // VirtualDentry.
 func (mntns *MountNamespace) Root() VirtualDentry {
diff --git a/pkg/sentry/vfs/mount_test.go b/pkg/sentry/vfs/mount_test.go
index f394d7483..adff0b94b 100644
--- a/pkg/sentry/vfs/mount_test.go
+++ b/pkg/sentry/vfs/mount_test.go
@@ -37,7 +37,7 @@ func TestMountTableInsertLookup(t *testing.T) {
 	mt.Init()
 
 	mount := &Mount{}
-	mount.storeKey(&Mount{}, &Dentry{})
+	mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
 	mt.Insert(mount)
 
 	if m := mt.Lookup(mount.parent(), mount.point()); m != mount {
@@ -78,18 +78,10 @@ const enableComparativeBenchmarks = false
 
 func newBenchMount() *Mount {
 	mount := &Mount{}
-	mount.storeKey(&Mount{}, &Dentry{})
+	mount.storeKey(VirtualDentry{&Mount{}, &Dentry{}})
 	return mount
 }
 
-func vdkey(mnt *Mount) VirtualDentry {
-	parent, point := mnt.loadKey()
-	return VirtualDentry{
-		mount:  parent,
-		dentry: point,
-	}
-}
-
 func BenchmarkMountTableParallelLookup(b *testing.B) {
 	for numG, maxG := 1, runtime.GOMAXPROCS(0); numG >= 0 && numG <= maxG; numG *= 2 {
 		for _, numMounts := range benchNumMounts {
@@ -101,7 +93,7 @@ func BenchmarkMountTableParallelLookup(b *testing.B) {
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
 					mt.Insert(mount)
-					keys = append(keys, vdkey(mount))
+					keys = append(keys, mount.loadKey())
 				}
 
 				var ready sync.WaitGroup
@@ -153,7 +145,7 @@ func BenchmarkMountMapParallelLookup(b *testing.B) {
 				keys := make([]VirtualDentry, 0, numMounts)
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
-					key := vdkey(mount)
+					key := mount.loadKey()
 					ms[key] = mount
 					keys = append(keys, key)
 				}
@@ -208,7 +200,7 @@ func BenchmarkMountSyncMapParallelLookup(b *testing.B) {
 				keys := make([]VirtualDentry, 0, numMounts)
 				for i := 0; i < numMounts; i++ {
 					mount := newBenchMount()
-					key := vdkey(mount)
+					key := mount.loadKey()
 					ms.Store(key, mount)
 					keys = append(keys, key)
 				}
@@ -290,7 +282,7 @@ func BenchmarkMountMapNegativeLookup(b *testing.B) {
 			ms := make(map[VirtualDentry]*Mount)
 			for i := 0; i < numMounts; i++ {
 				mount := newBenchMount()
-				ms[vdkey(mount)] = mount
+				ms[mount.loadKey()] = mount
 			}
 			negkeys := make([]VirtualDentry, 0, numMounts)
 			for i := 0; i < numMounts; i++ {
@@ -325,7 +317,7 @@ func BenchmarkMountSyncMapNegativeLookup(b *testing.B) {
 			var ms sync.Map
 			for i := 0; i < numMounts; i++ {
 				mount := newBenchMount()
-				ms.Store(vdkey(mount), mount)
+				ms.Store(mount.loadKey(), mount)
 			}
 			negkeys := make([]VirtualDentry, 0, numMounts)
 			for i := 0; i < numMounts; i++ {
@@ -379,7 +371,7 @@ func BenchmarkMountMapInsert(b *testing.B) {
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms[vdkey(mount)] = mount
+		ms[mount.loadKey()] = mount
 	}
 }
 
@@ -399,7 +391,7 @@ func BenchmarkMountSyncMapInsert(b *testing.B) {
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Store(vdkey(mount), mount)
+		ms.Store(mount.loadKey(), mount)
 	}
 }
 
@@ -432,13 +424,13 @@ func BenchmarkMountMapRemove(b *testing.B) {
 	ms := make(map[VirtualDentry]*Mount)
 	for i := range mounts {
 		mount := mounts[i]
-		ms[vdkey(mount)] = mount
+		ms[mount.loadKey()] = mount
 	}
 
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		delete(ms, vdkey(mount))
+		delete(ms, mount.loadKey())
 	}
 }
 
@@ -454,12 +446,12 @@ func BenchmarkMountSyncMapRemove(b *testing.B) {
 	var ms sync.Map
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Store(vdkey(mount), mount)
+		ms.Store(mount.loadKey(), mount)
 	}
 
 	b.ResetTimer()
 	for i := range mounts {
 		mount := mounts[i]
-		ms.Delete(vdkey(mount))
+		ms.Delete(mount.loadKey())
 	}
 }
diff --git a/pkg/sentry/vfs/mount_unsafe.go b/pkg/sentry/vfs/mount_unsafe.go
index c98b42f91..ab13fa461 100644
--- a/pkg/sentry/vfs/mount_unsafe.go
+++ b/pkg/sentry/vfs/mount_unsafe.go
@@ -38,16 +38,6 @@ type mountKey struct {
 	point  unsafe.Pointer // *Dentry
 }
 
-// Invariant: mnt.key's fields are nil. parent and point are non-nil.
-func (mnt *Mount) storeKey(parent *Mount, point *Dentry) {
-	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(parent))
-	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(point))
-}
-
-func (mnt *Mount) loadKey() (*Mount, *Dentry) {
-	return (*Mount)(atomic.LoadPointer(&mnt.key.parent)), (*Dentry)(atomic.LoadPointer(&mnt.key.point))
-}
-
 func (mnt *Mount) parent() *Mount {
 	return (*Mount)(atomic.LoadPointer(&mnt.key.parent))
 }
@@ -56,6 +46,19 @@ func (mnt *Mount) point() *Dentry {
 	return (*Dentry)(atomic.LoadPointer(&mnt.key.point))
 }
 
+func (mnt *Mount) loadKey() VirtualDentry {
+	return VirtualDentry{
+		mount:  mnt.parent(),
+		dentry: mnt.point(),
+	}
+}
+
+// Invariant: mnt.key.parent == nil. vd.Ok().
+func (mnt *Mount) storeKey(vd VirtualDentry) {
+	atomic.StorePointer(&mnt.key.parent, unsafe.Pointer(vd.mount))
+	atomic.StorePointer(&mnt.key.point, unsafe.Pointer(vd.dentry))
+}
+
 // mountTable maps (mount parent, mount point) pairs to mounts. It supports
 // efficient concurrent lookup, even in the presence of concurrent mutators
 // (provided mutation is sufficiently uncommon).
@@ -201,9 +204,19 @@ loop:
 
 // Insert inserts the given mount into mt.
 //
-// Preconditions: There are no concurrent mutators of mt. mt must not already
-// contain a Mount with the same mount point and parent.
+// Preconditions: mt must not already contain a Mount with the same mount point
+// and parent.
 func (mt *mountTable) Insert(mount *Mount) {
+	mt.seq.BeginWrite()
+	mt.insertSeqed(mount)
+	mt.seq.EndWrite()
+}
+
+// insertSeqed inserts the given mount into mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must not
+// already contain a Mount with the same mount point and parent.
+func (mt *mountTable) insertSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 
 	// We're under the maximum load factor if:
@@ -215,10 +228,8 @@ func (mt *mountTable) Insert(mount *Mount) {
 	tcap := uintptr(1) << order
 	if ((tlen + 1) * mtMaxLoadDen) <= (uint64(mtMaxLoadNum) << order) {
 		// Atomically insert the new element into the table.
-		mt.seq.BeginWrite()
 		atomic.AddUint64(&mt.size, mtSizeLenOne)
 		mtInsertLocked(mt.slots, tcap, unsafe.Pointer(mount), hash)
-		mt.seq.EndWrite()
 		return
 	}
 
@@ -241,8 +252,6 @@ func (mt *mountTable) Insert(mount *Mount) {
 	for {
 		oldSlot := (*mountSlot)(oldCur)
 		if oldSlot.value != nil {
-			// Don't need to lock mt.seq yet since newSlots isn't visible
-			// to readers.
 			mtInsertLocked(newSlots, newCap, oldSlot.value, oldSlot.hash)
 		}
 		if oldCur == oldLast {
@@ -252,11 +261,9 @@ func (mt *mountTable) Insert(mount *Mount) {
 	}
 	// Insert the new element into the new table.
 	mtInsertLocked(newSlots, newCap, unsafe.Pointer(mount), hash)
-	// Atomically switch to the new table.
-	mt.seq.BeginWrite()
+	// Switch to the new table.
 	atomic.AddUint64(&mt.size, mtSizeLenOne|mtSizeOrderOne)
 	atomic.StorePointer(&mt.slots, newSlots)
-	mt.seq.EndWrite()
 }
 
 // Preconditions: There are no concurrent mutators of the table (slots, cap).
@@ -294,9 +301,18 @@ func mtInsertLocked(slots unsafe.Pointer, cap uintptr, value unsafe.Pointer, has
 
 // Remove removes the given mount from mt.
 //
-// Preconditions: There are no concurrent mutators of mt. mt must contain
-// mount.
+// Preconditions: mt must contain mount.
 func (mt *mountTable) Remove(mount *Mount) {
+	mt.seq.BeginWrite()
+	mt.removeSeqed(mount)
+	mt.seq.EndWrite()
+}
+
+// removeSeqed removes the given mount from mt.
+//
+// Preconditions: mt.seq must be in a writer critical section. mt must contain
+// mount.
+func (mt *mountTable) removeSeqed(mount *Mount) {
 	hash := memhash(unsafe.Pointer(&mount.key), uintptr(mt.seed), mountKeyBytes)
 	tcap := uintptr(1) << (mt.size & mtSizeOrderMask)
 	mask := tcap - 1
@@ -311,7 +327,6 @@ func (mt *mountTable) Remove(mount *Mount) {
 			// backward until we either find an empty slot, or an element that
 			// is already in its first-probed slot. (This is backward shift
 			// deletion.)
-			mt.seq.BeginWrite()
 			for {
 				nextOff := (off + mountSlotBytes) & offmask
 				nextSlot := (*mountSlot)(unsafe.Pointer(uintptr(slots) + nextOff))
@@ -330,7 +345,6 @@ func (mt *mountTable) Remove(mount *Mount) {
 			}
 			atomic.StorePointer(&slot.value, nil)
 			atomic.AddUint64(&mt.size, mtSizeLenNegOne)
-			mt.seq.EndWrite()
 			return
 		}
 		if checkInvariants && slotValue == nil {
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 8d05c8583..61bce6426 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -269,11 +269,11 @@ func (rp *ResolvingPath) ResolveParent(d *Dentry) (*Dentry, error) {
 		parent = d
 	} else if d == rp.mount.root {
 		// At mount root ...
-		mnt, mntpt := rp.vfs.getMountpointAt(rp.mount, rp.root)
-		if mnt != nil {
+		vd := rp.vfs.getMountpointAt(rp.mount, rp.root)
+		if vd.Ok() {
 			// ... of non-root mount.
-			rp.nextMount = mnt
-			rp.nextStart = mntpt
+			rp.nextMount = vd.mount
+			rp.nextStart = vd.dentry
 			return nil, resolveMountRootError{}
 		}
 		// ... of root mount.
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
index abde0feaa..49952b2cc 100644
--- a/pkg/sentry/vfs/syscalls.go
+++ b/pkg/sentry/vfs/syscalls.go
@@ -230,6 +230,8 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) err
 //
 // - VFS.SymlinkAt()
 //
+// - VFS.UmountAt()
+//
 // - VFS.UnlinkAt()
 //
 // - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 4a8a69540..88e865d86 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -16,9 +16,14 @@
 //
 // Lock order:
 //
-// Filesystem implementation locks
+// FilesystemImpl/FileDescriptionImpl locks
 //   VirtualFilesystem.mountMu
+//     Dentry.mu
+//       Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
 // VirtualFilesystem.fsTypesMu
+//
+// Locking Dentry.mu in multiple Dentries requires holding
+// VirtualFilesystem.mountMu.
 package vfs
 
 import (
@@ -33,7 +38,7 @@ type VirtualFilesystem struct {
 	// mountMu serializes mount mutations.
 	//
 	// mountMu is analogous to Linux's namespace_sem.
-	mountMu sync.RWMutex
+	mountMu sync.Mutex
 
 	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
 	// are uniquely namespaced, including mount parent in the key correctly
@@ -52,7 +57,7 @@ type VirtualFilesystem struct {
 	// mountpoints maps mount points to mounts at those points in all
 	// namespaces. mountpoints is protected by mountMu.
 	//
-	// mountpoints is used to find mounts that must be unmounted due to
+	// mountpoints is used to find mounts that must be umounted due to
 	// removal of a mount point Dentry from another mount namespace. ("A file
 	// or directory that is a mount point in one namespace that is not a mount
 	// point in another namespace, may be renamed, unlinked, or removed
-- 
cgit v1.2.3


From b72e1b3c0873ea29d031db42e39ca053923eecff Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 25 Nov 2019 18:09:15 -0800
Subject: Minor VFS2 interface changes.

- Remove the Filesystem argument from DentryImpl.*Ref(); in general DentryImpls
  that need the Filesystem for reference counting will probably also need it
  for other interface methods that don't plumb Filesystem, so it's easier to
  just store a pointer to the filesystem in the DentryImpl.

- Add a pointer to the VirtualFilesystem to Filesystem, which is needed by the
  gofer client to disown dentries for cache eviction triggered by dentry
  reference count changes.

- Rename FilesystemType.NewFilesystem to GetFilesystem; in some cases (e.g.
  sysfs, cgroupfs) it's much cleaner for there to be only one Filesystem that
  is used by all mounts, and in at least one case (devtmpfs) it's visibly
  incorrect not to do so, so NewFilesystem doesn't always actually create and
  return a *new* Filesystem.

- Require callers of FileDescription.Init() to increment Mount/Dentry
  references. This is because the gofer client may, in the OpenAt() path, take
  a reference on a dentry with 0 references, which is safe due to
  synchronization that is outside the scope of this CL, and it would be safer
  to still have its implementation of DentryImpl.IncRef() check for an
  increment from 0 references in other cases.

- Add FileDescription.TryIncRef. This is used by the gofer client to take
  references on "special file descriptions" (FDs for files such as pipes,
  sockets, and devices), which use per-FD handles (fids) instead of
  dentry-shared handles, for sync() and syncfs().

PiperOrigin-RevId: 282473364
---
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  4 +-
 pkg/sentry/fsimpl/ext/block_map_file.go           |  4 +-
 pkg/sentry/fsimpl/ext/block_map_test.go           |  4 +-
 pkg/sentry/fsimpl/ext/dentry.go                   | 10 ++--
 pkg/sentry/fsimpl/ext/ext.go                      | 10 ++--
 pkg/sentry/fsimpl/ext/ext_test.go                 | 14 +++---
 pkg/sentry/fsimpl/ext/extent_file.go              |  6 +--
 pkg/sentry/fsimpl/ext/extent_test.go              |  4 +-
 pkg/sentry/fsimpl/ext/file_description.go         |  4 +-
 pkg/sentry/fsimpl/ext/inode.go                    | 28 +++++++-----
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |  6 +--
 pkg/sentry/fsimpl/memfs/filesystem.go             | 15 ++++--
 pkg/sentry/fsimpl/memfs/memfs.go                  | 16 +++----
 pkg/sentry/fsimpl/memfs/named_pipe.go             |  5 +-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |  2 +-
 pkg/sentry/vfs/dentry.go                          | 24 ++++++----
 pkg/sentry/vfs/file_description.go                | 33 +++++++++++--
 pkg/sentry/vfs/file_description_impl_util_test.go |  2 +-
 pkg/sentry/vfs/filesystem.go                      | 20 ++++++--
 pkg/sentry/vfs/filesystem_type.go                 | 10 ++--
 pkg/sentry/vfs/mount.go                           | 56 ++++++++++++-----------
 pkg/sentry/vfs/resolving_path.go                  |  8 ++--
 pkg/sentry/vfs/syscalls.go                        |  2 +-
 pkg/sentry/vfs/testutil.go                        | 12 ++---
 pkg/sentry/vfs/vfs.go                             |  8 ++--
 25 files changed, 186 insertions(+), 121 deletions(-)

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 10a8083a0..94cd74095 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -50,7 +50,7 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 	// Create VFS.
 	vfsObj := vfs.New()
 	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
@@ -81,7 +81,7 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
 		b.Fatalf("failed to mount tmpfs submount: %v", err)
 	}
 	return func() {
diff --git a/pkg/sentry/fsimpl/ext/block_map_file.go b/pkg/sentry/fsimpl/ext/block_map_file.go
index cea89bcd9..a2d8c3ad6 100644
--- a/pkg/sentry/fsimpl/ext/block_map_file.go
+++ b/pkg/sentry/fsimpl/ext/block_map_file.go
@@ -154,7 +154,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 			toRead = len(dst)
 		}
 
-		n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
+		n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], curPhyBlkOff+int64(relFileOff))
 		if n < toRead {
 			return n, syserror.EIO
 		}
@@ -174,7 +174,7 @@ func (f *blockMapFile) read(curPhyBlk uint32, relFileOff uint64, height uint, ds
 	curChildOff := relFileOff % childCov
 	for i := startIdx; i < endIdx; i++ {
 		var childPhyBlk uint32
-		err := readFromDisk(f.regFile.inode.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
+		err := readFromDisk(f.regFile.inode.fs.dev, curPhyBlkOff+int64(i*4), &childPhyBlk)
 		if err != nil {
 			return read, err
 		}
diff --git a/pkg/sentry/fsimpl/ext/block_map_test.go b/pkg/sentry/fsimpl/ext/block_map_test.go
index 213aa3919..181727ef7 100644
--- a/pkg/sentry/fsimpl/ext/block_map_test.go
+++ b/pkg/sentry/fsimpl/ext/block_map_test.go
@@ -87,12 +87,14 @@ func blockMapSetUp(t *testing.T) (*blockMapFile, []byte) {
 	mockDisk := make([]byte, mockBMDiskSize)
 	regFile := regularFile{
 		inode: inode{
+			fs: &filesystem{
+				dev: bytes.NewReader(mockDisk),
+			},
 			diskInode: &disklayout.InodeNew{
 				InodeOld: disklayout.InodeOld{
 					SizeLo: getMockBMFileFize(),
 				},
 			},
-			dev:     bytes.NewReader(mockDisk),
 			blkSize: uint64(mockBMBlkSize),
 		},
 	}
diff --git a/pkg/sentry/fsimpl/ext/dentry.go b/pkg/sentry/fsimpl/ext/dentry.go
index 054fb42b6..a080cb189 100644
--- a/pkg/sentry/fsimpl/ext/dentry.go
+++ b/pkg/sentry/fsimpl/ext/dentry.go
@@ -41,16 +41,18 @@ func newDentry(in *inode) *dentry {
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef() {
 	d.inode.incRef()
 }
 
 // TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef() bool {
 	return d.inode.tryIncRef()
 }
 
 // DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
-	d.inode.decRef(vfsfs.Impl().(*filesystem))
+func (d *dentry) DecRef() {
+	// FIXME(b/134676337): filesystem.mu may not be locked as required by
+	// inode.decRef().
+	d.inode.decRef()
 }
diff --git a/pkg/sentry/fsimpl/ext/ext.go b/pkg/sentry/fsimpl/ext/ext.go
index f10accafc..4b7d17dc6 100644
--- a/pkg/sentry/fsimpl/ext/ext.go
+++ b/pkg/sentry/fsimpl/ext/ext.go
@@ -40,14 +40,14 @@ var _ vfs.FilesystemType = (*FilesystemType)(nil)
 // Currently there are two ways of mounting an ext(2/3/4) fs:
 //   1. Specify a mount with our internal special MountType in the OCI spec.
 //   2. Expose the device to the container and mount it from application layer.
-func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
+func getDeviceFd(source string, opts vfs.GetFilesystemOptions) (io.ReaderAt, error) {
 	if opts.InternalData == nil {
 		// User mount call.
 		// TODO(b/134676337): Open the device specified by `source` and return that.
 		panic("unimplemented")
 	}
 
-	// NewFilesystem call originated from within the sentry.
+	// GetFilesystem call originated from within the sentry.
 	devFd, ok := opts.InternalData.(int)
 	if !ok {
 		return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
@@ -91,8 +91,8 @@ func isCompatible(sb disklayout.SuperBlock) bool {
 	return true
 }
 
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	// TODO(b/134676337): Ensure that the user is mounting readonly. If not,
 	// EACCESS should be returned according to mount(2). Filesystem independent
 	// flags (like readonly) are currently not available in pkg/sentry/vfs.
@@ -103,7 +103,7 @@ func (FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials
 	}
 
 	fs := filesystem{dev: dev, inodeCache: make(map[uint32]*inode)}
-	fs.vfsfs.Init(&fs)
+	fs.vfsfs.Init(vfsObj, &fs)
 	fs.sb, err = readSuperBlock(dev)
 	if err != nil {
 		return nil, nil, err
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 1aa2bd6a4..307e4d68c 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -66,7 +66,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 	// Create VFS.
 	vfsObj := vfs.New()
 	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
 		return nil, nil, nil, nil, err
@@ -509,27 +509,27 @@ func TestIterDirents(t *testing.T) {
 	}
 
 	wantDirents := []vfs.Dirent{
-		vfs.Dirent{
+		{
 			Name: ".",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "..",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "lost+found",
 			Type: linux.DT_DIR,
 		},
-		vfs.Dirent{
+		{
 			Name: "file.txt",
 			Type: linux.DT_REG,
 		},
-		vfs.Dirent{
+		{
 			Name: "bigfile.txt",
 			Type: linux.DT_REG,
 		},
-		vfs.Dirent{
+		{
 			Name: "symlink.txt",
 			Type: linux.DT_LNK,
 		},
diff --git a/pkg/sentry/fsimpl/ext/extent_file.go b/pkg/sentry/fsimpl/ext/extent_file.go
index 38b68a2d3..3d3ebaca6 100644
--- a/pkg/sentry/fsimpl/ext/extent_file.go
+++ b/pkg/sentry/fsimpl/ext/extent_file.go
@@ -99,7 +99,7 @@ func (f *extentFile) buildExtTree() error {
 func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*disklayout.ExtentNode, error) {
 	var header disklayout.ExtentHeader
 	off := entry.PhysicalBlock() * f.regFile.inode.blkSize
-	err := readFromDisk(f.regFile.inode.dev, int64(off), &header)
+	err := readFromDisk(f.regFile.inode.fs.dev, int64(off), &header)
 	if err != nil {
 		return nil, err
 	}
@@ -115,7 +115,7 @@ func (f *extentFile) buildExtTreeFromDisk(entry disklayout.ExtentEntry) (*diskla
 			curEntry = &disklayout.ExtentIdx{}
 		}
 
-		err := readFromDisk(f.regFile.inode.dev, int64(off), curEntry)
+		err := readFromDisk(f.regFile.inode.fs.dev, int64(off), curEntry)
 		if err != nil {
 			return nil, err
 		}
@@ -229,7 +229,7 @@ func (f *extentFile) readFromExtent(ex *disklayout.Extent, off uint64, dst []byt
 		toRead = len(dst)
 	}
 
-	n, _ := f.regFile.inode.dev.ReadAt(dst[:toRead], int64(readStart))
+	n, _ := f.regFile.inode.fs.dev.ReadAt(dst[:toRead], int64(readStart))
 	if n < toRead {
 		return n, syserror.EIO
 	}
diff --git a/pkg/sentry/fsimpl/ext/extent_test.go b/pkg/sentry/fsimpl/ext/extent_test.go
index 42d0a484b..a2382daa3 100644
--- a/pkg/sentry/fsimpl/ext/extent_test.go
+++ b/pkg/sentry/fsimpl/ext/extent_test.go
@@ -180,13 +180,15 @@ func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (*extentFile, []
 	mockExtentFile := &extentFile{
 		regFile: regularFile{
 			inode: inode{
+				fs: &filesystem{
+					dev: bytes.NewReader(mockDisk),
+				},
 				diskInode: &disklayout.InodeNew{
 					InodeOld: disklayout.InodeOld{
 						SizeLo: uint32(mockExtentBlkSize) * getNumPhyBlks(root),
 					},
 				},
 				blkSize: mockExtentBlkSize,
-				dev:     bytes.NewReader(mockDisk),
 			},
 		},
 	}
diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 4d18b28cb..5eca2b83f 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -36,11 +36,11 @@ type fileDescription struct {
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
 // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index e6c847a71..24249525c 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -16,7 +16,6 @@ package ext
 
 import (
 	"fmt"
-	"io"
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
@@ -42,13 +41,13 @@ type inode struct {
 	// refs is a reference count. refs is accessed using atomic memory operations.
 	refs int64
 
+	// fs is the containing filesystem.
+	fs *filesystem
+
 	// inodeNum is the inode number of this inode on disk. This is used to
 	// identify inodes within the ext filesystem.
 	inodeNum uint32
 
-	// dev represents the underlying device. Same as filesystem.dev.
-	dev io.ReaderAt
-
 	// blkSize is the fs data block size. Same as filesystem.sb.BlockSize().
 	blkSize uint64
 
@@ -81,10 +80,10 @@ func (in *inode) tryIncRef() bool {
 // decRef decrements the inode ref count and releases the inode resources if
 // the ref count hits 0.
 //
-// Precondition: Must have locked fs.mu.
-func (in *inode) decRef(fs *filesystem) {
+// Precondition: Must have locked filesystem.mu.
+func (in *inode) decRef() {
 	if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
-		delete(fs.inodeCache, in.inodeNum)
+		delete(in.fs.inodeCache, in.inodeNum)
 	} else if refs < 0 {
 		panic("ext.inode.decRef() called without holding a reference")
 	}
@@ -117,8 +116,8 @@ func newInode(fs *filesystem, inodeNum uint32) (*inode, error) {
 
 	// Build the inode based on its type.
 	inode := inode{
+		fs:        fs,
 		inodeNum:  inodeNum,
-		dev:       fs.dev,
 		blkSize:   blkSize,
 		diskInode: diskInode,
 	}
@@ -154,11 +153,14 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	if err := in.checkPermissions(rp.Credentials(), ats); err != nil {
 		return nil, err
 	}
+	mnt := rp.Mount()
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
 		fd.flags = flags
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably. This check is not necessary for a read
@@ -167,8 +169,10 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
 		fd.flags = flags
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		return &fd.vfsfd, nil
 	case *symlink:
 		if flags&linux.O_PATH == 0 {
@@ -177,7 +181,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 		}
 		var fd symlinkFD
 		fd.flags = flags
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index 23a846c08..ea6417ce7 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -176,7 +176,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			// Create VFS.
 			vfsObj := vfs.New()
 			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -365,7 +365,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			// Create VFS.
 			vfsObj := vfs.New()
 			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
 			}
@@ -394,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef()
 			// Create and mount the submount.
-			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.NewFilesystemOptions{}); err != nil {
+			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index f006c40cd..08a9cb8ef 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -159,7 +159,7 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 			return nil, err
 		}
 	}
-	inode.incRef() // vfsd.IncRef(&fs.vfsfs)
+	inode.incRef()
 	return vfsd, nil
 }
 
@@ -379,6 +379,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 			return nil, err
 		}
 	}
+	mnt := rp.Mount()
 	switch impl := i.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
@@ -386,12 +387,14 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
 		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
 		if fd.writable {
-			if err := rp.Mount().CheckBeginWrite(); err != nil {
+			if err := mnt.CheckBeginWrite(); err != nil {
 				return nil, err
 			}
-			// Mount.EndWrite() is called by regularFileFD.Release().
+			// mnt.EndWrite() is called by regularFileFD.Release().
 		}
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data = impl.data[:0]
@@ -405,7 +408,9 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+		mnt.IncRef()
+		vfsd.IncRef()
+		fd.vfsfd.Init(&fd, mnt, vfsd)
 		fd.flags = flags
 		return &fd.vfsfd, nil
 	case *symlink:
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index 64c851c1a..4cb2a4e0f 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -52,10 +52,10 @@ type filesystem struct {
 	nextInoMinusOne uint64 // accessed using atomic memory operations
 }
 
-// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
-func (fstype FilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts vfs.NewFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
 	var fs filesystem
-	fs.vfsfs.Init(&fs)
+	fs.vfsfs.Init(vfsObj, &fs)
 	root := fs.newDentry(fs.newDirectory(creds, 01777))
 	return &fs.vfsfs, &root.vfsd, nil
 }
@@ -99,17 +99,17 @@ func (fs *filesystem) newDentry(inode *inode) *dentry {
 }
 
 // IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) IncRef() {
 	d.inode.incRef()
 }
 
 // TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef(vfsfs *vfs.Filesystem) bool {
+func (d *dentry) TryIncRef() bool {
 	return d.inode.tryIncRef()
 }
 
 // DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef(vfsfs *vfs.Filesystem) {
+func (d *dentry) DecRef() {
 	d.inode.decRef()
 }
 
@@ -266,11 +266,11 @@ type fileDescription struct {
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.VirtualDentry().Mount().Filesystem().Impl().(*filesystem)
+	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
 }
 
 func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.VirtualDentry().Dentry().Impl().(*dentry).inode
+	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
 // StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
index 732ed7c58..91cb4b1fc 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/memfs/named_pipe.go
@@ -54,6 +54,9 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
 	if err != nil {
 		return nil, err
 	}
-	fd.vfsfd.Init(&fd, rp.Mount(), vfsd)
+	mnt := rp.Mount()
+	mnt.IncRef()
+	vfsd.IncRef()
+	fd.vfsfd.Init(&fd, mnt, vfsd)
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index 0674b81a3..a3a870571 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -152,7 +152,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	// Create VFS.
 	vfsObj := vfs.New()
 	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.NewFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 09ed5a70e..40f4c1d09 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -118,7 +118,7 @@ func (d *Dentry) Impl() DentryImpl {
 type DentryImpl interface {
 	// IncRef increments the Dentry's reference count. A Dentry with a non-zero
 	// reference count must remain coherent with the state of the filesystem.
-	IncRef(fs *Filesystem)
+	IncRef()
 
 	// TryIncRef increments the Dentry's reference count and returns true. If
 	// the Dentry's reference count is zero, TryIncRef may do nothing and
@@ -126,10 +126,10 @@ type DentryImpl interface {
 	// guarantee that the Dentry is coherent with the state of the filesystem.)
 	//
 	// TryIncRef does not require that a reference is held on the Dentry.
-	TryIncRef(fs *Filesystem) bool
+	TryIncRef() bool
 
 	// DecRef decrements the Dentry's reference count.
-	DecRef(fs *Filesystem)
+	DecRef()
 }
 
 // IsDisowned returns true if d is disowned.
@@ -146,16 +146,20 @@ func (d *Dentry) isMounted() bool {
 	return atomic.LoadUint32(&d.mounts) != 0
 }
 
-func (d *Dentry) incRef(fs *Filesystem) {
-	d.impl.IncRef(fs)
+// IncRef increments d's reference count.
+func (d *Dentry) IncRef() {
+	d.impl.IncRef()
 }
 
-func (d *Dentry) tryIncRef(fs *Filesystem) bool {
-	return d.impl.TryIncRef(fs)
+// TryIncRef increments d's reference count and returns true. If d's reference
+// count is zero, TryIncRef may instead do nothing and return false.
+func (d *Dentry) TryIncRef() bool {
+	return d.impl.TryIncRef()
 }
 
-func (d *Dentry) decRef(fs *Filesystem) {
-	d.impl.DecRef(fs)
+// DecRef decrements d's reference count.
+func (d *Dentry) DecRef() {
+	d.impl.DecRef()
 }
 
 // These functions are exported so that filesystem implementations can use
@@ -420,6 +424,6 @@ func (vfs *VirtualFilesystem) forgetDisownedMountpoint(d *Dentry) {
 		vd.DecRef()
 	}
 	for _, mnt := range mountsToDecRef {
-		mnt.decRef()
+		mnt.DecRef()
 	}
 }
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 3a9665800..34007eb57 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -47,15 +47,14 @@ type FileDescription struct {
 	impl FileDescriptionImpl
 }
 
-// Init must be called before first use of fd. It takes references on mnt and
-// d.
+// Init must be called before first use of fd. It takes ownership of references
+// on mnt and d held by the caller.
 func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
 	fd.refs = 1
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
 	}
-	fd.vd.IncRef()
 	fd.impl = impl
 }
 
@@ -64,6 +63,18 @@ func (fd *FileDescription) Impl() FileDescriptionImpl {
 	return fd.impl
 }
 
+// Mount returns the mount on which fd was opened. It does not take a reference
+// on the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+	return fd.vd.mount
+}
+
+// Dentry returns the dentry at which fd was opened. It does not take a
+// reference on the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+	return fd.vd.dentry
+}
+
 // VirtualDentry returns the location at which fd was opened. It does not take
 // a reference on the returned VirtualDentry.
 func (fd *FileDescription) VirtualDentry() VirtualDentry {
@@ -75,6 +86,22 @@ func (fd *FileDescription) IncRef() {
 	atomic.AddInt64(&fd.refs, 1)
 }
 
+// TryIncRef increments fd's reference count and returns true. If fd's
+// reference count is already zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fd.
+func (fd *FileDescription) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&fd.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
 // DecRef decrements fd's reference count.
 func (fd *FileDescription) DecRef() {
 	if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 511b829fc..a5561dcbe 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -90,7 +90,7 @@ func TestGenCountFD(t *testing.T) {
 
 	vfsObj := New() // vfs.New()
 	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &NewFilesystemOptions{})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create testfs root mount: %v", err)
 	}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 7a074b718..76ff8cf51 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -33,29 +33,41 @@ type Filesystem struct {
 	// operations.
 	refs int64
 
+	// vfs is the VirtualFilesystem that uses this Filesystem. vfs is
+	// immutable.
+	vfs *VirtualFilesystem
+
 	// impl is the FilesystemImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in Dentry.
 	impl FilesystemImpl
 }
 
 // Init must be called before first use of fs.
-func (fs *Filesystem) Init(impl FilesystemImpl) {
+func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
 	fs.refs = 1
+	fs.vfs = vfsObj
 	fs.impl = impl
 }
 
+// VirtualFilesystem returns the containing VirtualFilesystem.
+func (fs *Filesystem) VirtualFilesystem() *VirtualFilesystem {
+	return fs.vfs
+}
+
 // Impl returns the FilesystemImpl associated with fs.
 func (fs *Filesystem) Impl() FilesystemImpl {
 	return fs.impl
 }
 
-func (fs *Filesystem) incRef() {
+// IncRef increments fs' reference count.
+func (fs *Filesystem) IncRef() {
 	if atomic.AddInt64(&fs.refs, 1) <= 1 {
-		panic("Filesystem.incRef() called without holding a reference")
+		panic("Filesystem.IncRef() called without holding a reference")
 	}
 }
 
-func (fs *Filesystem) decRef() {
+// DecRef decrements fs' reference count.
+func (fs *Filesystem) DecRef() {
 	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
 		fs.impl.Release()
 	} else if refs < 0 {
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index f401ad7f3..c335e206d 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -25,21 +25,21 @@ import (
 //
 // FilesystemType is analogous to Linux's struct file_system_type.
 type FilesystemType interface {
-	// NewFilesystem returns a Filesystem configured by the given options,
+	// GetFilesystem returns a Filesystem configured by the given options,
 	// along with its mount root. A reference is taken on the returned
 	// Filesystem and Dentry.
-	NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error)
+	GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error)
 }
 
-// NewFilesystemOptions contains options to FilesystemType.NewFilesystem.
-type NewFilesystemOptions struct {
+// GetFilesystemOptions contains options to FilesystemType.GetFilesystem.
+type GetFilesystemOptions struct {
 	// Data is the string passed as the 5th argument to mount(2), which is
 	// usually a comma-separated list of filesystem-specific mount options.
 	Data string
 
 	// InternalData holds opaque FilesystemType-specific data. There is
 	// intentionally no way for applications to specify InternalData; if it is
-	// not nil, the call to NewFilesystem originates from within the sentry.
+	// not nil, the call to GetFilesystem originates from within the sentry.
 	InternalData interface{}
 }
 
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 198fb8067..1c3b2e987 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -110,12 +110,12 @@ type MountNamespace struct {
 // NewMountNamespace returns a new mount namespace with a root filesystem
 // configured by the given arguments. A reference is taken on the returned
 // MountNamespace.
-func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *NewFilesystemOptions) (*MountNamespace, error) {
+func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
 	fsType := vfs.getFilesystemType(fsTypeName)
 	if fsType == nil {
 		return nil, syserror.ENODEV
 	}
-	fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
 	if err != nil {
 		return nil, err
 	}
@@ -133,13 +133,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
-// NewMount creates and mounts a new Filesystem.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *NewFilesystemOptions) error {
+// NewMount creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error {
 	fsType := vfs.getFilesystemType(fsTypeName)
 	if fsType == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.NewFilesystem(ctx, creds, source, *opts)
+	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
 	if err != nil {
 		return err
 	}
@@ -147,8 +147,8 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 	// lock ordering.
 	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
 	if err != nil {
-		root.decRef(fs)
-		fs.decRef()
+		root.DecRef()
+		fs.DecRef()
 		return err
 	}
 	vfs.mountMu.Lock()
@@ -158,8 +158,8 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 			vd.dentry.mu.Unlock()
 			vfs.mountMu.Unlock()
 			vd.DecRef()
-			root.decRef(fs)
-			fs.decRef()
+			root.DecRef()
+			fs.DecRef()
 			return syserror.ENOENT
 		}
 		// vd might have been mounted over between vfs.GetDentryAt() and
@@ -179,7 +179,7 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 			break
 		}
 		// This can't fail since we're holding vfs.mountMu.
-		nextmnt.root.incRef(nextmnt.fs)
+		nextmnt.root.IncRef()
 		vd.dentry.mu.Unlock()
 		vd.DecRef()
 		vd = VirtualDentry{
@@ -229,7 +229,7 @@ type umountRecursiveOptions struct {
 // Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef
 // respectively, and returns updated slices. (This is necessary because
 // filesystem locks possibly taken by DentryImpl.DecRef() may precede
-// vfs.mountMu in the lock order, and Mount.decRef() may lock vfs.mountMu.)
+// vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.)
 //
 // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree().
 //
@@ -322,13 +322,15 @@ func (mnt *Mount) tryIncMountedRef() bool {
 	}
 }
 
-func (mnt *Mount) incRef() {
+// IncRef increments mnt's reference count.
+func (mnt *Mount) IncRef() {
 	// In general, negative values for mnt.refs are valid because the MSB is
 	// the eager-unmount bit.
 	atomic.AddInt64(&mnt.refs, 1)
 }
 
-func (mnt *Mount) decRef() {
+// DecRef decrements mnt's reference count.
+func (mnt *Mount) DecRef() {
 	refs := atomic.AddInt64(&mnt.refs, -1)
 	if refs&^math.MinInt64 == 0 { // mask out MSB
 		var vd VirtualDentry
@@ -339,8 +341,8 @@ func (mnt *Mount) decRef() {
 			mnt.vfs.mounts.seq.EndWrite()
 			mnt.vfs.mountMu.Unlock()
 		}
-		mnt.root.decRef(mnt.fs)
-		mnt.fs.decRef()
+		mnt.root.DecRef()
+		mnt.fs.DecRef()
 		if vd.Ok() {
 			vd.DecRef()
 		}
@@ -368,7 +370,7 @@ func (mntns *MountNamespace) DecRef(vfs *VirtualFilesystem) {
 			vd.DecRef()
 		}
 		for _, mnt := range mountsToDecRef {
-			mnt.decRef()
+			mnt.DecRef()
 		}
 	} else if refs < 0 {
 		panic("MountNamespace.DecRef() called without holding a reference")
@@ -413,7 +415,7 @@ retryFirst:
 			// Raced with umount.
 			continue
 		}
-		mnt.decRef()
+		mnt.DecRef()
 		mnt = next
 		d = next.root
 	}
@@ -447,15 +449,15 @@ retryFirst:
 		// Raced with umount.
 		goto retryFirst
 	}
-	if !point.tryIncRef(parent.fs) {
+	if !point.TryIncRef() {
 		// Since Mount holds a reference on Mount.key.point, this can only
 		// happen due to a racing change to Mount.key.
-		parent.decRef()
+		parent.DecRef()
 		goto retryFirst
 	}
 	if !vfs.mounts.seq.ReadOk(epoch) {
-		point.decRef(parent.fs)
-		parent.decRef()
+		point.DecRef()
+		parent.DecRef()
 		goto retryFirst
 	}
 	mnt = parent
@@ -480,19 +482,19 @@ retryFirst:
 			// Raced with umount.
 			goto retryNotFirst
 		}
-		if !point.tryIncRef(parent.fs) {
+		if !point.TryIncRef() {
 			// Since Mount holds a reference on Mount.key.point, this can
 			// only happen due to a racing change to Mount.key.
-			parent.decRef()
+			parent.DecRef()
 			goto retryNotFirst
 		}
 		if !vfs.mounts.seq.ReadOk(epoch) {
-			point.decRef(parent.fs)
-			parent.decRef()
+			point.DecRef()
+			parent.DecRef()
 			goto retryNotFirst
 		}
-		d.decRef(mnt.fs)
-		mnt.decRef()
+		d.DecRef()
+		mnt.DecRef()
 		mnt = parent
 		d = point
 	}
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index 61bce6426..621f5a6f8 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -149,20 +149,20 @@ func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
 
 func (rp *ResolvingPath) decRefStartAndMount() {
 	if rp.flags&rpflagsHaveStartRef != 0 {
-		rp.start.decRef(rp.mount.fs)
+		rp.start.DecRef()
 	}
 	if rp.flags&rpflagsHaveMountRef != 0 {
-		rp.mount.decRef()
+		rp.mount.DecRef()
 	}
 }
 
 func (rp *ResolvingPath) releaseErrorState() {
 	if rp.nextStart != nil {
-		rp.nextStart.decRef(rp.nextMount.fs)
+		rp.nextStart.DecRef()
 		rp.nextStart = nil
 	}
 	if rp.nextMount != nil {
-		rp.nextMount.decRef()
+		rp.nextMount.DecRef()
 		rp.nextMount = nil
 	}
 }
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
index 49952b2cc..436151afa 100644
--- a/pkg/sentry/vfs/syscalls.go
+++ b/pkg/sentry/vfs/syscalls.go
@@ -63,7 +63,7 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
 				mount:  rp.mount,
 				dentry: d,
 			}
-			rp.mount.incRef()
+			rp.mount.IncRef()
 			vfs.putResolvingPath(rp)
 			return vd, nil
 		}
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 70b192ece..593144cb7 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -33,10 +33,10 @@ type FDTestFilesystem struct {
 	vfsfs Filesystem
 }
 
-// NewFilesystem implements FilesystemType.NewFilesystem.
-func (fstype FDTestFilesystemType) NewFilesystem(ctx context.Context, creds *auth.Credentials, source string, opts NewFilesystemOptions) (*Filesystem, *Dentry, error) {
+// GetFilesystem implements FilesystemType.GetFilesystem.
+func (fstype FDTestFilesystemType) GetFilesystem(ctx context.Context, vfsObj *VirtualFilesystem, creds *auth.Credentials, source string, opts GetFilesystemOptions) (*Filesystem, *Dentry, error) {
 	var fs FDTestFilesystem
-	fs.vfsfs.Init(&fs)
+	fs.vfsfs.Init(vfsObj, &fs)
 	return &fs.vfsfs, fs.NewDentry(), nil
 }
 
@@ -126,14 +126,14 @@ func (fs *FDTestFilesystem) NewDentry() *Dentry {
 }
 
 // IncRef implements DentryImpl.IncRef.
-func (d *fdTestDentry) IncRef(vfsfs *Filesystem) {
+func (d *fdTestDentry) IncRef() {
 }
 
 // TryIncRef implements DentryImpl.TryIncRef.
-func (d *fdTestDentry) TryIncRef(vfsfs *Filesystem) bool {
+func (d *fdTestDentry) TryIncRef() bool {
 	return true
 }
 
 // DecRef implements DentryImpl.DecRef.
-func (d *fdTestDentry) DecRef(vfsfs *Filesystem) {
+func (d *fdTestDentry) DecRef() {
 }
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 88e865d86..f0cd3ffe5 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -116,15 +116,15 @@ func (vd VirtualDentry) Ok() bool {
 // IncRef increments the reference counts on the Mount and Dentry represented
 // by vd.
 func (vd VirtualDentry) IncRef() {
-	vd.mount.incRef()
-	vd.dentry.incRef(vd.mount.fs)
+	vd.mount.IncRef()
+	vd.dentry.IncRef()
 }
 
 // DecRef decrements the reference counts on the Mount and Dentry represented
 // by vd.
 func (vd VirtualDentry) DecRef() {
-	vd.dentry.decRef(vd.mount.fs)
-	vd.mount.decRef()
+	vd.dentry.DecRef()
+	vd.mount.DecRef()
 }
 
 // Mount returns the Mount associated with vd. It does not take a reference on
-- 
cgit v1.2.3


From 46651a7d26559bdc69d460bdeb4de5968212d615 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Tue, 10 Dec 2019 18:16:47 -0800
Subject: Add most VFS methods for syscalls.

PiperOrigin-RevId: 284892289
---
 pkg/abi/linux/file.go                             |  10 +-
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |   6 +-
 pkg/sentry/fsimpl/ext/ext_test.go                 |  29 +-
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |   2 +-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |   6 +-
 pkg/sentry/vfs/BUILD                              |   1 -
 pkg/sentry/vfs/file_description.go                |  93 ++++++
 pkg/sentry/vfs/file_description_impl_util_test.go |  10 +-
 pkg/sentry/vfs/filesystem.go                      |  22 ++
 pkg/sentry/vfs/mount.go                           |  69 +++-
 pkg/sentry/vfs/options.go                         |  12 +
 pkg/sentry/vfs/syscalls.go                        | 237 --------------
 pkg/sentry/vfs/vfs.go                             | 378 ++++++++++++++++++++++
 13 files changed, 606 insertions(+), 269 deletions(-)
 delete mode 100644 pkg/sentry/vfs/syscalls.go

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/abi/linux/file.go b/pkg/abi/linux/file.go
index c9ee098f4..0f014d27f 100644
--- a/pkg/abi/linux/file.go
+++ b/pkg/abi/linux/file.go
@@ -144,9 +144,13 @@ const (
 	ModeCharacterDevice = S_IFCHR
 	ModeNamedPipe       = S_IFIFO
 
-	ModeSetUID = 04000
-	ModeSetGID = 02000
-	ModeSticky = 01000
+	S_ISUID = 04000
+	S_ISGID = 02000
+	S_ISVTX = 01000
+
+	ModeSetUID = S_ISUID
+	ModeSetGID = S_ISGID
+	ModeSticky = S_ISVTX
 
 	ModeUserAll     = 0700
 	ModeUserRead    = 0400
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 94cd74095..177ce2cb9 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -81,7 +81,11 @@ func mount(b *testing.B, imagePath string, vfsfs *vfs.VirtualFilesystem, pop *vf
 	ctx := contexttest.Context(b)
 	creds := auth.CredentialsFromContext(ctx)
 
-	if err := vfsfs.NewMount(ctx, creds, imagePath, pop, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())}); err != nil {
+	if err := vfsfs.MountAt(ctx, creds, imagePath, pop, "extfs", &vfs.MountOptions{
+		GetFilesystemOptions: vfs.GetFilesystemOptions{
+			InternalData: int(f.Fd()),
+		},
+	}); err != nil {
 		b.Fatalf("failed to mount tmpfs submount: %v", err)
 	}
 	return func() {
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 307e4d68c..e9f756732 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -147,55 +147,54 @@ func TestSeek(t *testing.T) {
 				t.Fatalf("vfsfs.OpenAt failed: %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
+			if n, err := fd.Seek(ctx, 0, linux.SEEK_SET); n != 0 || err != nil {
 				t.Errorf("expected seek position 0, got %d and error %v", n, err)
 			}
 
-			stat, err := fd.Impl().Stat(ctx, vfs.StatOptions{})
+			stat, err := fd.Stat(ctx, vfs.StatOptions{})
 			if err != nil {
 				t.Errorf("fd.stat failed for file %s in image %s: %v", test.path, test.image, err)
 			}
 
 			// We should be able to seek beyond the end of file.
 			size := int64(stat.Size)
-			if n, err := fd.Impl().Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
+			if n, err := fd.Seek(ctx, size, linux.SEEK_SET); n != size || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -1, linux.SEEK_SET); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
-			if n, err := fd.Impl().Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
+			if n, err := fd.Seek(ctx, 3, linux.SEEK_CUR); n != size+3 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+3, n, err)
 			}
 
 			// Make sure negative offsets work with SEEK_CUR.
-			if n, err := fd.Impl().Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
+			if n, err := fd.Seek(ctx, -2, linux.SEEK_CUR); n != size+1 || err != nil {
 				t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 			}
 
 			// EINVAL should be returned if the resulting offset is negative.
-			if _, err := fd.Impl().Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
+			if _, err := fd.Seek(ctx, -(size + 2), linux.SEEK_CUR); err != syserror.EINVAL {
 				t.Errorf("expected error EINVAL but got %v", err)
 			}
 
 			// Make sure SEEK_END works with regular files.
-			switch fd.Impl().(type) {
-			case *regularFileFD:
+			if _, ok := fd.Impl().(*regularFileFD); ok {
 				// Seek back to 0.
-				if n, err := fd.Impl().Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
+				if n, err := fd.Seek(ctx, -size, linux.SEEK_END); n != 0 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", 0, n, err)
 				}
 
 				// Seek forward beyond EOF.
-				if n, err := fd.Impl().Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
+				if n, err := fd.Seek(ctx, 1, linux.SEEK_END); n != size+1 || err != nil {
 					t.Errorf("expected seek position %d, got %d and error %v", size+1, n, err)
 				}
 
 				// EINVAL should be returned if the resulting offset is negative.
-				if _, err := fd.Impl().Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
+				if _, err := fd.Seek(ctx, -(size + 1), linux.SEEK_END); err != syserror.EINVAL {
 					t.Errorf("expected error EINVAL but got %v", err)
 				}
 			}
@@ -456,7 +455,7 @@ func TestRead(t *testing.T) {
 			want := make([]byte, 1)
 			for {
 				n, err := f.Read(want)
-				fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
+				fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{})
 
 				if diff := cmp.Diff(got, want); diff != "" {
 					t.Errorf("file data mismatch (-want +got):\n%s", diff)
@@ -464,7 +463,7 @@ func TestRead(t *testing.T) {
 
 				// Make sure there is no more file data left after getting EOF.
 				if n == 0 || err == io.EOF {
-					if n, _ := fd.Impl().Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
+					if n, _ := fd.Read(ctx, usermem.BytesIOSequence(got), vfs.ReadOptions{}); n != 0 {
 						t.Errorf("extra unexpected file data in file %s in image %s", test.absPath, test.image)
 					}
 
@@ -574,7 +573,7 @@ func TestIterDirents(t *testing.T) {
 			}
 
 			cb := &iterDirentsCb{}
-			if err = fd.Impl().IterDirents(ctx, cb); err != nil {
+			if err = fd.IterDirents(ctx, cb); err != nil {
 				t.Fatalf("dir fd.IterDirents() failed: %v", err)
 			}
 
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index ea6417ce7..4a7a94a52 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -394,7 +394,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			}
 			defer mountPoint.DecRef()
 			// Create and mount the submount.
-			if err := vfsObj.NewMount(ctx, creds, "", &pop, "memfs", &vfs.GetFilesystemOptions{}); err != nil {
+			if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil {
 				b.Fatalf("failed to mount tmpfs submount: %v", err)
 			}
 			filePathBuilder.WriteString(mountPointName)
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index a3a870571..5bf527c80 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -194,7 +194,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 	readData := make([]byte, 1)
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != syserror.ErrWouldBlock {
 		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
 	}
@@ -207,7 +207,7 @@ func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
 func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	writeData := []byte(msg)
 	src := usermem.BytesIOSequence(writeData)
-	bytesWritten, err := fd.Impl().Write(ctx, src, vfs.WriteOptions{})
+	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
 	if err != nil {
 		t.Fatalf("error writing to pipe %q: %v", fileName, err)
 	}
@@ -220,7 +220,7 @@ func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg
 func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
 	readData := make([]byte, len(msg))
 	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Impl().Read(ctx, dst, vfs.ReadOptions{})
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
 	if err != nil {
 		t.Fatalf("error reading from pipe %q: %v", fileName, err)
 	}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 74a325309..59237c3b9 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -19,7 +19,6 @@ go_library(
         "options.go",
         "permissions.go",
         "resolving_path.go",
-        "syscalls.go",
         "testutil.go",
         "vfs.go",
     ],
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 34007eb57..4473dfce8 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -241,3 +241,96 @@ type IterDirentsCallback interface {
 	// called.
 	Handle(dirent Dirent) bool
 }
+
+// OnClose is called when a file descriptor representing the FileDescription is
+// closed. Returning a non-nil error should not prevent the file descriptor
+// from being closed.
+func (fd *FileDescription) OnClose(ctx context.Context) error {
+	return fd.impl.OnClose(ctx)
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
+	flags, err := fd.impl.StatusFlags(ctx)
+	flags |= linux.O_LARGEFILE
+	return flags, err
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
+	return fd.impl.SetStatusFlags(ctx, flags)
+}
+
+// Stat returns metadata for the file represented by fd.
+func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	return fd.impl.Stat(ctx, opts)
+}
+
+// SetStat updates metadata for the file represented by fd.
+func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+	return fd.impl.SetStat(ctx, opts)
+}
+
+// StatFS returns metadata for the filesystem containing the file represented
+// by fd.
+func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	return fd.impl.StatFS(ctx)
+}
+
+// PRead reads from the file represented by fd into dst, starting at the given
+// offset, and returns the number of bytes read. PRead is permitted to return
+// partial reads with a nil error.
+func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
+	return fd.impl.PRead(ctx, dst, offset, opts)
+}
+
+// Read is similar to PRead, but does not specify an offset.
+func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
+	return fd.impl.Read(ctx, dst, opts)
+}
+
+// PWrite writes src to the file represented by fd, starting at the given
+// offset, and returns the number of bytes written. PWrite is permitted to
+// return partial writes with a nil error.
+func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
+	return fd.impl.PWrite(ctx, src, offset, opts)
+}
+
+// Write is similar to PWrite, but does not specify an offset.
+func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
+	return fd.impl.Write(ctx, src, opts)
+}
+
+// IterDirents invokes cb on each entry in the directory represented by fd. If
+// IterDirents has been called since the last call to Seek, it continues
+// iteration from the end of the last call.
+func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
+	return fd.impl.IterDirents(ctx, cb)
+}
+
+// Seek changes fd's offset (assuming one exists) and returns its new value.
+func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	return fd.impl.Seek(ctx, offset, whence)
+}
+
+// Sync has the semantics of fsync(2).
+func (fd *FileDescription) Sync(ctx context.Context) error {
+	return fd.impl.Sync(ctx)
+}
+
+// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
+// fd.
+func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
+	return fd.impl.ConfigureMMap(ctx, opts)
+}
+
+// Ioctl implements the ioctl(2) syscall.
+func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
+	return fd.impl.Ioctl(ctx, uio, args)
+}
+
+// SyncFS instructs the filesystem containing fd to execute the semantics of
+// syncfs(2).
+func (fd *FileDescription) SyncFS(ctx context.Context) error {
+	return fd.vd.mount.fs.impl.Sync(ctx)
+}
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index a5561dcbe..ac7799296 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -103,7 +103,7 @@ func TestGenCountFD(t *testing.T) {
 	// The first read causes Generate to be called to fill the FD's buffer.
 	buf := make([]byte, 2)
 	ioseq := usermem.BytesIOSequence(buf)
-	n, err := fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err := fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("first Read: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
@@ -112,17 +112,17 @@ func TestGenCountFD(t *testing.T) {
 	}
 
 	// A second read without seeking is still at EOF.
-	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 0 || err != io.EOF {
 		t.Fatalf("second Read: got (%d, %v), wanted (0, EOF)", n, err)
 	}
 
 	// Seeking to the beginning of the file causes it to be regenerated.
-	n, err = fd.Impl().Seek(ctx, 0, linux.SEEK_SET)
+	n, err = fd.Seek(ctx, 0, linux.SEEK_SET)
 	if n != 0 || err != nil {
 		t.Fatalf("Seek: got (%d, %v), wanted (0, nil)", n, err)
 	}
-	n, err = fd.Impl().Read(ctx, ioseq, ReadOptions{})
+	n, err = fd.Read(ctx, ioseq, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("Read after Seek: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
@@ -131,7 +131,7 @@ func TestGenCountFD(t *testing.T) {
 	}
 
 	// PRead at the beginning of the file also causes it to be regenerated.
-	n, err = fd.Impl().PRead(ctx, ioseq, 0, ReadOptions{})
+	n, err = fd.PRead(ctx, ioseq, 0, ReadOptions{})
 	if n != 1 || (err != nil && err != io.EOF) {
 		t.Fatalf("PRead: got (%d, %v), wanted (1, nil or EOF)", n, err)
 	}
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 76ff8cf51..dfbd2372a 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -47,6 +47,9 @@ func (fs *Filesystem) Init(vfsObj *VirtualFilesystem, impl FilesystemImpl) {
 	fs.refs = 1
 	fs.vfs = vfsObj
 	fs.impl = impl
+	vfsObj.filesystemsMu.Lock()
+	vfsObj.filesystems[fs] = struct{}{}
+	vfsObj.filesystemsMu.Unlock()
 }
 
 // VirtualFilesystem returns the containing VirtualFilesystem.
@@ -66,9 +69,28 @@ func (fs *Filesystem) IncRef() {
 	}
 }
 
+// TryIncRef increments fs' reference count and returns true. If fs' reference
+// count is zero, TryIncRef does nothing and returns false.
+//
+// TryIncRef does not require that a reference is held on fs.
+func (fs *Filesystem) TryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&fs.refs)
+		if refs <= 0 {
+			return false
+		}
+		if atomic.CompareAndSwapInt64(&fs.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
 // DecRef decrements fs' reference count.
 func (fs *Filesystem) DecRef() {
 	if refs := atomic.AddInt64(&fs.refs, -1); refs == 0 {
+		fs.vfs.filesystemsMu.Lock()
+		delete(fs.vfs.filesystems, fs)
+		fs.vfs.filesystemsMu.Unlock()
 		fs.impl.Release()
 	} else if refs < 0 {
 		panic("Filesystem.decRef() called without holding a reference")
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index 1c3b2e987..ec23ab0dd 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -18,6 +18,7 @@ import (
 	"math"
 	"sync/atomic"
 
+	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -133,13 +134,13 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 	return mntns, nil
 }
 
-// NewMount creates and mounts a Filesystem configured by the given arguments.
-func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *GetFilesystemOptions) error {
+// MountAt creates and mounts a Filesystem configured by the given arguments.
+func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
 	fsType := vfs.getFilesystemType(fsTypeName)
 	if fsType == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return err
 	}
@@ -207,6 +208,68 @@ func (vfs *VirtualFilesystem) NewMount(ctx context.Context, creds *auth.Credenti
 	return nil
 }
 
+// UmountAt removes the Mount at the given path.
+func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error {
+	if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 {
+		return syserror.EINVAL
+	}
+
+	// MNT_FORCE is unimplemented except for its permission check (CAP_SYS_ADMIN in the root userns is required).
+	if opts.Flags&linux.MNT_FORCE != 0 && !creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) {
+		return syserror.EPERM
+	}
+
+	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	defer vd.DecRef()
+	if vd.dentry != vd.mount.root {
+		return syserror.EINVAL
+	}
+	vfs.mountMu.Lock()
+	if mntns := MountNamespaceFromContext(ctx); mntns != nil && mntns != vd.mount.ns {
+		vfs.mountMu.Unlock()
+		return syserror.EINVAL
+	}
+
+	// TODO(jamieliu): Linux special-cases umount of the caller's root, which
+	// we don't implement yet (we'll just fail it since the caller holds a
+	// reference on it).
+
+	vfs.mounts.seq.BeginWrite()
+	if opts.Flags&linux.MNT_DETACH == 0 {
+		if len(vd.mount.children) != 0 {
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+		// We are holding a reference on vd.mount.
+		expectedRefs := int64(1)
+		if !vd.mount.umounted {
+			expectedRefs = 2
+		}
+		if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB
+			vfs.mounts.seq.EndWrite()
+			vfs.mountMu.Unlock()
+			return syserror.EBUSY
+		}
+	}
+	vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{
+		eager:               opts.Flags&linux.MNT_DETACH == 0,
+		disconnectHierarchy: true,
+	}, nil, nil)
+	vfs.mounts.seq.EndWrite()
+	vfs.mountMu.Unlock()
+	for _, vd := range vdsToDecRef {
+		vd.DecRef()
+	}
+	for _, mnt := range mountsToDecRef {
+		mnt.DecRef()
+	}
+	return nil
+}
+
 type umountRecursiveOptions struct {
 	// If eager is true, ensure that future calls to Mount.tryIncMountedRef()
 	// on umounted mounts fail.
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3aa73d911..3ecbc8fc1 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -46,6 +46,12 @@ type MknodOptions struct {
 	DevMinor uint32
 }
 
+// MountOptions contains options to VirtualFilesystem.MountAt().
+type MountOptions struct {
+	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
+	GetFilesystemOptions GetFilesystemOptions
+}
+
 // OpenOptions contains options to VirtualFilesystem.OpenAt() and
 // FilesystemImpl.OpenAt().
 type OpenOptions struct {
@@ -114,6 +120,12 @@ type StatOptions struct {
 	Sync uint32
 }
 
+// UmountOptions contains options to VirtualFilesystem.UmountAt().
+type UmountOptions struct {
+	// Flags contains flags as specified for umount2(2).
+	Flags uint32
+}
+
 // WriteOptions contains options to FileDescription.PWrite(),
 // FileDescriptionImpl.PWrite(), FileDescription.Write(), and
 // FileDescriptionImpl.Write().
diff --git a/pkg/sentry/vfs/syscalls.go b/pkg/sentry/vfs/syscalls.go
deleted file mode 100644
index 436151afa..000000000
--- a/pkg/sentry/vfs/syscalls.go
+++ /dev/null
@@ -1,237 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package vfs
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// PathOperation specifies the path operated on by a VFS method.
-//
-// PathOperation is passed to VFS methods by pointer to reduce memory copying:
-// it's somewhat large and should never escape. (Options structs are passed by
-// pointer to VFS and FileDescription methods for the same reason.)
-type PathOperation struct {
-	// Root is the VFS root. References on Root are borrowed from the provider
-	// of the PathOperation.
-	//
-	// Invariants: Root.Ok().
-	Root VirtualDentry
-
-	// Start is the starting point for the path traversal. References on Start
-	// are borrowed from the provider of the PathOperation (i.e. the caller of
-	// the VFS method to which the PathOperation was passed).
-	//
-	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
-	Start VirtualDentry
-
-	// Path is the pathname traversed by this operation.
-	Pathname string
-
-	// If FollowFinalSymlink is true, and the Dentry traversed by the final
-	// path component represents a symbolic link, the symbolic link should be
-	// followed.
-	FollowFinalSymlink bool
-}
-
-// GetDentryAt returns a VirtualDentry representing the given path, at which a
-// file must exist. A reference is taken on the returned VirtualDentry.
-func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return VirtualDentry{}, err
-	}
-	for {
-		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
-		if err == nil {
-			vd := VirtualDentry{
-				mount:  rp.mount,
-				dentry: d,
-			}
-			rp.mount.IncRef()
-			vfs.putResolvingPath(rp)
-			return vd, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return VirtualDentry{}, err
-		}
-	}
-}
-
-// MkdirAt creates a directory at the given path.
-func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
-	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
-	// also honored." - mkdir(2)
-	opts.Mode &= 01777
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
-	for {
-		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return err
-		}
-	}
-}
-
-// MknodAt creates a file of the given mode at the given path. It returns an
-// error from the syserror package.
-func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil
-	}
-	for {
-		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
-			vfs.putResolvingPath(rp)
-			return nil
-		}
-		// Handle mount traversals.
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return err
-		}
-	}
-}
-
-// OpenAt returns a FileDescription providing access to the file at the given
-// path. A reference is taken on the returned FileDescription.
-func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
-	// Remove:
-	//
-	// - O_LARGEFILE, which we always report in FileDescription status flags
-	// since only 64-bit architectures are supported at this time.
-	//
-	// - O_CLOEXEC, which affects file descriptors and therefore must be
-	// handled outside of VFS.
-	//
-	// - Unknown flags.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
-	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
-	if opts.Flags&linux.O_SYNC != 0 {
-		opts.Flags |= linux.O_DSYNC
-	}
-	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
-	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
-	// filesystem implementations that do not support it).
-	if opts.Flags&linux.O_TMPFILE != 0 {
-		if opts.Flags&linux.O_DIRECTORY == 0 {
-			return nil, syserror.EINVAL
-		}
-		if opts.Flags&linux.O_CREAT != 0 {
-			return nil, syserror.EINVAL
-		}
-		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
-			return nil, syserror.EINVAL
-		}
-	}
-	// O_PATH causes most other flags to be ignored.
-	if opts.Flags&linux.O_PATH != 0 {
-		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
-	}
-	// "On Linux, the following bits are also honored in mode: [S_ISUID,
-	// S_ISGID, S_ISVTX]" - open(2)
-	opts.Mode &= 07777
-
-	if opts.Flags&linux.O_NOFOLLOW != 0 {
-		pop.FollowFinalSymlink = false
-	}
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
-	if opts.Flags&linux.O_DIRECTORY != 0 {
-		rp.mustBeDir = true
-		rp.mustBeDirOrig = true
-	}
-	for {
-		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return fd, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return nil, err
-		}
-	}
-}
-
-// StatAt returns metadata for the file at the given path.
-func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	for {
-		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
-		if err == nil {
-			vfs.putResolvingPath(rp)
-			return stat, nil
-		}
-		if !rp.handleError(err) {
-			vfs.putResolvingPath(rp)
-			return linux.Statx{}, err
-		}
-	}
-}
-
-// StatusFlags returns file description status flags.
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	flags, err := fd.impl.StatusFlags(ctx)
-	flags |= linux.O_LARGEFILE
-	return flags, err
-}
-
-// SetStatusFlags sets file description status flags.
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	return fd.impl.SetStatusFlags(ctx, flags)
-}
-
-// TODO:
-//
-// - VFS.SyncAllFilesystems() for sync(2)
-//
-// - Something for syncfs(2)
-//
-// - VFS.LinkAt()
-//
-// - VFS.ReadlinkAt()
-//
-// - VFS.RenameAt()
-//
-// - VFS.RmdirAt()
-//
-// - VFS.SetStatAt()
-//
-// - VFS.StatFSAt()
-//
-// - VFS.SymlinkAt()
-//
-// - VFS.UmountAt()
-//
-// - VFS.UnlinkAt()
-//
-// - FileDescription.(almost everything)
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index f0cd3ffe5..7262b0d0a 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -20,6 +20,7 @@
 //   VirtualFilesystem.mountMu
 //     Dentry.mu
 //       Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
+//     VirtualFilesystem.filesystemsMu
 // VirtualFilesystem.fsTypesMu
 //
 // Locking Dentry.mu in multiple Dentries requires holding
@@ -28,6 +29,11 @@ package vfs
 
 import (
 	"sync"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
@@ -67,6 +73,11 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// filesystems contains all Filesystems. filesystems is protected by
+	// filesystemsMu.
+	filesystemsMu sync.Mutex
+	filesystems   map[*Filesystem]struct{}
+
 	// fsTypes contains all FilesystemTypes that are usable in the
 	// VirtualFilesystem. fsTypes is protected by fsTypesMu.
 	fsTypesMu sync.RWMutex
@@ -77,12 +88,379 @@ type VirtualFilesystem struct {
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		filesystems: make(map[*Filesystem]struct{}),
 		fsTypes:     make(map[string]FilesystemType),
 	}
 	vfs.mounts.Init()
 	return vfs
 }
 
+// PathOperation specifies the path operated on by a VFS method.
+//
+// PathOperation is passed to VFS methods by pointer to reduce memory copying:
+// it's somewhat large and should never escape. (Options structs are passed by
+// pointer to VFS and FileDescription methods for the same reason.)
+type PathOperation struct {
+	// Root is the VFS root. References on Root are borrowed from the provider
+	// of the PathOperation.
+	//
+	// Invariants: Root.Ok().
+	Root VirtualDentry
+
+	// Start is the starting point for the path traversal. References on Start
+	// are borrowed from the provider of the PathOperation (i.e. the caller of
+	// the VFS method to which the PathOperation was passed).
+	//
+	// Invariants: Start.Ok(). If Pathname is absolute, then Start == Root.
+	Start VirtualDentry
+
+	// Pathname is the pathname traversed by this operation.
+	Pathname string
+
+	// If FollowFinalSymlink is true, and the Dentry traversed by the final
+	// path component represents a symbolic link, the symbolic link should be
+	// followed.
+	FollowFinalSymlink bool
+}
+
+// GetDentryAt returns a VirtualDentry representing the given path, at which a
+// file must exist. A reference is taken on the returned VirtualDentry.
+func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return VirtualDentry{}, err
+	}
+	for {
+		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
+		if err == nil {
+			vd := VirtualDentry{
+				mount:  rp.mount,
+				dentry: d,
+			}
+			rp.mount.IncRef()
+			vfs.putResolvingPath(rp)
+			return vd, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, err
+		}
+	}
+}
+
+// LinkAt creates a hard link at newpop representing the existing file at
+// oldpop.
+func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
+	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	rp, err := vfs.getResolvingPath(creds, newpop)
+	if err != nil {
+		oldVD.DecRef()
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
+		if err == nil {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// MkdirAt creates a directory at the given path.
+func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
+	// also honored." - mkdir(2)
+	opts.Mode &= 0777 | linux.S_ISVTX
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// MknodAt creates a file of the given mode at the given path. It returns an
+// error from the syserror package.
+func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		// Handle mount traversals.
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// OpenAt returns a FileDescription providing access to the file at the given
+// path. A reference is taken on the returned FileDescription.
+func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
+	// Remove:
+	//
+	// - O_LARGEFILE, which we always report in FileDescription status flags
+	// since only 64-bit architectures are supported at this time.
+	//
+	// - O_CLOEXEC, which affects file descriptors and therefore must be
+	// handled outside of VFS.
+	//
+	// - Unknown flags.
+	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
+	// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
+	if opts.Flags&linux.O_SYNC != 0 {
+		opts.Flags |= linux.O_DSYNC
+	}
+	// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
+	// with O_DIRECTORY and a writable access mode (to ensure that it fails on
+	// filesystem implementations that do not support it).
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		if opts.Flags&linux.O_DIRECTORY == 0 {
+			return nil, syserror.EINVAL
+		}
+		if opts.Flags&linux.O_CREAT != 0 {
+			return nil, syserror.EINVAL
+		}
+		if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
+			return nil, syserror.EINVAL
+		}
+	}
+	// O_PATH causes most other flags to be ignored.
+	if opts.Flags&linux.O_PATH != 0 {
+		opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
+	}
+	// "On Linux, the following bits are also honored in mode: [S_ISUID,
+	// S_ISGID, S_ISVTX]" - open(2)
+	opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
+
+	if opts.Flags&linux.O_NOFOLLOW != 0 {
+		pop.FollowFinalSymlink = false
+	}
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil, err
+	}
+	if opts.Flags&linux.O_DIRECTORY != 0 {
+		rp.mustBeDir = true
+		rp.mustBeDirOrig = true
+	}
+	for {
+		fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return fd, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// ReadlinkAt returns the target of the symbolic link at the given path.
+func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return "", err
+	}
+	for {
+		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return target, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// RenameAt renames the file at oldpop to newpop.
+func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
+	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
+	if err != nil {
+		return err
+	}
+	rp, err := vfs.getResolvingPath(creds, newpop)
+	if err != nil {
+		oldVD.DecRef()
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts)
+		if err == nil {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			oldVD.DecRef()
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// RmdirAt removes the directory at the given path.
+func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// SetStatAt changes metadata for the file at the given path.
+func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// StatAt returns metadata for the file at the given path.
+func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	for {
+		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return stat, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return linux.Statx{}, err
+		}
+	}
+}
+
+// StatFSAt returns metadata for the filesystem containing the file at the
+// given path.
+func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	for {
+		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return statfs, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return linux.Statfs{}, err
+		}
+	}
+}
+
+// SymlinkAt creates a symbolic link at the given path with the given target.
+func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// UnlinkAt deletes the non-directory file at the given path.
+func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// SyncAllFilesystems has the semantics of Linux's sync(2).
+func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
+	fss := make(map[*Filesystem]struct{})
+	vfs.filesystemsMu.Lock()
+	for fs := range vfs.filesystems {
+		if !fs.TryIncRef() {
+			continue
+		}
+		fss[fs] = struct{}{}
+	}
+	vfs.filesystemsMu.Unlock()
+	var retErr error
+	for fs := range fss {
+		if err := fs.impl.Sync(ctx); err != nil && retErr == nil {
+			retErr = err
+		}
+		fs.DecRef()
+	}
+	return retErr
+}
+
 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry
 // (which represents a node in a Filesystem's tree) and a Mount (which
 // represents the Filesystem's position in a VFS mount tree).
-- 
cgit v1.2.3


From 481dbfa5ab24ec2c0752b9e748d3617285603c4e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 11 Dec 2019 13:40:57 -0800
Subject: Add vfs.Pathname{WithDeleted,ForGetcwd}.

The former is needed for vfs.FileDescription to implement
memmap.MappingIdentity, and the latter is needed to implement getcwd(2).

PiperOrigin-RevId: 285051855
---
 pkg/sentry/fsimpl/ext/BUILD            |   1 +
 pkg/sentry/fsimpl/ext/filesystem.go    |   8 ++
 pkg/sentry/fsimpl/memfs/BUILD          |   1 +
 pkg/sentry/fsimpl/memfs/filesystem.go  |   8 ++
 pkg/sentry/vfs/BUILD                   |   1 +
 pkg/sentry/vfs/filesystem.go           |  54 +++++++++++-
 pkg/sentry/vfs/filesystem_impl_util.go |  26 ++++++
 pkg/sentry/vfs/pathname.go             | 153 +++++++++++++++++++++++++++++++++
 pkg/sentry/vfs/testutil.go             |   9 ++
 9 files changed, 260 insertions(+), 1 deletion(-)
 create mode 100644 pkg/sentry/vfs/pathname.go

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 7ccff8b0d..880b7bcd3 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -38,6 +38,7 @@ go_library(
         "//pkg/abi/linux",
         "//pkg/binary",
         "//pkg/fd",
+        "//pkg/fspath",
         "//pkg/log",
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index 2d15e8aaf..e7aa3b41b 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -20,6 +20,7 @@ import (
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
@@ -441,3 +442,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 
 	return syserror.EROFS
 }
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index bc5c0b591..0cc751eb8 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -32,6 +32,7 @@ go_library(
     deps = [
         "//pkg/abi/linux",
         "//pkg/amutex",
+        "//pkg/fspath",
         "//pkg/sentry/arch",
         "//pkg/sentry/context",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 08a9cb8ef..1f2a5122a 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -19,6 +19,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/vfs"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -582,3 +583,10 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	inode.decLinksLocked()
 	return nil
 }
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index 59237c3b9..e3e554b88 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -17,6 +17,7 @@ go_library(
         "mount.go",
         "mount_unsafe.go",
         "options.go",
+        "pathname.go",
         "permissions.go",
         "resolving_path.go",
         "testutil.go",
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index dfbd2372a..8011eba3f 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -18,6 +18,7 @@ import (
 	"sync/atomic"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 )
 
@@ -185,5 +186,56 @@ type FilesystemImpl interface {
 	// UnlinkAt removes the non-directory file at rp.
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
-	// TODO: d_path(); extended attributes; inotify_add_watch(); bind()
+	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
+	//
+	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
+	// before vd.Mount().Root(), PrependPath should stop prepending path
+	// components and return a PrependPathAtVFSRootError.
+	//
+	// If traversal of vd.Dentry()'s ancestors encounters an independent
+	// ("root") Dentry that is not vd.Mount().Root() (i.e. vd.Dentry() is not a
+	// descendant of vd.Mount().Root()), PrependPath should stop prepending
+	// path components and return a PrependPathAtNonMountRootError.
+	//
+	// Filesystems for which Dentries do not have meaningful paths may prepend
+	// an arbitrary descriptive string to b and then return a
+	// PrependPathSyntheticError.
+	//
+	// Most implementations can acquire the appropriate locks to ensure that
+	// Dentry.Name() and Dentry.Parent() are fixed for vd.Dentry() and all of
+	// its ancestors, then call GenericPrependPath.
+	//
+	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
+	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
+
+	// TODO: extended attributes; inotify_add_watch(); bind()
+}
+
+// PrependPathAtVFSRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter the contextual VFS root.
+type PrependPathAtVFSRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtVFSRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached VFS root"
+}
+
+// PrependPathAtNonMountRootError is returned by implementations of
+// FilesystemImpl.PrependPath() when they encounter an independent ancestor
+// Dentry that is not the Mount root.
+type PrependPathAtNonMountRootError struct{}
+
+// Error implements error.Error.
+func (PrependPathAtNonMountRootError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() reached root other than Mount root"
+}
+
+// PrependPathSyntheticError is returned by implementations of
+// FilesystemImpl.PrependPath() for which prepended names do not represent real
+// paths.
+type PrependPathSyntheticError struct{}
+
+// Error implements error.Error.
+func (PrependPathSyntheticError) Error() string {
+	return "vfs.FilesystemImpl.PrependPath() prepended synthetic name"
 }
diff --git a/pkg/sentry/vfs/filesystem_impl_util.go b/pkg/sentry/vfs/filesystem_impl_util.go
index 465e610e0..7315a588e 100644
--- a/pkg/sentry/vfs/filesystem_impl_util.go
+++ b/pkg/sentry/vfs/filesystem_impl_util.go
@@ -16,6 +16,8 @@ package vfs
 
 import (
 	"strings"
+
+	"gvisor.dev/gvisor/pkg/fspath"
 )
 
 // GenericParseMountOptions parses a comma-separated list of options of the
@@ -41,3 +43,27 @@ func GenericParseMountOptions(str string) map[string]string {
 	}
 	return m
 }
+
+// GenericPrependPath may be used by implementations of
+// FilesystemImpl.PrependPath() for which a single statically-determined lock
+// or set of locks is sufficient to ensure its preconditions (as opposed to
+// e.g. per-Dentry locks).
+//
+// Preconditions: Dentry.Name() and Dentry.Parent() must be held constant for
+// vd.Dentry() and all of its ancestors.
+func GenericPrependPath(vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	mnt, d := vd.mount, vd.dentry
+	for {
+		if mnt == vfsroot.mount && d == vfsroot.dentry {
+			return PrependPathAtVFSRootError{}
+		}
+		if d == mnt.root {
+			return nil
+		}
+		if d.parent == nil {
+			return PrependPathAtNonMountRootError{}
+		}
+		b.PrependComponent(d.name)
+		d = d.parent
+	}
+}
diff --git a/pkg/sentry/vfs/pathname.go b/pkg/sentry/vfs/pathname.go
new file mode 100644
index 000000000..8e155654f
--- /dev/null
+++ b/pkg/sentry/vfs/pathname.go
@@ -0,0 +1,153 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"sync"
+
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+var fspathBuilderPool = sync.Pool{
+	New: func() interface{} {
+		return &fspath.Builder{}
+	},
+}
+
+func getFSPathBuilder() *fspath.Builder {
+	return fspathBuilderPool.Get().(*fspath.Builder)
+}
+
+func putFSPathBuilder(b *fspath.Builder) {
+	// No methods can be called on b after b.String(), so reset it to its zero
+	// value (as returned by fspathBuilderPool.New) instead.
+	*b = fspath.Builder{}
+	fspathBuilderPool.Put(b)
+}
+
+// PathnameWithDeleted returns an absolute pathname to vd, consistent with
+// Linux's d_path(). In particular, if vd.Dentry() has been disowned,
+// PathnameWithDeleted appends " (deleted)" to the returned pathname.
+func (vfs *VirtualFilesystem) PathnameWithDeleted(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+
+	origD := vd.dentry
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				// GenericPrependPath() will have returned
+				// PrependPathAtVFSRootError in this case since it checks
+				// against vfsroot before mnt.root, but other implementations
+				// of FilesystemImpl.PrependPath() may return nil instead.
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				break loop
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+			// continue loop
+		case PrependPathSyntheticError:
+			// Skip prepending "/" and appending " (deleted)".
+			return b.String(), nil
+		case PrependPathAtVFSRootError, PrependPathAtNonMountRootError:
+			break loop
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	if origD.IsDisowned() {
+		b.AppendString(" (deleted)")
+	}
+	return b.String(), nil
+}
+
+// PathnameForGetcwd returns an absolute pathname to vd, consistent with
+// Linux's sys_getcwd().
+func (vfs *VirtualFilesystem) PathnameForGetcwd(ctx context.Context, vfsroot, vd VirtualDentry) (string, error) {
+	if vd.dentry.IsDisowned() {
+		return "", syserror.ENOENT
+	}
+
+	b := getFSPathBuilder()
+	defer putFSPathBuilder(b)
+	haveRef := false
+	defer func() {
+		if haveRef {
+			vd.DecRef()
+		}
+	}()
+	unreachable := false
+loop:
+	for {
+		err := vd.mount.fs.impl.PrependPath(ctx, vfsroot, vd, b)
+		switch err.(type) {
+		case nil:
+			if vd.mount == vfsroot.mount && vd.mount.root == vfsroot.dentry {
+				break loop
+			}
+			nextVD := vfs.getMountpointAt(vd.mount, vfsroot)
+			if !nextVD.Ok() {
+				unreachable = true
+				break loop
+			}
+			if haveRef {
+				vd.DecRef()
+			}
+			vd = nextVD
+			haveRef = true
+		case PrependPathAtVFSRootError:
+			break loop
+		case PrependPathAtNonMountRootError, PrependPathSyntheticError:
+			unreachable = true
+			break loop
+		default:
+			return "", err
+		}
+	}
+	b.PrependByte('/')
+	if unreachable {
+		b.PrependString("(unreachable)")
+	}
+	return b.String(), nil
+}
+
+// As of this writing, we do not have equivalents to:
+//
+// - d_absolute_path(), which returns EINVAL if (effectively) any call to
+// FilesystemImpl.PrependPath() would return PrependPathAtNonMountRootError.
+//
+// - dentry_path(), which does not walk up mounts (and only returns the path
+// relative to Filesystem root), but also appends "//deleted" for disowned
+// Dentries.
+//
+// These should be added as necessary.
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 593144cb7..7a1d9e383 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -15,7 +15,10 @@
 package vfs
 
 import (
+	"fmt"
+
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -114,6 +117,12 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err
 	return syserror.EPERM
 }
 
+// PrependPath implements FilesystemImpl.PrependPath.
+func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
+	b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
+	return PrependPathSyntheticError{}
+}
+
 type fdTestDentry struct {
 	vfsd Dentry
 }
-- 
cgit v1.2.3


From 744401297a8c93ce5992ba99aa84f3dcdc19ae9e Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Wed, 18 Dec 2019 15:47:24 -0800
Subject: Add VFS2 plumbing for extended attributes.

PiperOrigin-RevId: 286281274
---
 pkg/sentry/fsimpl/ext/filesystem.go          | 36 ++++++++++++
 pkg/sentry/fsimpl/kernfs/filesystem.go       | 52 +++++++++++++++++
 pkg/sentry/fsimpl/memfs/filesystem.go        | 48 +++++++++++++++
 pkg/sentry/vfs/file_description.go           | 49 +++++++++++++++-
 pkg/sentry/vfs/file_description_impl_util.go | 25 ++++++++
 pkg/sentry/vfs/filesystem.go                 | 16 ++++-
 pkg/sentry/vfs/options.go                    | 14 +++++
 pkg/sentry/vfs/testutil.go                   | 20 +++++++
 pkg/sentry/vfs/vfs.go                        | 87 ++++++++++++++++++++++++++++
 9 files changed, 345 insertions(+), 2 deletions(-)

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index e7aa3b41b..d7e87979a 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -443,6 +443,42 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return syserror.EROFS
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return nil, err
+	}
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return "", err
+	}
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	_, _, err := fs.walk(rp, false)
+	if err != nil {
+		return err
+	}
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index db486b6c1..3cbbe4b20 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -683,6 +683,58 @@ func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *Filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return nil, err
+	}
+	// kernfs currently does not support extended attributes.
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *Filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return "", err
+	}
+	// kernfs currently does not support extended attributes.
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *Filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *Filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	_, _, err := fs.walkExistingLocked(ctx, rp)
+	fs.mu.RUnlock()
+	fs.processDeferredDecRefs()
+	if err != nil {
+		return err
+	}
+	// kernfs currently does not support extended attributes.
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 1f2a5122a..22f1e811f 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -584,6 +584,54 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 	return nil
 }
 
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	// TODO(b/127675828): support extended attributes
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	// TODO(b/127675828): support extended attributes
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, _, err := walkExistingLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
 // PrependPath implements vfs.FilesystemImpl.PrependPath.
 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
 	fs.mu.RLock()
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 6575afd16..c5a9adca3 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -22,6 +22,7 @@ import (
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/syserror"
 	"gvisor.dev/gvisor/pkg/waiter"
 )
 
@@ -212,7 +213,21 @@ type FileDescriptionImpl interface {
 	// Ioctl implements the ioctl(2) syscall.
 	Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
 
-	// TODO: extended attributes; file locking
+	// Listxattr returns all extended attribute names for the file.
+	Listxattr(ctx context.Context) ([]string, error)
+
+	// Getxattr returns the value associated with the given extended attribute
+	// for the file.
+	Getxattr(ctx context.Context, name string) (string, error)
+
+	// Setxattr changes the value associated with the given extended attribute
+	// for the file.
+	Setxattr(ctx context.Context, opts SetxattrOptions) error
+
+	// Removexattr removes the given extended attribute from the file.
+	Removexattr(ctx context.Context, name string) error
+
+	// TODO: file locking
 }
 
 // Dirent holds the information contained in struct linux_dirent64.
@@ -329,6 +344,38 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 	return fd.impl.Ioctl(ctx, uio, args)
 }
 
+// Listxattr returns all extended attribute names for the file represented by
+// fd.
+func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+	names, err := fd.impl.Listxattr(ctx)
+	if err == syserror.ENOTSUP {
+		// Linux doesn't actually return ENOTSUP in this case; instead,
+		// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+		// subsystem to return security extended attributes, which by default
+		// don't exist.
+		return nil, nil
+	}
+	return names, err
+}
+
+// Getxattr returns the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+	return fd.impl.Getxattr(ctx, name)
+}
+
+// Setxattr changes the value associated with the given extended attribute for
+// the file represented by fd.
+func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	return fd.impl.Setxattr(ctx, opts)
+}
+
+// Removexattr removes the given extended attribute from the file represented
+// by fd.
+func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+	return fd.impl.Removexattr(ctx, name)
+}
+
 // SyncFS instructs the filesystem containing fd to execute the semantics of
 // syncfs(2).
 func (fd *FileDescription) SyncFS(ctx context.Context) error {
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index aae023254..3df49991c 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -127,6 +127,31 @@ func (FileDescriptionDefaultImpl) Ioctl(ctx context.Context, uio usermem.IO, arg
 	return 0, syserror.ENOTTY
 }
 
+// Listxattr implements FileDescriptionImpl.Listxattr analogously to
+// inode_operations::listxattr == NULL in Linux.
+func (FileDescriptionDefaultImpl) Listxattr(ctx context.Context) ([]string, error) {
+	// This isn't exactly accurate; see FileDescription.Listxattr.
+	return nil, syserror.ENOTSUP
+}
+
+// Getxattr implements FileDescriptionImpl.Getxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Getxattr(ctx context.Context, name string) (string, error) {
+	return "", syserror.ENOTSUP
+}
+
+// Setxattr implements FileDescriptionImpl.Setxattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	return syserror.ENOTSUP
+}
+
+// Removexattr implements FileDescriptionImpl.Removexattr analogously to
+// inode::i_opflags & IOP_XATTR == 0 in Linux.
+func (FileDescriptionDefaultImpl) Removexattr(ctx context.Context, name string) error {
+	return syserror.ENOTSUP
+}
+
 // DirectoryFileDescriptionDefaultImpl may be embedded by implementations of
 // FileDescriptionImpl that always represent directories to obtain
 // implementations of non-directory I/O methods that return EISDIR.
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 8011eba3f..b766614e7 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -186,6 +186,20 @@ type FilesystemImpl interface {
 	// UnlinkAt removes the non-directory file at rp.
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
+	// ListxattrAt returns all extended attribute names for the file at rp.
+	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
+
+	// GetxattrAt returns the value associated with the given extended
+	// attribute for the file at rp.
+	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
+
+	// SetxattrAt changes the value associated with the given extended
+	// attribute for the file at rp.
+	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
+
+	// RemovexattrAt removes the given extended attribute from the file at rp.
+	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
+
 	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
 	//
 	// If vfsroot.Ok(), it is the contextual VFS root; if it is encountered
@@ -208,7 +222,7 @@ type FilesystemImpl interface {
 	// Preconditions: vd.Mount().Filesystem().Impl() == this FilesystemImpl.
 	PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error
 
-	// TODO: extended attributes; inotify_add_watch(); bind()
+	// TODO: inotify_add_watch(); bind()
 }
 
 // PrependPathAtVFSRootError is returned by implementations of
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 3ecbc8fc1..97ee4a446 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -101,6 +101,20 @@ type SetStatOptions struct {
 	Stat linux.Statx
 }
 
+// SetxattrOptions contains options to VirtualFilesystem.SetxattrAt(),
+// FilesystemImpl.SetxattrAt(), FileDescription.Setxattr(), and
+// FileDescriptionImpl.Setxattr().
+type SetxattrOptions struct {
+	// Name is the name of the extended attribute being mutated.
+	Name string
+
+	// Value is the extended attribute's new value.
+	Value string
+
+	// Flags contains flags as specified for setxattr/lsetxattr/fsetxattr(2).
+	Flags uint32
+}
+
 // StatOptions contains options to VirtualFilesystem.StatAt(),
 // FilesystemImpl.StatAt(), FileDescription.Stat(), and
 // FileDescriptionImpl.Stat().
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index 7a1d9e383..d94117bce 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -117,6 +117,26 @@ func (fs *FDTestFilesystem) UnlinkAt(ctx context.Context, rp *ResolvingPath) err
 	return syserror.EPERM
 }
 
+// ListxattrAt implements FilesystemImpl.ListxattrAt.
+func (fs *FDTestFilesystem) ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error) {
+	return nil, syserror.EPERM
+}
+
+// GetxattrAt implements FilesystemImpl.GetxattrAt.
+func (fs *FDTestFilesystem) GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error) {
+	return "", syserror.EPERM
+}
+
+// SetxattrAt implements FilesystemImpl.SetxattrAt.
+func (fs *FDTestFilesystem) SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error {
+	return syserror.EPERM
+}
+
+// RemovexattrAt implements FilesystemImpl.RemovexattrAt.
+func (fs *FDTestFilesystem) RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error {
+	return syserror.EPERM
+}
+
 // PrependPath implements FilesystemImpl.PrependPath.
 func (fs *FDTestFilesystem) PrependPath(ctx context.Context, vfsroot, vd VirtualDentry, b *fspath.Builder) error {
 	b.PrependComponent(fmt.Sprintf("vfs.fdTestDentry:%p", vd.dentry.impl.(*fdTestDentry)))
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 7262b0d0a..e60898d7c 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -440,6 +440,93 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 	}
 }
 
+// ListxattrAt returns all extended attribute names for the file at the given
+// path.
+func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return nil, err
+	}
+	for {
+		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return names, nil
+		}
+		if err == syserror.ENOTSUP {
+			// Linux doesn't actually return ENOTSUP in this case; instead,
+			// fs/xattr.c:vfs_listxattr() falls back to allowing the security
+			// subsystem to return security extended attributes, which by
+			// default don't exist.
+			vfs.putResolvingPath(rp)
+			return nil, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return nil, err
+		}
+	}
+}
+
+// GetxattrAt returns the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return "", err
+	}
+	for {
+		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return val, nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return "", err
+		}
+	}
+}
+
+// SetxattrAt changes the value associated with the given extended attribute
+// for the file at the given path.
+func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
+// RemovexattrAt removes the given extended attribute from the file at the given path.
+func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
+	rp, err := vfs.getResolvingPath(creds, pop)
+	if err != nil {
+		return err
+	}
+	for {
+		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		if err == nil {
+			vfs.putResolvingPath(rp)
+			return nil
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return err
+		}
+	}
+}
+
 // SyncAllFilesystems has the semantics of Linux's sync(2).
 func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error {
 	fss := make(map[*Filesystem]struct{})
-- 
cgit v1.2.3


From 3eb489ed6c67b069bc135ab92cb031ce80b40d8f Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 20 Dec 2019 11:52:24 -0800
Subject: Move VFS2 file description status flags to vfs.FileDescription.

PiperOrigin-RevId: 286616668
---
 pkg/sentry/fsimpl/ext/file_description.go         |  19 ---
 pkg/sentry/fsimpl/ext/inode.go                    |   9 +-
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go    |  16 +--
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go          |  16 +--
 pkg/sentry/fsimpl/memfs/filesystem.go             |  11 +-
 pkg/sentry/fsimpl/memfs/memfs.go                  |  14 ---
 pkg/sentry/fsimpl/memfs/named_pipe.go             |   2 +-
 pkg/sentry/vfs/file_description.go                | 141 +++++++++++++++-------
 pkg/sentry/vfs/file_description_impl_util_test.go |   2 +-
 9 files changed, 107 insertions(+), 123 deletions(-)

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/ext/file_description.go b/pkg/sentry/fsimpl/ext/file_description.go
index 5eca2b83f..841274daf 100644
--- a/pkg/sentry/fsimpl/ext/file_description.go
+++ b/pkg/sentry/fsimpl/ext/file_description.go
@@ -26,13 +26,6 @@ import (
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
-
-	// flags is the same as vfs.OpenOptions.Flags which are passed to
-	// vfs.FilesystemImpl.OpenAt.
-	// TODO(b/134676337): syscalls like read(2), write(2), fchmod(2), fchown(2),
-	// fgetxattr(2), ioctl(2), mmap(2) should fail with EBADF if O_PATH is set.
-	// Only close(2), fstat(2), fstatfs(2) should work.
-	flags uint32
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -43,18 +36,6 @@ func (fd *fileDescription) inode() *inode {
 	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index 24249525c..b2cc826c7 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -157,10 +157,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.flags = flags
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *directory:
 		// Can't open directories writably. This check is not necessary for a read
@@ -169,10 +168,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		fd.flags = flags
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
 		if flags&linux.O_PATH == 0 {
@@ -180,10 +178,9 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		fd.flags = flags
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
 		panic(fmt.Sprintf("unknown inode type: %T", in.impl))
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index 30c06baf0..51102ce48 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -65,17 +65,15 @@ type DynamicBytesFD struct {
 
 	vfsfd vfs.FileDescription
 	inode Inode
-	flags uint32
 }
 
 // Init initializes a DynamicBytesFD.
 func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
 	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	fd.flags = flags
 	fd.inode = d.Impl().(*Dentry).inode
 	fd.SetDataSource(data)
-	fd.vfsfd.Init(fd, m, d)
+	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
 
 // Seek implements vfs.FileDescriptionImpl.Seek.
@@ -117,15 +115,3 @@ func (fd *DynamicBytesFD) SetStat(context.Context, vfs.SetStatOptions) error {
 	// DynamicBytesFiles are immutable.
 	return syserror.EPERM
 }
-
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *DynamicBytesFD) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *DynamicBytesFD) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index d6c18937a..bd402330f 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -39,7 +39,6 @@ type GenericDirectoryFD struct {
 
 	vfsfd    vfs.FileDescription
 	children *OrderedChildren
-	flags    uint32
 	off      int64
 }
 
@@ -48,8 +47,7 @@ func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *Ordere
 	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	fd.children = children
-	fd.flags = flags
-	fd.vfsfd.Init(fd, m, d)
+	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
 
 // VFSFileDescription returns a pointer to the vfs.FileDescription representing
@@ -180,18 +178,6 @@ func (fd *GenericDirectoryFD) Seek(ctx context.Context, offset int64, whence int
 	return offset, nil
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *GenericDirectoryFD) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *GenericDirectoryFD) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *GenericDirectoryFD) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	fs := fd.filesystem()
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 22f1e811f..af4389459 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -282,9 +282,8 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	// Filter out flags that are not supported by memfs. O_DIRECTORY and
 	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
-	// appropriate bits in rp), but are returned by
-	// FileDescriptionImpl.StatusFlags(). O_NONBLOCK is supported only by
-	// pipes.
+	// appropriate bits in rp), but are visible in FD status flags. O_NONBLOCK
+	// is supported only by pipes.
 	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
 
 	if opts.Flags&linux.O_CREAT == 0 {
@@ -384,7 +383,6 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 	switch impl := i.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		fd.flags = flags
 		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
 		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
 		if fd.writable {
@@ -395,7 +393,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		}
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data = impl.data[:0]
@@ -411,8 +409,7 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		var fd directoryFD
 		mnt.IncRef()
 		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, mnt, vfsd)
-		fd.flags = flags
+		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
 		// Can't open symlinks without O_PATH (which is unimplemented).
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index 4cb2a4e0f..9d509f6e4 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -261,8 +261,6 @@ func (i *inode) direntType() uint8 {
 type fileDescription struct {
 	vfsfd vfs.FileDescription
 	vfs.FileDescriptionDefaultImpl
-
-	flags uint32 // status flags; immutable
 }
 
 func (fd *fileDescription) filesystem() *filesystem {
@@ -273,18 +271,6 @@ func (fd *fileDescription) inode() *inode {
 	return fd.vfsfd.Dentry().Impl().(*dentry).inode
 }
 
-// StatusFlags implements vfs.FileDescriptionImpl.StatusFlags.
-func (fd *fileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	return fd.flags, nil
-}
-
-// SetStatusFlags implements vfs.FileDescriptionImpl.SetStatusFlags.
-func (fd *fileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	// None of the flags settable by fcntl(F_SETFL) are supported, so this is a
-	// no-op.
-	return nil
-}
-
 // Stat implements vfs.FileDescriptionImpl.Stat.
 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
 	var stat linux.Statx
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
index 91cb4b1fc..d5060850e 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/memfs/named_pipe.go
@@ -57,6 +57,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
 	mnt := rp.Mount()
 	mnt.IncRef()
 	vfsd.IncRef()
-	fd.vfsfd.Init(&fd, mnt, vfsd)
+	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index c5a9adca3..df03886c3 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -20,6 +20,7 @@ import (
 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/sentry/arch"
 	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/sentry/memmap"
 	"gvisor.dev/gvisor/pkg/sentry/usermem"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -39,49 +40,43 @@ type FileDescription struct {
 	// operations.
 	refs int64
 
+	// statusFlags contains status flags, "initialized by open(2) and possibly
+	// modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
+	// memory operations.
+	statusFlags uint32
+
 	// vd is the filesystem location at which this FileDescription was opened.
 	// A reference is held on vd. vd is immutable.
 	vd VirtualDentry
 
+	opts FileDescriptionOptions
+
 	// impl is the FileDescriptionImpl associated with this Filesystem. impl is
 	// immutable. This should be the last field in FileDescription.
 	impl FileDescriptionImpl
 }
 
+// FileDescriptionOptions contains options to FileDescription.Init().
+type FileDescriptionOptions struct {
+	// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
+	// usually only the case if O_DIRECT would actually have an effect.
+	AllowDirectIO bool
+}
+
 // Init must be called before first use of fd. It takes ownership of references
-// on mnt and d held by the caller.
-func (fd *FileDescription) Init(impl FileDescriptionImpl, mnt *Mount, d *Dentry) {
+// on mnt and d held by the caller. statusFlags is the initial file description
+// status flags, which is usually the full set of flags passed to open(2).
+func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
 	fd.refs = 1
+	fd.statusFlags = statusFlags | linux.O_LARGEFILE
 	fd.vd = VirtualDentry{
 		mount:  mnt,
 		dentry: d,
 	}
+	fd.opts = *opts
 	fd.impl = impl
 }
 
-// Impl returns the FileDescriptionImpl associated with fd.
-func (fd *FileDescription) Impl() FileDescriptionImpl {
-	return fd.impl
-}
-
-// Mount returns the mount on which fd was opened. It does not take a reference
-// on the returned Mount.
-func (fd *FileDescription) Mount() *Mount {
-	return fd.vd.mount
-}
-
-// Dentry returns the dentry at which fd was opened. It does not take a
-// reference on the returned Dentry.
-func (fd *FileDescription) Dentry() *Dentry {
-	return fd.vd.dentry
-}
-
-// VirtualDentry returns the location at which fd was opened. It does not take
-// a reference on the returned VirtualDentry.
-func (fd *FileDescription) VirtualDentry() VirtualDentry {
-	return fd.vd
-}
-
 // IncRef increments fd's reference count.
 func (fd *FileDescription) IncRef() {
 	atomic.AddInt64(&fd.refs, 1)
@@ -113,6 +108,82 @@ func (fd *FileDescription) DecRef() {
 	}
 }
 
+// Mount returns the mount on which fd was opened. It does not take a reference
+// on the returned Mount.
+func (fd *FileDescription) Mount() *Mount {
+	return fd.vd.mount
+}
+
+// Dentry returns the dentry at which fd was opened. It does not take a
+// reference on the returned Dentry.
+func (fd *FileDescription) Dentry() *Dentry {
+	return fd.vd.dentry
+}
+
+// VirtualDentry returns the location at which fd was opened. It does not take
+// a reference on the returned VirtualDentry.
+func (fd *FileDescription) VirtualDentry() VirtualDentry {
+	return fd.vd
+}
+
+// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
+func (fd *FileDescription) StatusFlags() uint32 {
+	return atomic.LoadUint32(&fd.statusFlags)
+}
+
+// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
+func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
+	// Compare Linux's fs/fcntl.c:setfl().
+	oldFlags := fd.StatusFlags()
+	// Linux documents this check as "O_APPEND cannot be cleared if the file is
+	// marked as append-only and the file is open for write", which would make
+	// sense. However, the check as actually implemented seems to be "O_APPEND
+	// cannot be changed if the file is marked as append-only".
+	if (flags^oldFlags)&linux.O_APPEND != 0 {
+		stat, err := fd.impl.Stat(ctx, StatOptions{
+			// There is no mask bit for stx_attributes.
+			Mask: 0,
+			// Linux just reads inode::i_flags directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
+			return syserror.EPERM
+		}
+	}
+	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
+		stat, err := fd.impl.Stat(ctx, StatOptions{
+			Mask: linux.STATX_UID,
+			// Linux's inode_owner_or_capable() just reads inode::i_uid
+			// directly.
+			Sync: linux.AT_STATX_DONT_SYNC,
+		})
+		if err != nil {
+			return err
+		}
+		if stat.Mask&linux.STATX_UID == 0 {
+			return syserror.EPERM
+		}
+		if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
+			return syserror.EPERM
+		}
+	}
+	if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
+		return syserror.EINVAL
+	}
+	// TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
+	const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
+	atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
+	return nil
+}
+
+// Impl returns the FileDescriptionImpl associated with fd.
+func (fd *FileDescription) Impl() FileDescriptionImpl {
+	return fd.impl
+}
+
 // FileDescriptionImpl contains implementation details for an FileDescription.
 // Implementations of FileDescriptionImpl should contain their associated
 // FileDescription by value as their first field.
@@ -132,14 +203,6 @@ type FileDescriptionImpl interface {
 	// prevent the file descriptor from being closed.
 	OnClose(ctx context.Context) error
 
-	// StatusFlags returns file description status flags, as for
-	// fcntl(F_GETFL).
-	StatusFlags(ctx context.Context) (uint32, error)
-
-	// SetStatusFlags sets file description status flags, as for
-	// fcntl(F_SETFL).
-	SetStatusFlags(ctx context.Context, flags uint32) error
-
 	// Stat returns metadata for the file represented by the FileDescription.
 	Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
 
@@ -264,18 +327,6 @@ func (fd *FileDescription) OnClose(ctx context.Context) error {
 	return fd.impl.OnClose(ctx)
 }
 
-// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
-func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
-	flags, err := fd.impl.StatusFlags(ctx)
-	flags |= linux.O_LARGEFILE
-	return flags, err
-}
-
-// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
-func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
-	return fd.impl.SetStatusFlags(ctx, flags)
-}
-
 // Stat returns metadata for the file represented by fd.
 func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
 	return fd.impl.Stat(ctx, opts)
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index ac7799296..678be07fe 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -48,7 +48,7 @@ type genCountFD struct {
 
 func newGenCountFD(mnt *Mount, vfsd *Dentry) *FileDescription {
 	var fd genCountFD
-	fd.vfsfd.Init(&fd, mnt, vfsd)
+	fd.vfsfd.Init(&fd, 0 /* statusFlags */, mnt, vfsd, &FileDescriptionOptions{})
 	fd.DynamicBytesFileDescriptionImpl.SetDataSource(&fd)
 	return &fd.vfsfd
 }
-- 
cgit v1.2.3


From f45df7505b0e7baf48a37f7c625f05051d144738 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 23 Dec 2019 13:17:29 -0800
Subject: Clean up vfs.FilesystemImpl methods that operate on parent
 directories.

- Make FilesystemImpl methods that operate on parent directories require
  !rp.Done() (i.e. there is at least one path component to resolve) as
  precondition and postcondition (in cases where they do not finish path
  resolution due to mount boundary / absolute symlink), and require that they
  do not need to follow the last path component (the file being created /
  deleted) as a symlink. Check for these in VFS.

- Add FilesystemImpl.GetParentDentryAt(), which is required to obtain the old
  parent directory for VFS.RenameAt(). (Passing the Dentry to be renamed
  instead has the wrong semantics if the file named by the old path is a mount
  point since the Dentry will be on the wrong Mount.)

- Update memfs to implement these methods correctly (?), including RenameAt.

- Change fspath.Parse() to allow empty paths (to simplify implementation of
  AT_EMPTY_PATH).

- Change vfs.PathOperation to take a fspath.Path instead of a raw pathname;
  non-test callers will need to fspath.Parse() pathnames themselves anyway in
  order to detect absolute paths and select PathOperation.Start accordingly.

PiperOrigin-RevId: 286934941
---
 pkg/fspath/BUILD                                  |   2 -
 pkg/fspath/fspath.go                              |  24 +-
 pkg/fspath/fspath_test.go                         |  25 +-
 pkg/sentry/fsimpl/ext/BUILD                       |   1 +
 pkg/sentry/fsimpl/ext/benchmark/BUILD             |   1 +
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  11 +-
 pkg/sentry/fsimpl/ext/ext_test.go                 |   9 +-
 pkg/sentry/fsimpl/ext/filesystem.go               |  12 +-
 pkg/sentry/fsimpl/kernfs/BUILD                    |   1 +
 pkg/sentry/fsimpl/kernfs/filesystem.go            | 138 +++--
 pkg/sentry/fsimpl/kernfs/kernfs_test.go           |   7 +-
 pkg/sentry/fsimpl/memfs/BUILD                     |   2 +
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |  27 +-
 pkg/sentry/fsimpl/memfs/filesystem.go             | 667 ++++++++++++----------
 pkg/sentry/fsimpl/memfs/memfs.go                  |  29 +-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |  18 +-
 pkg/sentry/vfs/dentry.go                          |  29 +-
 pkg/sentry/vfs/file_description.go                |  19 +
 pkg/sentry/vfs/filesystem.go                      | 251 +++++++-
 pkg/sentry/vfs/options.go                         |   3 +
 pkg/sentry/vfs/resolving_path.go                  |  46 +-
 pkg/sentry/vfs/testutil.go                        |   7 +-
 pkg/sentry/vfs/vfs.go                             | 259 ++++++---
 pkg/syserror/syserror.go                          |   1 +
 24 files changed, 1051 insertions(+), 538 deletions(-)

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/fspath/BUILD b/pkg/fspath/BUILD
index 0c5f50397..ca540363c 100644
--- a/pkg/fspath/BUILD
+++ b/pkg/fspath/BUILD
@@ -14,7 +14,6 @@ go_library(
         "fspath.go",
     ],
     importpath = "gvisor.dev/gvisor/pkg/fspath",
-    deps = ["//pkg/syserror"],
 )
 
 go_test(
@@ -25,5 +24,4 @@ go_test(
         "fspath_test.go",
     ],
     embed = [":fspath"],
-    deps = ["//pkg/syserror"],
 )
diff --git a/pkg/fspath/fspath.go b/pkg/fspath/fspath.go
index f68752560..9fb3fee24 100644
--- a/pkg/fspath/fspath.go
+++ b/pkg/fspath/fspath.go
@@ -18,19 +18,17 @@ package fspath
 
 import (
 	"strings"
-
-	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 const pathSep = '/'
 
-// Parse parses a pathname as described by path_resolution(7).
-func Parse(pathname string) (Path, error) {
+// Parse parses a pathname as described by path_resolution(7), except that
+// empty pathnames will be parsed successfully to a Path for which
+// Path.Absolute == Path.Dir == Path.HasComponents() == false. (This is
+// necessary to support AT_EMPTY_PATH.)
+func Parse(pathname string) Path {
 	if len(pathname) == 0 {
-		// "... POSIX decrees that an empty pathname must not be resolved
-		// successfully. Linux returns ENOENT in this case." -
-		// path_resolution(7)
-		return Path{}, syserror.ENOENT
+		return Path{}
 	}
 	// Skip leading path separators.
 	i := 0
@@ -41,7 +39,7 @@ func Parse(pathname string) (Path, error) {
 			return Path{
 				Absolute: true,
 				Dir:      true,
-			}, nil
+			}
 		}
 	}
 	// Skip trailing path separators. This is required by Iterator.Next. This
@@ -64,7 +62,7 @@ func Parse(pathname string) (Path, error) {
 		},
 		Absolute: i != 0,
 		Dir:      j != len(pathname)-1,
-	}, nil
+	}
 }
 
 // Path contains the information contained in a pathname string.
@@ -111,6 +109,12 @@ func (p Path) String() string {
 	return b.String()
 }
 
+// HasComponents returns true if p contains a non-zero number of path
+// components.
+func (p Path) HasComponents() bool {
+	return p.Begin.Ok()
+}
+
 // An Iterator represents either a path component in a Path or a terminal
 // iterator indicating that the end of the path has been reached.
 //
diff --git a/pkg/fspath/fspath_test.go b/pkg/fspath/fspath_test.go
index 215b35622..d5e9a549a 100644
--- a/pkg/fspath/fspath_test.go
+++ b/pkg/fspath/fspath_test.go
@@ -18,15 +18,10 @@ import (
 	"reflect"
 	"strings"
 	"testing"
-
-	"gvisor.dev/gvisor/pkg/syserror"
 )
 
 func TestParseIteratorPartialPathnames(t *testing.T) {
-	path, err := Parse("/foo//bar///baz////")
-	if err != nil {
-		t.Fatalf("Parse failed: %v", err)
-	}
+	path := Parse("/foo//bar///baz////")
 	// Parse strips leading slashes, and records their presence as
 	// Path.Absolute.
 	if !path.Absolute {
@@ -70,6 +65,12 @@ func TestParse(t *testing.T) {
 		dir      bool
 	}
 	tests := []testCase{
+		{
+			pathname: "",
+			relpath:  []string{},
+			abs:      false,
+			dir:      false,
+		},
 		{
 			pathname: "/",
 			relpath:  []string{},
@@ -113,10 +114,7 @@ func TestParse(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(test.pathname, func(t *testing.T) {
-			p, err := Parse(test.pathname)
-			if err != nil {
-				t.Fatalf("failed to parse pathname %q: %v", test.pathname, err)
-			}
+			p := Parse(test.pathname)
 			t.Logf("pathname %q => path %q", test.pathname, p)
 			if p.Absolute != test.abs {
 				t.Errorf("path absoluteness: got %v, wanted %v", p.Absolute, test.abs)
@@ -134,10 +132,3 @@ func TestParse(t *testing.T) {
 		})
 	}
 }
-
-func TestParseEmptyPathname(t *testing.T) {
-	p, err := Parse("")
-	if err != syserror.ENOENT {
-		t.Errorf("parsing empty pathname: got (%v, %v), wanted (<unspecified>, ENOENT)", p, err)
-	}
-}
diff --git a/pkg/sentry/fsimpl/ext/BUILD b/pkg/sentry/fsimpl/ext/BUILD
index 880b7bcd3..bc90330bc 100644
--- a/pkg/sentry/fsimpl/ext/BUILD
+++ b/pkg/sentry/fsimpl/ext/BUILD
@@ -74,6 +74,7 @@ go_test(
     deps = [
         "//pkg/abi/linux",
         "//pkg/binary",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fsimpl/ext/disklayout",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/BUILD b/pkg/sentry/fsimpl/ext/benchmark/BUILD
index bfc46dfa6..4fc8296ef 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/BUILD
+++ b/pkg/sentry/fsimpl/ext/benchmark/BUILD
@@ -7,6 +7,7 @@ go_test(
     size = "small",
     srcs = ["benchmark_test.go"],
     deps = [
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/fsimpl/ext",
diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 177ce2cb9..2f46d2d13 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -24,6 +24,7 @@ import (
 	"strings"
 	"testing"
 
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext"
@@ -121,7 +122,7 @@ func BenchmarkVFS2Ext4fsStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -150,9 +151,9 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 			creds := auth.CredentialsFromContext(ctx)
 			mountPointName := "/1/"
 			pop := vfs.PathOperation{
-				Root:     *root,
-				Start:    *root,
-				Pathname: mountPointName,
+				Root:  *root,
+				Start: *root,
+				Path:  fspath.Parse(mountPointName),
 			}
 
 			// Save the mount point for later use.
@@ -181,7 +182,7 @@ func BenchmarkVFS2ExtfsMountStat(b *testing.B) {
 				stat, err := vfsfs.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               *root,
 					Start:              *root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index e9f756732..5d6c999bd 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -25,6 +25,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/ext/disklayout"
@@ -140,7 +141,7 @@ func TestSeek(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -359,7 +360,7 @@ func TestStatAt(t *testing.T) {
 
 			got, err := vfsfs.StatAt(ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.StatOptions{},
 			)
 			if err != nil {
@@ -429,7 +430,7 @@ func TestRead(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.absPath},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.absPath)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
@@ -565,7 +566,7 @@ func TestIterDirents(t *testing.T) {
 			fd, err := vfsfs.OpenAt(
 				ctx,
 				auth.CredentialsFromContext(ctx),
-				&vfs.PathOperation{Root: *root, Start: *root, Pathname: test.path},
+				&vfs.PathOperation{Root: *root, Start: *root, Path: fspath.Parse(test.path)},
 				&vfs.OpenOptions{},
 			)
 			if err != nil {
diff --git a/pkg/sentry/fsimpl/ext/filesystem.go b/pkg/sentry/fsimpl/ext/filesystem.go
index d7e87979a..616fc002a 100644
--- a/pkg/sentry/fsimpl/ext/filesystem.go
+++ b/pkg/sentry/fsimpl/ext/filesystem.go
@@ -275,6 +275,16 @@ func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	return vfsd, nil
 }
 
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	vfsd, inode, err := fs.walk(rp, true)
+	if err != nil {
+		return nil, err
+	}
+	inode.incRef()
+	return vfsd, nil
+}
+
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
 	vfsd, inode, err := fs.walk(rp, false)
@@ -378,7 +388,7 @@ func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts v
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
 	if rp.Done() {
 		return syserror.ENOENT
 	}
diff --git a/pkg/sentry/fsimpl/kernfs/BUILD b/pkg/sentry/fsimpl/kernfs/BUILD
index 52596c090..59f7f39e2 100644
--- a/pkg/sentry/fsimpl/kernfs/BUILD
+++ b/pkg/sentry/fsimpl/kernfs/BUILD
@@ -49,6 +49,7 @@ go_test(
     deps = [
         ":kernfs",
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/kernfs/filesystem.go b/pkg/sentry/fsimpl/kernfs/filesystem.go
index 3cbbe4b20..a6f9fced5 100644
--- a/pkg/sentry/fsimpl/kernfs/filesystem.go
+++ b/pkg/sentry/fsimpl/kernfs/filesystem.go
@@ -44,39 +44,37 @@ func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingP
 		return nil, err
 	}
 afterSymlink:
+	name := rp.Component()
+	// Revalidation must be skipped if name is "." or ".."; d or its parent
+	// respectively can't be expected to transition from invalidated back to
+	// valid, so detecting invalidation and retrying would loop forever. This
+	// is consistent with Linux: fs/namei.c:walk_component() => lookup_fast()
+	// calls d_revalidate(), but walk_component() => handle_dots() does not.
+	if name == "." {
+		rp.Advance()
+		return vfsd, nil
+	}
+	if name == ".." {
+		nextVFSD, err := rp.ResolveParent(vfsd)
+		if err != nil {
+			return nil, err
+		}
+		rp.Advance()
+		return nextVFSD, nil
+	}
 	d.dirMu.Lock()
-	nextVFSD, err := rp.ResolveComponent(vfsd)
-	d.dirMu.Unlock()
+	nextVFSD, err := rp.ResolveChild(vfsd, name)
 	if err != nil {
+		d.dirMu.Unlock()
 		return nil, err
 	}
-	if nextVFSD != nil {
-		// Cached dentry exists, revalidate.
-		next := nextVFSD.Impl().(*Dentry)
-		if !next.inode.Valid(ctx) {
-			d.dirMu.Lock()
-			rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD)
-			d.dirMu.Unlock()
-			fs.deferDecRef(nextVFSD) // Reference from Lookup.
-			nextVFSD = nil
-		}
-	}
-	if nextVFSD == nil {
-		// Dentry isn't cached; it either doesn't exist or failed
-		// revalidation. Attempt to resolve it via Lookup.
-		name := rp.Component()
-		var err error
-		nextVFSD, err = d.inode.Lookup(ctx, name)
-		// Reference on nextVFSD dropped by a corresponding Valid.
-		if err != nil {
-			return nil, err
-		}
-		d.InsertChild(name, nextVFSD)
+	next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name, nextVFSD)
+	d.dirMu.Unlock()
+	if err != nil {
+		return nil, err
 	}
-	next := nextVFSD.Impl().(*Dentry)
-
 	// Resolve any symlink at current path component.
-	if rp.ShouldFollowSymlink() && d.isSymlink() {
+	if rp.ShouldFollowSymlink() && next.isSymlink() {
 		// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
 		target, err := next.inode.Readlink(ctx)
 		if err != nil {
@@ -89,7 +87,44 @@ afterSymlink:
 
 	}
 	rp.Advance()
-	return nextVFSD, nil
+	return &next.vfsd, nil
+}
+
+// revalidateChildLocked must be called after a call to parent.vfsd.Child(name)
+// or vfs.ResolvingPath.ResolveChild(name) returns childVFSD (which may be
+// nil) to verify that the returned child (or lack thereof) is correct.
+//
+// Preconditions: Filesystem.mu must be locked for at least reading.
+// parent.dirMu must be locked. parent.isDir(). name is not "." or "..".
+//
+// Postconditions: Caller must call fs.processDeferredDecRefs*.
+func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string, childVFSD *vfs.Dentry) (*Dentry, error) {
+	if childVFSD != nil {
+		// Cached dentry exists, revalidate.
+		child := childVFSD.Impl().(*Dentry)
+		if !child.inode.Valid(ctx) {
+			vfsObj.ForceDeleteDentry(childVFSD)
+			fs.deferDecRef(childVFSD) // Reference from Lookup.
+			childVFSD = nil
+		}
+	}
+	if childVFSD == nil {
+		// Dentry isn't cached; it either doesn't exist or failed
+		// revalidation. Attempt to resolve it via Lookup.
+		//
+		// FIXME(b/144498111): Inode.Lookup() should return *(kernfs.)Dentry,
+		// not *vfs.Dentry, since (kernfs.)Filesystem assumes that all dentries
+		// in the filesystem are (kernfs.)Dentry and performs vfs.DentryImpl
+		// casts accordingly.
+		var err error
+		childVFSD, err = parent.inode.Lookup(ctx, name)
+		if err != nil {
+			return nil, err
+		}
+		// Reference on childVFSD dropped by a corresponding Valid.
+		parent.InsertChild(name, childVFSD)
+	}
+	return childVFSD.Impl().(*Dentry), nil
 }
 
 // walkExistingLocked resolves rp to an existing file.
@@ -242,6 +277,19 @@ func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, op
 	return vfsd, nil
 }
 
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.processDeferredDecRefs()
+	defer fs.mu.RUnlock()
+	vfsd, _, err := fs.walkParentDirLocked(ctx, rp)
+	if err != nil {
+		return nil, err
+	}
+	vfsd.IncRef() // Ownership transferred to caller.
+	return vfsd, nil
+}
+
 // LinkAt implements vfs.FilesystemImpl.LinkAt.
 func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
 	if rp.Done() {
@@ -459,40 +507,42 @@ func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
-	exchange := opts.Flags&linux.RENAME_EXCHANGE != 0
-	whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0
-	if exchange && (noReplace || whiteout) {
-		// Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE.
-		return syserror.EINVAL
-	}
-	if exchange || whiteout {
-		// Exchange and Whiteout flags are not supported on kernfs.
+func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	// Only RENAME_NOREPLACE is supported.
+	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
 		return syserror.EINVAL
 	}
+	noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
 
 	fs.mu.Lock()
 	defer fs.mu.Lock()
 
+	// Resolve the destination directory first to verify that it's on this
+	// Mount.
+	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	fs.processDeferredDecRefsLocked()
+	if err != nil {
+		return err
+	}
 	mnt := rp.Mount()
-	if mnt != vd.Mount() {
+	if mnt != oldParentVD.Mount() {
 		return syserror.EXDEV
 	}
-
 	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
 	}
 	defer mnt.EndWrite()
 
-	dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
+	srcDirVFSD := oldParentVD.Dentry()
+	srcDir := srcDirVFSD.Impl().(*Dentry)
+	srcDir.dirMu.Lock()
+	src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName, srcDirVFSD.Child(oldName))
+	srcDir.dirMu.Unlock()
 	fs.processDeferredDecRefsLocked()
 	if err != nil {
 		return err
 	}
-
-	srcVFSD := vd.Dentry()
-	srcDirVFSD := srcVFSD.Parent()
+	srcVFSD := &src.vfsd
 
 	// Can we remove the src dentry?
 	if err := checkDeleteLocked(rp, srcVFSD); err != nil {
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index f78bb7b04..73b6e43b5 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -24,6 +24,7 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
@@ -82,9 +83,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
 // Precondition: path should be relative path.
 func (s *TestSystem) PathOpAtRoot(path string) vfs.PathOperation {
 	return vfs.PathOperation{
-		Root:     s.root,
-		Start:    s.root,
-		Pathname: path,
+		Root:  s.root,
+		Start: s.root,
+		Path:  fspath.Parse(path),
 	}
 }
 
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
index 0cc751eb8..5689bed3b 100644
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ b/pkg/sentry/fsimpl/memfs/BUILD
@@ -50,6 +50,7 @@ go_test(
     deps = [
         ":memfs",
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/refs",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
@@ -68,6 +69,7 @@ go_test(
     embed = [":memfs"],
     deps = [
         "//pkg/abi/linux",
+        "//pkg/fspath",
         "//pkg/sentry/context",
         "//pkg/sentry/context/contexttest",
         "//pkg/sentry/kernel/auth",
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index 4a7a94a52..6e987af88 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -21,6 +21,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/refs"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
@@ -193,9 +194,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -216,7 +217,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -237,7 +238,7 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
@@ -378,9 +379,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			root := mntns.Root()
 			defer root.DecRef()
 			pop := vfs.PathOperation{
-				Root:     root,
-				Start:    root,
-				Pathname: mountPointName,
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(mountPointName),
 			}
 			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 				Mode: 0755,
@@ -408,9 +409,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			for i := depth; i > 0; i-- {
 				name := fmt.Sprintf("%d", i)
 				pop := vfs.PathOperation{
-					Root:     root,
-					Start:    vd,
-					Pathname: name,
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
 				}
 				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
 					Mode: 0755,
@@ -438,7 +439,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
 				Root:               root,
 				Start:              vd,
-				Pathname:           filename,
+				Path:               fspath.Parse(filename),
 				FollowFinalSymlink: true,
 			}, &vfs.OpenOptions{
 				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
@@ -458,7 +459,7 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 					Root:               root,
 					Start:              root,
-					Pathname:           filePath,
+					Path:               fspath.Parse(filePath),
 					FollowFinalSymlink: true,
 				}, &vfs.StatOptions{})
 				if err != nil {
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index af4389459..4a83f310c 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -25,323 +25,283 @@ import (
 	"gvisor.dev/gvisor/pkg/syserror"
 )
 
-// stepLocked resolves rp.Component() in parent directory vfsd.
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory.
+	return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
 //
 // stepLocked is loosely analogous to fs/namei.c:walk_component().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done(). inode ==
-// vfsd.Impl().(*dentry).inode.
-func stepLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode) (*vfs.Dentry, *inode, error) {
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
 	}
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, nil, err
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		return nil, err
 	}
 afterSymlink:
-	nextVFSD, err := rp.ResolveComponent(vfsd)
+	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
 	if err != nil {
-		return nil, nil, err
+		return nil, err
 	}
 	if nextVFSD == nil {
 		// Since the Dentry tree is the sole source of truth for memfs, if it's
 		// not in the Dentry tree, it doesn't exist.
-		return nil, nil, syserror.ENOENT
+		return nil, syserror.ENOENT
 	}
-	nextInode := nextVFSD.Impl().(*dentry).inode
-	if symlink, ok := nextInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+	next := nextVFSD.Impl().(*dentry)
+	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
 		// TODO: symlink traversals update access time
 		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, nil, err
+			return nil, err
 		}
 		goto afterSymlink // don't check the current directory again
 	}
 	rp.Advance()
-	return nextVFSD, nextInode, nil
+	return next, nil
 }
 
-// walkExistingLocked resolves rp to an existing file.
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
 //
-// walkExistingLocked is loosely analogous to Linux's
-// fs/namei.c:path_lookupat().
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
 //
-// Preconditions: filesystem.mu must be locked.
-func walkExistingLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Done() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	for !rp.Final() {
+		next, err := stepLocked(rp, d)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
+		d = next
 	}
-	if rp.MustBeDir() && !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, inode, nil
+	return d, nil
 }
 
-// walkParentDirLocked resolves all but the last path component of rp to an
-// existing directory. It does not check that the returned directory is
-// searchable by the provider of rp.
+// resolveLocked resolves rp to an existing file.
 //
-// walkParentDirLocked is loosely analogous to Linux's
-// fs/namei.c:path_parentat().
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
 //
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath) (*vfs.Dentry, *inode, error) {
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() {
+		next, err := stepLocked(rp, d)
 		if err != nil {
-			return nil, nil, err
+			return nil, err
 		}
+		d = next
 	}
-	if !inode.isDir() {
-		return nil, nil, syserror.ENOTDIR
+	if rp.MustBeDir() && !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
 	}
-	return vfsd, inode, nil
+	return d, nil
 }
 
-// checkCreateLocked checks that a file named rp.Component() may be created in
-// directory parentVFSD, then returns rp.Component().
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
 //
-// Preconditions: filesystem.mu must be locked. parentInode ==
-// parentVFSD.Impl().(*dentry).inode. parentInode.isDir() == true.
-func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode *inode) (string, error) {
-	if err := parentInode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true); err != nil {
-		return "", err
-	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
-		return "", syserror.EEXIST
-	}
-	childVFSD, err := rp.ResolveChild(parentVFSD, pc)
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
-		return "", err
+		return err
 	}
-	if childVFSD != nil {
-		return "", syserror.EEXIST
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
 	}
-	if parentVFSD.IsDisowned() {
-		return "", syserror.ENOENT
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
 	}
-	return pc, nil
-}
-
-// checkDeleteLocked checks that the file represented by vfsd may be deleted.
-func checkDeleteLocked(vfsd *vfs.Dentry) error {
-	parentVFSD := vfsd.Parent()
-	if parentVFSD == nil {
-		return syserror.EBUSY
+	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the child exists we want to return EEXIST immediately instead
+	// of attempting symlink/mount traversal.
+	if parent.vfsd.Child(name) != nil {
+		return syserror.EEXIST
 	}
-	if parentVFSD.IsDisowned() {
+	if !dir && rp.MustBeDir() {
 		return syserror.ENOENT
 	}
-	return nil
+	// In memfs, the only way to cause a dentry to be disowned is by removing
+	// it from the filesystem, so this check is equivalent to checking if
+	// parent has been removed.
+	if parent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	return create(parent, name)
 }
 
 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	vfsd, inode, err := walkExistingLocked(rp)
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return nil, err
 	}
 	if opts.CheckSearchable {
-		if !inode.isDir() {
+		if !d.inode.isDir() {
 			return nil, syserror.ENOTDIR
 		}
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
 			return nil, err
 		}
 	}
-	inode.incRef()
-	return vfsd, nil
+	d.IncRef()
+	return &d.vfsd, nil
 }
 
-// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
-		return err
-	}
-	if rp.Mount() != vd.Mount() {
-		return syserror.EXDEV
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	d := vd.Dentry().Impl().(*dentry)
-	if d.inode.isDir() {
-		return syserror.EPERM
+		return nil, err
 	}
-	d.inode.incLinksLocked()
-	child := fs.newDentry(d.inode)
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
+	d.IncRef()
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		if rp.Mount() != vd.Mount() {
+			return syserror.EXDEV
+		}
+		d := vd.Dentry().Impl().(*dentry)
+		if d.inode.isDir() {
+			return syserror.EPERM
+		}
+		if d.inode.nlink == 0 {
+			return syserror.ENOENT
+		}
+		if d.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		d.inode.incLinksLocked()
+		child := fs.newDentry(d.inode)
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
 }
 
 // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	parentInode.incLinksLocked() // from child's ".."
-	return nil
+	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
+		if parent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		parent.inode.incLinksLocked() // from child's ".."
+		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
 }
 
 // MknodAt implements vfs.FilesystemImpl.MknodAt.
 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-
-	switch opts.Mode.FileType() {
-	case 0:
-		// "Zero file type is equivalent to type S_IFREG." - mknod(2)
-		fallthrough
-	case linux.ModeRegular:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeNamedPipe:
-		child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
-		parentVFSD.InsertChild(&child.vfsd, pc)
-		parentInode.impl.(*directory).childList.PushBack(child)
-		return nil
-
-	case linux.ModeSocket:
-		// TODO(b/138862511): Implement.
-		return syserror.EINVAL
-
-	case linux.ModeCharacterDevice:
-		fallthrough
-	case linux.ModeBlockDevice:
-		// TODO(b/72101894): We don't support creating block or character
-		// devices at the moment.
-		//
-		// When we start supporting block and character devices, we'll
-		// need to check for CAP_MKNOD here.
-		return syserror.EPERM
-
-	default:
-		// "EINVAL - mode requested creation of something other than a
-		// regular file, device special file, FIFO or socket." - mknod(2)
-		return syserror.EINVAL
-	}
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		switch opts.Mode.FileType() {
+		case 0, linux.S_IFREG:
+			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFIFO:
+			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+			// Not yet supported.
+			return syserror.EPERM
+		default:
+			return syserror.EINVAL
+		}
+	})
 }
 
 // OpenAt implements vfs.FilesystemImpl.OpenAt.
 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	// Filter out flags that are not supported by memfs. O_DIRECTORY and
-	// O_NOFOLLOW have no effect here (they're handled by VFS by setting
-	// appropriate bits in rp), but are visible in FD status flags. O_NONBLOCK
-	// is supported only by pipes.
-	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		// Not yet supported.
+		return nil, syserror.EOPNOTSUPP
+	}
 
+	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
+	// don't need fs.mu for writing.
 	if opts.Flags&linux.O_CREAT == 0 {
 		fs.mu.RLock()
 		defer fs.mu.RUnlock()
-		vfsd, inode, err := walkExistingLocked(rp)
+		d, err := resolveLocked(rp)
 		if err != nil {
 			return nil, err
 		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
+		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
 	}
 
 	mustCreate := opts.Flags&linux.O_EXCL != 0
-	vfsd := rp.Start()
-	inode := vfsd.Impl().(*dentry).inode
+	start := rp.Start().Impl().(*dentry)
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
 	if rp.Done() {
+		// Reject attempts to open directories with O_CREAT.
 		if rp.MustBeDir() {
 			return nil, syserror.EISDIR
 		}
 		if mustCreate {
 			return nil, syserror.EEXIST
 		}
-		return inode.open(ctx, rp, vfsd, opts.Flags, false)
+		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
 	}
 afterTrailingSymlink:
-	// Walk to the parent directory of the last path component.
-	for !rp.Final() {
-		var err error
-		vfsd, inode, err = stepLocked(rp, vfsd, inode)
-		if err != nil {
-			return nil, err
-		}
-	}
-	if !inode.isDir() {
-		return nil, syserror.ENOTDIR
+	parent, err := walkParentDirLocked(rp, start)
+	if err != nil {
+		return nil, err
 	}
 	// Check for search permission in the parent directory.
-	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
 		return nil, err
 	}
 	// Reject attempts to open directories with O_CREAT.
 	if rp.MustBeDir() {
 		return nil, syserror.EISDIR
 	}
-	pc := rp.Component()
-	if pc == "." || pc == ".." {
+	name := rp.Component()
+	if name == "." || name == ".." {
 		return nil, syserror.EISDIR
 	}
 	// Determine whether or not we need to create a file.
-	childVFSD, err := rp.ResolveChild(vfsd, pc)
-	if err != nil {
-		return nil, err
-	}
-	if childVFSD == nil {
+	child, err := stepLocked(rp, parent)
+	if err == syserror.ENOENT {
 		// Already checked for searchability above; now check for writability.
-		if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
+		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
 			return nil, err
 		}
 		if err := rp.Mount().CheckBeginWrite(); err != nil {
@@ -349,38 +309,35 @@ afterTrailingSymlink:
 		}
 		defer rp.Mount().EndWrite()
 		// Create and open the child.
-		childInode := fs.newRegularFile(rp.Credentials(), opts.Mode)
-		child := fs.newDentry(childInode)
-		vfsd.InsertChild(&child.vfsd, pc)
-		inode.impl.(*directory).childList.PushBack(child)
-		return childInode.open(ctx, rp, &child.vfsd, opts.Flags, true)
+		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return child.open(ctx, rp, opts.Flags, true)
 	}
-	// Open existing file or follow symlink.
-	if mustCreate {
-		return nil, syserror.EEXIST
+	if err != nil {
+		return nil, err
 	}
-	childInode := childVFSD.Impl().(*dentry).inode
-	if symlink, ok := childInode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, err
-		}
-		// rp.Final() may no longer be true since we now need to resolve the
-		// symlink target.
+	// Do we need to resolve a trailing symlink?
+	if !rp.Done() {
+		start = parent
 		goto afterTrailingSymlink
 	}
-	return childInode.open(ctx, rp, childVFSD, opts.Flags, false)
+	// Open existing file.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	return child.open(ctx, rp, opts.Flags, false)
 }
 
-func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
 	ats := vfs.AccessTypesForOpenFlags(flags)
 	if !afterCreate {
-		if err := i.checkPermissions(rp.Credentials(), ats, i.isDir()); err != nil {
+		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
 			return nil, err
 		}
 	}
 	mnt := rp.Mount()
-	switch impl := i.impl.(type) {
+	switch impl := d.inode.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
 		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
@@ -392,8 +349,8 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 			// mnt.EndWrite() is called by regularFileFD.Release().
 		}
 		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		d.IncRef()
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
 			impl.data = impl.data[:0]
@@ -408,28 +365,28 @@ func (i *inode) open(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentr
 		}
 		var fd directoryFD
 		mnt.IncRef()
-		vfsd.IncRef()
-		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+		d.IncRef()
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
 		// Can't open symlinks without O_PATH (which is unimplemented).
 		return nil, syserror.ELOOP
 	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, vfsd, flags)
+		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
 	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
 	}
 }
 
 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
 func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
 	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return "", err
 	}
-	symlink, ok := inode.impl.(*symlink)
+	symlink, ok := d.inode.impl.(*symlink)
 	if !ok {
 		return "", syserror.EINVAL
 	}
@@ -437,63 +394,172 @@ func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (st
 }
 
 // RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
-	if rp.Done() {
-		return syserror.ENOENT
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// TODO(b/145974740): Support renameat2 flags.
+		return syserror.EINVAL
 	}
+
+	// Resolve newParent first to verify that it's on this Mount.
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
+	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	_, err = checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
 		return err
 	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
 		return err
 	}
-	defer rp.Mount().EndWrite()
-	// TODO: actually implement RenameAt
-	return syserror.EPERM
+	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the existing child is a symlink or mount point then we want
+	// to rename over it rather than follow it.
+	renamedVFSD := oldParent.vfsd.Child(oldName)
+	if renamedVFSD == nil {
+		return syserror.ENOENT
+	}
+	renamed := renamedVFSD.Impl().(*dentry)
+	if renamed.inode.isDir() {
+		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			// Writability is needed to change renamed's "..".
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	if replacedVFSD != nil {
+		replaced = replacedVFSD.Impl().(*dentry)
+		if replaced.inode.isDir() {
+			if !renamed.inode.isDir() {
+				return syserror.EISDIR
+			}
+			if replaced.vfsd.HasChildren() {
+				return syserror.ENOTEMPTY
+			}
+		} else {
+			if rp.MustBeDir() {
+				return syserror.ENOTDIR
+			}
+			if renamed.inode.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	} else {
+		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+	}
+	if newParent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+
+	// Linux places this check before some of those above; we do it here for
+	// simplicity, under the assumption that applications are not intentionally
+	// doing noop renames expecting them to succeed where non-noop renames
+	// would fail.
+	if renamedVFSD == replacedVFSD {
+		return nil
+	}
+	vfsObj := rp.VirtualFilesystem()
+	oldParentDir := oldParent.inode.impl.(*directory)
+	newParentDir := newParent.inode.impl.(*directory)
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+		return err
+	}
+	if replaced != nil {
+		newParentDir.childList.Remove(replaced)
+		if replaced.inode.isDir() {
+			newParent.inode.decLinksLocked() // from replaced's ".."
+		}
+		replaced.inode.decLinksLocked()
+	}
+	oldParentDir.childList.Remove(renamed)
+	newParentDir.childList.PushBack(renamed)
+	if renamed.inode.isDir() {
+		oldParent.inode.decLinksLocked()
+		newParent.inode.incLinksLocked()
+	}
+	// TODO: update timestamps and parent directory sizes
+	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+	return nil
 }
 
 // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
 func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
 		return err
 	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
 	}
-	if !inode.isDir() {
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if !child.inode.isDir() {
 		return syserror.ENOTDIR
 	}
-	if vfsd.HasChildren() {
+	if childVFSD.HasChildren() {
 		return syserror.ENOTEMPTY
 	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
 		return err
 	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decRef()
+	parent.inode.impl.(*directory).childList.Remove(child)
+	parent.inode.decLinksLocked() // from child's ".."
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
 	return nil
 }
 
 // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
 	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
@@ -507,21 +573,21 @@ func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts
 // StatAt implements vfs.FilesystemImpl.StatAt.
 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
 	fs.mu.RLock()
-	_, inode, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
 	if err != nil {
 		return linux.Statx{}, err
 	}
 	var stat linux.Statx
-	inode.statTo(&stat)
+	d.inode.statTo(&stat)
 	return stat, nil
 }
 
 // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
 func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
 	fs.mu.RLock()
-	_, _, err := walkExistingLocked(rp)
-	fs.mu.RUnlock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return linux.Statfs{}, err
 	}
@@ -531,53 +597,52 @@ func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linu
 
 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	if rp.Done() {
-		return syserror.EEXIST
-	}
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parentVFSD, parentInode, err := walkParentDirLocked(rp)
-	if err != nil {
-		return err
-	}
-	pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
-	if err != nil {
-		return err
-	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer rp.Mount().EndWrite()
-	child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-	parentVFSD.InsertChild(&child.vfsd, pc)
-	parentInode.impl.(*directory).childList.PushBack(child)
-	return nil
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
 }
 
 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
 	fs.mu.Lock()
 	defer fs.mu.Unlock()
-	vfsd, inode, err := walkExistingLocked(rp)
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
 	if err != nil {
 		return err
 	}
-	if err := rp.Mount().CheckBeginWrite(); err != nil {
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
 		return err
 	}
-	defer rp.Mount().EndWrite()
-	if err := checkDeleteLocked(vfsd); err != nil {
-		return err
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
 	}
-	if inode.isDir() {
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if child.inode.isDir() {
 		return syserror.EISDIR
 	}
-	if err := rp.VirtualFilesystem().DeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
+	if rp.MustBeDir() { // path requires a directory, but child is not one
+		return syserror.ENOTDIR
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
 		return err
 	}
-	// Remove from parent directory's childList.
-	vfsd.Parent().Impl().(*dentry).inode.impl.(*directory).childList.Remove(vfsd.Impl().(*dentry))
-	inode.decLinksLocked()
+	parent.inode.impl.(*directory).childList.Remove(child)
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
 	return nil
 }
 
@@ -585,7 +650,7 @@ func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error
 func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return nil, err
 	}
@@ -597,7 +662,7 @@ func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([
 func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return "", err
 	}
@@ -609,7 +674,7 @@ func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, nam
 func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
@@ -621,7 +686,7 @@ func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opt
 func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
 	fs.mu.RLock()
 	defer fs.mu.RUnlock()
-	_, _, err := walkExistingLocked(rp)
+	_, err := resolveLocked(rp)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
index 9d509f6e4..8d0167c93 100644
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ b/pkg/sentry/fsimpl/memfs/memfs.go
@@ -29,6 +29,7 @@ package memfs
 
 import (
 	"fmt"
+	"math"
 	"sync"
 	"sync/atomic"
 
@@ -64,12 +65,6 @@ func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.Virt
 func (fs *filesystem) Release() {
 }
 
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
-	// All filesystem state is in-memory.
-	return nil
-}
-
 // dentry implements vfs.DentryImpl.
 type dentry struct {
 	vfsd vfs.Dentry
@@ -137,6 +132,8 @@ type inode struct {
 	impl interface{} // immutable
 }
 
+const maxLinks = math.MaxUint32 // upper bound on inode.nlink, enforced by incLinksLocked
+
 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
 	i.refs = 1
 	i.mode = uint32(mode)
@@ -147,20 +144,28 @@ func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials,
 	i.impl = impl
 }
 
-// Preconditions: filesystem.mu must be locked for writing.
+// incLinksLocked increments i's link count. It panics without modifying the
+// count if i.nlink is 0 or has already reached maxLinks.
+//
+// Preconditions: filesystem.mu must be locked for writing. 0 < i.nlink < maxLinks.
 func (i *inode) incLinksLocked() {
-	if atomic.AddUint32(&i.nlink, 1) <= 1 {
+	if i.nlink == 0 {
 		panic("memfs.inode.incLinksLocked() called with no existing links")
 	}
+	if i.nlink == maxLinks { // one more increment would overflow nlink
+		panic("memfs.inode.incLinksLocked() called with maximum link count")
+	}
+	atomic.AddUint32(&i.nlink, 1)
 }
 
-// Preconditions: filesystem.mu must be locked for writing.
+// decLinksLocked decrements i's link count, panicking if it is already 0.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
 func (i *inode) decLinksLocked() {
-	if nlink := atomic.AddUint32(&i.nlink, ^uint32(0)); nlink == 0 {
-		i.decRef()
-	} else if nlink == ^uint32(0) { // negative overflow
+	if i.nlink == 0 {
 		panic("memfs.inode.decLinksLocked() called with no existing links")
 	}
+	atomic.AddUint32(&i.nlink, ^uint32(0)) // adds -1; NOTE(review): the replaced code also called i.decRef() at 0 links - confirm the drop is intentional
 }
 
 func (i *inode) incRef() {
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index 5bf527c80..be917aeee 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -19,6 +19,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -38,7 +39,7 @@ func TestSeparateFDs(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	rfdchan := make(chan *vfs.FileDescription)
@@ -76,7 +77,7 @@ func TestNonblockingRead(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
@@ -108,7 +109,7 @@ func TestNonblockingWriteError(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
@@ -126,7 +127,7 @@ func TestSingleFD(t *testing.T) {
 	pop := vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}
 	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
@@ -160,10 +161,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	// Create the pipe.
 	root := mntns.Root()
 	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Pathname:           fileName,
-		FollowFinalSymlink: true,
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(fileName),
 	}
 	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
 	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
@@ -174,7 +174,7 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
 		Root:               root,
 		Start:              root,
-		Pathname:           fileName,
+		Path:               fspath.Parse(fileName),
 		FollowFinalSymlink: true,
 	}, &vfs.StatOptions{})
 	if err != nil {
diff --git a/pkg/sentry/vfs/dentry.go b/pkg/sentry/vfs/dentry.go
index 6209eb053..1bc9c4a38 100644
--- a/pkg/sentry/vfs/dentry.go
+++ b/pkg/sentry/vfs/dentry.go
@@ -234,6 +234,18 @@ func (d *Dentry) InsertChild(child *Dentry, name string) {
 	child.name = name
 }
 
+// IsAncestorOf reports whether d appears anywhere on d2's chain of parents:
+// either directly as d2's parent, or as the parent of one of d2's ancestors.
+// d2 itself is never compared against d.
+func (d *Dentry) IsAncestorOf(d2 *Dentry) bool {
+	for anc := d2.parent; anc != nil; anc = anc.parent {
+		if anc == d {
+			return true
+		}
+	}
+	return false
+}
+
 // PrepareDeleteDentry must be called before attempting to delete the file
 // represented by d. If PrepareDeleteDentry succeeds, the caller must call
 // AbortDeleteDentry or CommitDeleteDentry depending on the deletion's outcome.
@@ -283,21 +295,6 @@ func (vfs *VirtualFilesystem) CommitDeleteDentry(d *Dentry) {
 	}
 }
 
-// DeleteDentry combines PrepareDeleteDentry and CommitDeleteDentry, as
-// appropriate for in-memory filesystems that don't need to ensure that some
-// external state change succeeds before committing the deletion.
-//
-// DeleteDentry is a mutator of d and d.Parent().
-//
-// Preconditions: d is a child Dentry.
-func (vfs *VirtualFilesystem) DeleteDentry(mntns *MountNamespace, d *Dentry) error {
-	if err := vfs.PrepareDeleteDentry(mntns, d); err != nil {
-		return err
-	}
-	vfs.CommitDeleteDentry(d)
-	return nil
-}
-
 // ForceDeleteDentry causes d to become disowned. It should only be used in
 // cases where VFS has no ability to stop the deletion (e.g. d represents the
 // local state of a file on a remote filesystem on which the file has already
@@ -326,7 +323,7 @@ func (vfs *VirtualFilesystem) ForceDeleteDentry(d *Dentry) {
 // CommitRenameExchangeDentry depending on the rename's outcome.
 //
 // Preconditions: from is a child Dentry. If to is not nil, it must be a child
-// Dentry from the same Filesystem.
+// Dentry from the same Filesystem. from != to.
 func (vfs *VirtualFilesystem) PrepareRenameDentry(mntns *MountNamespace, from, to *Dentry) error {
 	if checkInvariants {
 		if from.parent == nil {
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index df03886c3..0b053201a 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -192,6 +192,8 @@ func (fd *FileDescription) Impl() FileDescriptionImpl {
 // be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
 // auth.KGID respectively).
 //
+// All methods may return errors not specified.
+//
 // FileDescriptionImpl is analogous to Linux's struct file_operations.
 type FileDescriptionImpl interface {
 	// Release is called when the associated FileDescription reaches zero
@@ -220,6 +222,10 @@ type FileDescriptionImpl interface {
 	// PRead reads from the file into dst, starting at the given offset, and
 	// returns the number of bytes read. PRead is permitted to return partial
 	// reads with a nil error.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
 	PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
 
 	// Read is similar to PRead, but does not specify an offset.
@@ -229,6 +235,10 @@ type FileDescriptionImpl interface {
 	// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
 	// with Regular File Operations" requires that all operations that may
 	// mutate the FileDescription offset are serialized.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
 	Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
 
 	// PWrite writes src to the file, starting at the given offset, and returns
@@ -238,6 +248,11 @@ type FileDescriptionImpl interface {
 	// As in Linux (but not POSIX), if O_APPEND is in effect for the
 	// FileDescription, PWrite should ignore the offset and append data to the
 	// end of the file.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, PWrite returns
+	// EOPNOTSUPP.
 	PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
 
 	// Write is similar to PWrite, but does not specify an offset, which is
@@ -247,6 +262,10 @@ type FileDescriptionImpl interface {
 	// PWrite that uses a FileDescription offset, to make it possible for
 	// remote filesystems to implement O_APPEND correctly (i.e. atomically with
 	// respect to writers outside the scope of VFS).
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
 	Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
 
 	// IterDirents invokes cb on each entry in the directory represented by the
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index b766614e7..89bd58864 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -108,6 +108,24 @@ func (fs *Filesystem) DecRef() {
 // (responsible for actually implementing the operation) isn't known until path
 // resolution is complete.
 //
+// Unless otherwise specified, FilesystemImpl methods are responsible for
+// performing permission checks. In many cases, vfs package functions in
+// permissions.go may be used to help perform these checks.
+//
+// When multiple specified error conditions apply to a given method call, the
+// implementation may return any applicable errno unless otherwise specified,
+// but returning the earliest error specified is preferable to maximize
+// compatibility with Linux.
+//
+// All methods may return errors not specified, notably including:
+//
+// - ENOENT if a required path component does not exist.
+//
+// - ENOTDIR if an intermediate path component is not a directory.
+//
+// - Errors from vfs-package functions (ResolvingPath.Resolve*(),
+// Mount.CheckBeginWrite(), permission-checking functions, etc.)
+//
 // For all methods that take or return linux.Statx, Statx.Uid and Statx.Gid
 // should be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID
 // and auth.KGID respectively).
@@ -130,46 +148,223 @@ type FilesystemImpl interface {
 	// GetDentryAt does not correspond directly to a Linux syscall; it is used
 	// in the implementation of:
 	//
-	// - Syscalls that need to resolve two paths: rename(), renameat(),
-	// renameat2(), link(), linkat().
+	// - Syscalls that need to resolve two paths: link(), linkat().
 	//
 	// - Syscalls that need to refer to a filesystem position outside the
 	// context of a file description: chdir(), fchdir(), chroot(), mount(),
 	// umount().
 	GetDentryAt(ctx context.Context, rp *ResolvingPath, opts GetDentryOptions) (*Dentry, error)
 
+	// GetParentDentryAt returns a Dentry representing the directory at the
+	// second-to-last path component in rp. (Note that, despite the name, this
+	// is not necessarily the parent directory of the file at rp, since the
+	// last path component in rp may be "." or "..".) A reference is taken on
+	// the returned Dentry.
+	//
+	// GetParentDentryAt does not correspond directly to a Linux syscall; it is
+	// used in the implementation of the rename() family of syscalls, which
+	// must resolve the parent directories of two paths.
+	//
+	// Preconditions: !rp.Done().
+	//
+	// Postconditions: If GetParentDentryAt returns a nil error, then
+	// rp.Final(). If GetParentDentryAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error)
+
 	// LinkAt creates a hard link at rp representing the same file as vd. It
 	// does not take ownership of references on vd.
 	//
-	// The implementation is responsible for checking that vd.Mount() ==
-	// rp.Mount(), and that vd does not represent a directory.
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", LinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, LinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), LinkAt returns ENOENT.
+	//
+	// - If the directory in which the link would be created has been removed
+	// by RmdirAt or RenameAt, LinkAt returns ENOENT.
+	//
+	// - If rp.Mount() != vd.Mount(), LinkAt returns EXDEV.
+	//
+	// - If vd represents a directory, LinkAt returns EPERM.
+	//
+	// - If vd represents a file for which all existing links have been
+	// removed, or a file created by open(O_TMPFILE|O_EXCL), LinkAt returns
+	// ENOENT. Equivalently, if vd represents a file with a link count of 0 not
+	// created by open(O_TMPFILE) without O_EXCL, LinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If LinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error
 
 	// MkdirAt creates a directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MkdirAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MkdirAt returns EEXIST.
+	//
+	// - If the directory in which the new directory would be created has been
+	// removed by RmdirAt or RenameAt, MkdirAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MkdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	MkdirAt(ctx context.Context, rp *ResolvingPath, opts MkdirOptions) error
 
 	// MknodAt creates a regular file, device special file, or named pipe at
 	// rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", MknodAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, MknodAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), MknodAt returns ENOENT.
+	//
+	// - If the directory in which the file would be created has been removed
+	// by RmdirAt or RenameAt, MknodAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If MknodAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	MknodAt(ctx context.Context, rp *ResolvingPath, opts MknodOptions) error
 
 	// OpenAt returns an FileDescription providing access to the file at rp. A
 	// reference is taken on the returned FileDescription.
+	//
+	// Errors:
+	//
+	// - If opts.Flags specifies O_TMPFILE and this feature is unsupported by
+	// the implementation, OpenAt returns EOPNOTSUPP. (All other unsupported
+	// features are silently ignored, consistently with Linux's open*(2).)
 	OpenAt(ctx context.Context, rp *ResolvingPath, opts OpenOptions) (*FileDescription, error)
 
 	// ReadlinkAt returns the target of the symbolic link at rp.
+	//
+	// Errors:
+	//
+	// - If the file at rp is not a symbolic link, ReadlinkAt returns EINVAL.
 	ReadlinkAt(ctx context.Context, rp *ResolvingPath) (string, error)
 
-	// RenameAt renames the Dentry represented by vd to rp. It does not take
-	// ownership of references on vd.
+	// RenameAt renames the file named oldName in directory oldParentVD to rp.
+	// It does not take ownership of references on oldParentVD.
+	//
+	// Errors [1]:
+	//
+	// - If opts.Flags specifies unsupported options, RenameAt returns EINVAL.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags
+	// contains RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If the last path component in rp is "." or "..", and opts.Flags does
+	// not contain RENAME_NOREPLACE, RenameAt returns EBUSY.
+	//
+	// - If rp.Mount() != oldParentVD.Mount(), RenameAt returns EXDEV.
 	//
-	// The implementation is responsible for checking that vd.Mount() ==
-	// rp.Mount().
-	RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error
+	// - If the renamed file is not a directory, and opts.MustBeDir is true,
+	// RenameAt returns ENOTDIR.
+	//
+	// - If renaming would replace an existing file and opts.Flags contains
+	// RENAME_NOREPLACE, RenameAt returns EEXIST.
+	//
+	// - If there is no existing file at rp and opts.Flags contains
+	// RENAME_EXCHANGE, RenameAt returns ENOENT.
+	//
+	// - If there is an existing non-directory file at rp, and rp.MustBeDir()
+	// is true, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, opts.Flags does not contain
+	// RENAME_EXCHANGE, and rp.MustBeDir() is true, RenameAt returns ENOTDIR.
+	// (This check is not subsumed by the check for directory replacement below
+	// since it applies even if there is no file to replace.)
+	//
+	// - If the renamed file is a directory, and the new parent directory of
+	// the renamed file is either the renamed directory or a descendant
+	// subdirectory of the renamed directory, RenameAt returns EINVAL.
+	//
+	// - If renaming would exchange the renamed file with an ancestor directory
+	// of the renamed file, RenameAt returns EINVAL.
+	//
+	// - If renaming would replace an ancestor directory of the renamed file,
+	// RenameAt returns ENOTEMPTY. (This check would be subsumed by the
+	// non-empty directory check below; however, this check takes place before
+	// the self-rename check.)
+	//
+	// - If the renamed file would replace or exchange with itself (i.e. the
+	// source and destination paths resolve to the same file), RenameAt returns
+	// nil, skipping the checks described below.
+	//
+	// - If the source or destination directory is not writable by the provider
+	// of rp.Credentials(), RenameAt returns EACCES.
+	//
+	// - If the renamed file is a directory, and renaming would replace a
+	// non-directory file, RenameAt returns ENOTDIR.
+	//
+	// - If the renamed file is not a directory, and renaming would replace a
+	// directory, RenameAt returns EISDIR.
+	//
+	// - If the new parent directory of the renamed file has been removed by
+	// RmdirAt or a preceding call to RenameAt, RenameAt returns ENOENT.
+	//
+	// - If the renamed file is a directory, it is not writable by the
+	// provider of rp.Credentials(), and the source and destination parent
+	// directories are different, RenameAt returns EACCES. (This is nominally
+	// required to change the ".." entry in the renamed directory.)
+	//
+	// - If renaming would replace a non-empty directory, RenameAt returns
+	// ENOTEMPTY.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink(). oldName is not "." or "..".
+	//
+	// Postconditions: If RenameAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
+	//
+	// [1] "The worst of all namespace operations - renaming directory.
+	// "Perverted" doesn't even start to describe it. Somebody in UCB had a
+	// heck of a trip..." - fs/namei.c:vfs_rename()
+	RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error
 
 	// RmdirAt removes the directory at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is ".", RmdirAt returns EINVAL.
+	//
+	// - If the last path component in rp is "..", RmdirAt returns ENOTEMPTY.
+	//
+	// - If no file exists at rp, RmdirAt returns ENOENT.
+	//
+	// - If the file at rp exists but is not a directory, RmdirAt returns
+	// ENOTDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If RmdirAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	RmdirAt(ctx context.Context, rp *ResolvingPath) error
 
 	// SetStatAt updates metadata for the file at the given path.
+	//
+	// Errors:
+	//
+	// - If opts specifies unsupported options, SetStatAt returns EINVAL.
 	SetStatAt(ctx context.Context, rp *ResolvingPath, opts SetStatOptions) error
 
 	// StatAt returns metadata for the file at rp.
@@ -181,9 +376,45 @@ type FilesystemImpl interface {
 	StatFSAt(ctx context.Context, rp *ResolvingPath) (linux.Statfs, error)
 
 	// SymlinkAt creates a symbolic link at rp referring to the given target.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", SymlinkAt returns
+	// EEXIST.
+	//
+	// - If a file already exists at rp, SymlinkAt returns EEXIST.
+	//
+	// - If rp.MustBeDir(), SymlinkAt returns ENOENT.
+	//
+	// - If the directory in which the symbolic link would be created has been
+	// removed by RmdirAt or RenameAt, SymlinkAt returns ENOENT.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If SymlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	SymlinkAt(ctx context.Context, rp *ResolvingPath, target string) error
 
-	// UnlinkAt removes the non-directory file at rp.
+	// UnlinkAt removes the file at rp.
+	//
+	// Errors:
+	//
+	// - If the last path component in rp is "." or "..", UnlinkAt returns
+	// EISDIR.
+	//
+	// - If no file exists at rp, UnlinkAt returns ENOENT.
+	//
+	// - If rp.MustBeDir(), and the file at rp exists and is not a directory,
+	// UnlinkAt returns ENOTDIR.
+	//
+	// - If the file at rp exists but is a directory, UnlinkAt returns EISDIR.
+	//
+	// Preconditions: !rp.Done(). For the final path component in rp,
+	// !rp.ShouldFollowSymlink().
+	//
+	// Postconditions: If UnlinkAt returns an error returned by
+	// ResolvingPath.Resolve*(), then !rp.Done().
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
 	// ListxattrAt returns all extended attribute names for the file at rp.
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 97ee4a446..87d2b0d1c 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -83,6 +83,9 @@ type ReadOptions struct {
 type RenameOptions struct {
 	// Flags contains flags as specified for renameat2(2).
 	Flags uint32
+
+	// MustBeDir, if true, requires the renamed file to be a directory.
+	MustBeDir bool
 }
 
 // SetStatOptions contains options to VirtualFilesystem.SetStatAt(),
diff --git a/pkg/sentry/vfs/resolving_path.go b/pkg/sentry/vfs/resolving_path.go
index d580fd39e..f0641d314 100644
--- a/pkg/sentry/vfs/resolving_path.go
+++ b/pkg/sentry/vfs/resolving_path.go
@@ -112,30 +112,26 @@ var resolvingPathPool = sync.Pool{
 	},
 }
 
-func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) (*ResolvingPath, error) {
-	path, err := fspath.Parse(pop.Pathname)
-	if err != nil {
-		return nil, err
-	}
+func (vfs *VirtualFilesystem) getResolvingPath(creds *auth.Credentials, pop *PathOperation) *ResolvingPath {
 	rp := resolvingPathPool.Get().(*ResolvingPath)
 	rp.vfs = vfs
 	rp.root = pop.Root
 	rp.mount = pop.Start.mount
 	rp.start = pop.Start.dentry
-	rp.pit = path.Begin
+	rp.pit = pop.Path.Begin // pop.Path arrives already parsed, so no parse error can occur here
 	rp.flags = 0
 	if pop.FollowFinalSymlink {
 		rp.flags |= rpflagsFollowFinalSymlink
 	}
-	rp.mustBeDir = path.Dir
-	rp.mustBeDirOrig = path.Dir
+	rp.mustBeDir = pop.Path.Dir
+	rp.mustBeDirOrig = pop.Path.Dir
 	rp.symlinks = 0
 	rp.curPart = 0
 	rp.numOrigParts = 1
 	rp.creds = creds
-	rp.parts[0] = path.Begin
-	rp.origParts[0] = path.Begin
-	return rp, nil
+	rp.parts[0] = pop.Path.Begin
+	rp.origParts[0] = pop.Path.Begin
+	return rp // callers hand rp back with vfs.putResolvingPath when finished
 }
 
 func (vfs *VirtualFilesystem) putResolvingPath(rp *ResolvingPath) {
@@ -345,29 +341,34 @@ func (rp *ResolvingPath) ShouldFollowSymlink() bool {
 // symlink target and returns nil. Otherwise it returns a non-nil error.
 //
 // Preconditions: !rp.Done().
+//
+// Postconditions: If HandleSymlink returns a nil error, then !rp.Done().
 func (rp *ResolvingPath) HandleSymlink(target string) error {
 	if rp.symlinks >= linux.MaxSymlinkTraversals {
 		return syserror.ELOOP
 	}
-	targetPath, err := fspath.Parse(target)
-	if err != nil {
-		return err
+	if len(target) == 0 { // an empty symlink target is rejected before it counts as a traversal
+		return syserror.ENOENT
 	}
 	rp.symlinks++
+	targetPath := fspath.Parse(target)
 	if targetPath.Absolute {
 		rp.absSymlinkTarget = targetPath
 		return resolveAbsSymlinkError{}
 	}
-	if !targetPath.Begin.Ok() {
-		panic(fmt.Sprintf("symbolic link has non-empty target %q that is both relative and has no path components?", target))
-	}
 	// Consume the path component that represented the symlink.
 	rp.Advance()
 	// Prepend the symlink target to the relative path.
+	if checkInvariants { // relpathPrepend requires path.HasComponents()
+		if !targetPath.HasComponents() {
+			panic(fmt.Sprintf("non-empty pathname %q parsed to relative path with no components", target))
+		}
+	}
 	rp.relpathPrepend(targetPath)
 	return nil
 }
 
+// Preconditions: path.HasComponents().
 func (rp *ResolvingPath) relpathPrepend(path fspath.Path) {
 	if rp.pit.Ok() {
 		rp.parts[rp.curPart] = rp.pit
@@ -467,6 +468,17 @@ func (rp *ResolvingPath) handleError(err error) bool {
 	}
 }
 
+// canHandleError reports whether err is one of the error types returned by
+// rp.Resolve*() that rp.handleError() may attempt to handle.
+func (rp *ResolvingPath) canHandleError(err error) bool {
+	handleable := false
+	switch err.(type) {
+	case resolveMountRootOrJumpError, resolveMountPointError, resolveAbsSymlinkError:
+		handleable = true
+	}
+	return handleable
+}
+
 // MustBeDir returns true if the file traversed by rp must be a directory.
 func (rp *ResolvingPath) MustBeDir() bool {
 	return rp.mustBeDir
diff --git a/pkg/sentry/vfs/testutil.go b/pkg/sentry/vfs/testutil.go
index d94117bce..ee5c8b9e2 100644
--- a/pkg/sentry/vfs/testutil.go
+++ b/pkg/sentry/vfs/testutil.go
@@ -57,6 +57,11 @@ func (fs *FDTestFilesystem) GetDentryAt(ctx context.Context, rp *ResolvingPath,
 	return nil, syserror.EPERM
 }
 
+// GetParentDentryAt implements FilesystemImpl.GetParentDentryAt.
+func (fs *FDTestFilesystem) GetParentDentryAt(ctx context.Context, rp *ResolvingPath) (*Dentry, error) {
+	return nil, syserror.EPERM // always fails; no parent Dentry is ever returned
+}
+
 // LinkAt implements FilesystemImpl.LinkAt.
 func (fs *FDTestFilesystem) LinkAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry) error {
 	return syserror.EPERM
@@ -83,7 +88,7 @@ func (fs *FDTestFilesystem) ReadlinkAt(ctx context.Context, rp *ResolvingPath) (
 }
 
 // RenameAt implements FilesystemImpl.RenameAt.
-func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, vd VirtualDentry, opts RenameOptions) error {
+func (fs *FDTestFilesystem) RenameAt(ctx context.Context, rp *ResolvingPath, oldParentVD VirtualDentry, oldName string, opts RenameOptions) error {
 	return syserror.EPERM
 }
 
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index e60898d7c..3e4df8558 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -28,9 +28,11 @@
 package vfs
 
 import (
+	"fmt"
 	"sync"
 
 	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
 	"gvisor.dev/gvisor/pkg/sentry/context"
 	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
 	"gvisor.dev/gvisor/pkg/syserror"
@@ -111,11 +113,11 @@ type PathOperation struct {
 	// are borrowed from the provider of the PathOperation (i.e. the caller of
 	// the VFS method to which the PathOperation was passed).
 	//
-	// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
+	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
 	Start VirtualDentry
 
 	// Path is the pathname traversed by this operation.
-	Pathname string
+	Path fspath.Path
 
 	// If FollowFinalSymlink is true, and the Dentry traversed by the final
 	// path component represents a symbolic link, the symbolic link should be
@@ -126,10 +128,7 @@ type PathOperation struct {
 // GetDentryAt returns a VirtualDentry representing the given path, at which a
 // file must exist. A reference is taken on the returned VirtualDentry.
 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return VirtualDentry{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
 		if err == nil {
@@ -148,6 +147,33 @@ func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Crede
 	}
 }
 
+// getParentDirAndName asks the filesystem(s) traversed by pop for the
+// directory at the second-to-last component of pop.Path, and returns it
+// along with the final component's name. A reference is taken on the mount.
+//
+// Preconditions: pop.Path.Begin.Ok().
+func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) {
+	rp := vfs.getResolvingPath(creds, pop)
+	for {
+		dir, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp)
+		if err == nil {
+			vd := VirtualDentry{mount: rp.mount, dentry: dir}
+			rp.mount.IncRef()
+			// Read the final component before rp is handed back.
+			name := rp.Component()
+			vfs.putResolvingPath(rp)
+			return vd, name, nil
+		}
+		if checkInvariants && rp.canHandleError(err) && rp.Done() {
+			panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+		}
+		if !rp.handleError(err) {
+			vfs.putResolvingPath(rp)
+			return VirtualDentry{}, "", err
+		}
+	}
+}
+
 // LinkAt creates a hard link at newpop representing the existing file at
 // oldpop.
 func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error {
@@ -155,21 +181,36 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 	if err != nil {
 		return err
 	}
-	rp, err := vfs.getResolvingPath(creds, newpop)
-	if err != nil {
+
+	if !newpop.Path.Begin.Ok() {
 		oldVD.DecRef()
-		return err
+		if newpop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
 	}
+	if newpop.FollowFinalSymlink {
+		oldVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
 	for {
 		err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD)
 		if err == nil {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldVD.DecRef()
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldVD.DecRef()
 			return err
 		}
 	}
@@ -177,19 +218,32 @@ func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credential
 
 // MkdirAt creates a directory at the given path.
 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
+	}
 	// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
 	// also honored." - mkdir(2)
 	opts.Mode &= 0777 | linux.S_ISVTX
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -200,16 +254,29 @@ func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentia
 // MknodAt creates a file of the given mode at the given path. It returns an
 // error from the syserror package.
 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
-		if err = rp.mount.fs.impl.MknodAt(ctx, rp, *opts); err == nil {
+		err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts)
+		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
-		// Handle mount traversals.
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -259,10 +326,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 	if opts.Flags&linux.O_NOFOLLOW != 0 {
 		pop.FollowFinalSymlink = false
 	}
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	if opts.Flags&linux.O_DIRECTORY != 0 {
 		rp.mustBeDir = true
 		rp.mustBeDirOrig = true
@@ -282,10 +346,7 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
 
 // ReadlinkAt returns the target of the symbolic link at the given path.
 func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return "", err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp)
 		if err == nil {
@@ -301,25 +362,59 @@ func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Creden
 
 // RenameAt renames the file at oldpop to newpop.
 func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error {
-	oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{})
-	if err != nil {
-		return err
+	if !oldpop.Path.Begin.Ok() {
+		if oldpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
 	}
-	rp, err := vfs.getResolvingPath(creds, newpop)
+	if oldpop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop)
 	if err != nil {
-		oldVD.DecRef()
 		return err
 	}
+	if oldName == "." || oldName == ".." {
+		oldParentVD.DecRef()
+		return syserror.EBUSY
+	}
+
+	if !newpop.Path.Begin.Ok() {
+		oldParentVD.DecRef()
+		if newpop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	}
+	if newpop.FollowFinalSymlink {
+		oldParentVD.DecRef()
+		ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, newpop)
+	renameOpts := *opts
+	if oldpop.Path.Dir {
+		renameOpts.MustBeDir = true
+	}
 	for {
-		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldVD, *opts)
+		err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts)
 		if err == nil {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldParentVD.DecRef()
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
-			oldVD.DecRef()
 			vfs.putResolvingPath(rp)
+			oldParentVD.DecRef()
 			return err
 		}
 	}
@@ -327,16 +422,29 @@ func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credenti
 
 // RmdirAt removes the directory at the given path.
 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
 	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
+	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.RmdirAt(ctx, rp)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -346,10 +454,7 @@ func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentia
 
 // SetStatAt changes metadata for the file at the given path.
 func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts)
 		if err == nil {
@@ -365,10 +470,7 @@ func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credent
 
 // StatAt returns metadata for the file at the given path.
 func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statx{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
 		if err == nil {
@@ -385,10 +487,7 @@ func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credential
 // StatFSAt returns metadata for the filesystem containing the file at the
 // given path.
 func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return linux.Statfs{}, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp)
 		if err == nil {
@@ -404,16 +503,29 @@ func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credenti
 
 // SymlinkAt creates a symbolic link at the given path with the given target.
 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EEXIST
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -423,16 +535,29 @@ func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credent
 
 // UnlinkAt deletes the non-directory file at the given path.
 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
+	if !pop.Path.Begin.Ok() {
+		if pop.Path.Absolute {
+			return syserror.EBUSY
+		}
+		return syserror.ENOENT
+	}
+	if pop.FollowFinalSymlink {
+		ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink")
+		return syserror.EINVAL
 	}
+
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.UnlinkAt(ctx, rp)
 		if err == nil {
 			vfs.putResolvingPath(rp)
 			return nil
 		}
+		if checkInvariants {
+			if rp.canHandleError(err) && rp.Done() {
+				panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %T", rp.mount.fs.impl, err))
+			}
+		}
 		if !rp.handleError(err) {
 			vfs.putResolvingPath(rp)
 			return err
@@ -443,10 +568,7 @@ func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credenti
 // ListxattrAt returns all extended attribute names for the file at the given
 // path.
 func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) ([]string, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return nil, err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		names, err := rp.mount.fs.impl.ListxattrAt(ctx, rp)
 		if err == nil {
@@ -471,10 +593,7 @@ func (vfs *VirtualFilesystem) ListxattrAt(ctx context.Context, creds *auth.Crede
 // GetxattrAt returns the value associated with the given extended attribute
 // for the file at the given path.
 func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) (string, error) {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return "", err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		val, err := rp.mount.fs.impl.GetxattrAt(ctx, rp, name)
 		if err == nil {
@@ -491,10 +610,7 @@ func (vfs *VirtualFilesystem) GetxattrAt(ctx context.Context, creds *auth.Creden
 // SetxattrAt changes the value associated with the given extended attribute
 // for the file at the given path.
 func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetxattrOptions) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
 		if err == nil {
@@ -510,10 +626,7 @@ func (vfs *VirtualFilesystem) SetxattrAt(ctx context.Context, creds *auth.Creden
 
 // RemovexattrAt removes the given extended attribute from the file at rp.
 func (vfs *VirtualFilesystem) RemovexattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error {
-	rp, err := vfs.getResolvingPath(creds, pop)
-	if err != nil {
-		return err
-	}
+	rp := vfs.getResolvingPath(creds, pop)
 	for {
 		err := rp.mount.fs.impl.RemovexattrAt(ctx, rp, name)
 		if err == nil {
diff --git a/pkg/syserror/syserror.go b/pkg/syserror/syserror.go
index 1987e89cc..2269f6237 100644
--- a/pkg/syserror/syserror.go
+++ b/pkg/syserror/syserror.go
@@ -45,6 +45,7 @@ var (
 	ELIBBAD      = error(syscall.ELIBBAD)
 	ELOOP        = error(syscall.ELOOP)
 	EMFILE       = error(syscall.EMFILE)
+	EMLINK       = error(syscall.EMLINK)
 	EMSGSIZE     = error(syscall.EMSGSIZE)
 	ENAMETOOLONG = error(syscall.ENAMETOOLONG)
 	ENOATTR      = ENODATA
-- 
cgit v1.2.3


From 796f53c0befc21570b185811e26b74e71950dfc3 Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Fri, 27 Dec 2019 00:12:14 -0800
Subject: Add VFS2 support for /proc/filesystems.

Updates #1195

PiperOrigin-RevId: 287269106
---
 pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go |  4 +-
 pkg/sentry/fsimpl/ext/ext_test.go                 |  4 +-
 pkg/sentry/fsimpl/kernfs/kernfs_test.go           |  4 +-
 pkg/sentry/fsimpl/memfs/benchmark_test.go         |  8 +++-
 pkg/sentry/fsimpl/memfs/pipe_test.go              |  4 +-
 pkg/sentry/fsimpl/proc/tasks_test.go              | 30 +++++++------
 pkg/sentry/vfs/file_description_impl_util_test.go |  2 +-
 pkg/sentry/vfs/filesystem_type.go                 | 55 ++++++++++++++++++++---
 pkg/sentry/vfs/mount.go                           | 15 ++++---
 pkg/sentry/vfs/options.go                         |  4 ++
 pkg/sentry/vfs/vfs.go                             | 12 ++---
 11 files changed, 103 insertions(+), 39 deletions(-)

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
index 2f46d2d13..a56b03711 100644
--- a/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
+++ b/pkg/sentry/fsimpl/ext/benchmark/benchmark_test.go
@@ -50,7 +50,9 @@ func setUp(b *testing.B, imagePath string) (context.Context, *vfs.VirtualFilesys
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("extfs", ext.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, imagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
diff --git a/pkg/sentry/fsimpl/ext/ext_test.go b/pkg/sentry/fsimpl/ext/ext_test.go
index 5d6c999bd..6c14a1e2d 100644
--- a/pkg/sentry/fsimpl/ext/ext_test.go
+++ b/pkg/sentry/fsimpl/ext/ext_test.go
@@ -66,7 +66,9 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.VirtualFilesys
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("extfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, localImagePath, "extfs", &vfs.GetFilesystemOptions{InternalData: int(f.Fd())})
 	if err != nil {
 		f.Close()
diff --git a/pkg/sentry/fsimpl/kernfs/kernfs_test.go b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
index 3db12caa0..4b6b95f5f 100644
--- a/pkg/sentry/fsimpl/kernfs/kernfs_test.go
+++ b/pkg/sentry/fsimpl/kernfs/kernfs_test.go
@@ -59,7 +59,9 @@ func newTestSystem(t *testing.T, rootFn RootDentryFn) *TestSystem {
 	ctx := contexttest.Context(t)
 	creds := auth.CredentialsFromContext(ctx)
 	v := vfs.New()
-	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn})
+	v.MustRegisterFilesystemType("testfs", &fsType{rootFn: rootFn}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mns, err := v.NewMountNamespace(ctx, creds, "", "testfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("Failed to create testfs root mount: %v", err)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
index 6e987af88..a27876a4e 100644
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ b/pkg/sentry/fsimpl/memfs/benchmark_test.go
@@ -176,7 +176,9 @@ func BenchmarkVFS2MemfsStat(b *testing.B) {
 
 			// Create VFS.
 			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
 			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
@@ -365,7 +367,9 @@ func BenchmarkVFS2MemfsMountStat(b *testing.B) {
 
 			// Create VFS.
 			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{})
+			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
 			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 			if err != nil {
 				b.Fatalf("failed to create tmpfs root mount: %v", err)
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
index be917aeee..807c1af7a 100644
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ b/pkg/sentry/fsimpl/memfs/pipe_test.go
@@ -152,7 +152,9 @@ func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesy
 
 	// Create VFS.
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{})
+	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create tmpfs root mount: %v", err)
diff --git a/pkg/sentry/fsimpl/proc/tasks_test.go b/pkg/sentry/fsimpl/proc/tasks_test.go
index 48201d75a..2560fcef9 100644
--- a/pkg/sentry/fsimpl/proc/tasks_test.go
+++ b/pkg/sentry/fsimpl/proc/tasks_test.go
@@ -56,25 +56,25 @@ func checkDots(dirs []vfs.Dirent) ([]vfs.Dirent, error) {
 
 func checkTasksStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 	wants := map[string]vfs.Dirent{
-		"loadavg":     vfs.Dirent{Type: linux.DT_REG},
-		"meminfo":     vfs.Dirent{Type: linux.DT_REG},
-		"mounts":      vfs.Dirent{Type: linux.DT_LNK},
-		"self":        vfs.Dirent{Type: linux.DT_LNK},
-		"stat":        vfs.Dirent{Type: linux.DT_REG},
-		"thread-self": vfs.Dirent{Type: linux.DT_LNK},
-		"version":     vfs.Dirent{Type: linux.DT_REG},
+		"loadavg":     {Type: linux.DT_REG},
+		"meminfo":     {Type: linux.DT_REG},
+		"mounts":      {Type: linux.DT_LNK},
+		"self":        {Type: linux.DT_LNK},
+		"stat":        {Type: linux.DT_REG},
+		"thread-self": {Type: linux.DT_LNK},
+		"version":     {Type: linux.DT_REG},
 	}
 	return checkFiles(gots, wants)
 }
 
 func checkTaskStaticFiles(gots []vfs.Dirent) ([]vfs.Dirent, error) {
 	wants := map[string]vfs.Dirent{
-		"io":     vfs.Dirent{Type: linux.DT_REG},
-		"maps":   vfs.Dirent{Type: linux.DT_REG},
-		"smaps":  vfs.Dirent{Type: linux.DT_REG},
-		"stat":   vfs.Dirent{Type: linux.DT_REG},
-		"statm":  vfs.Dirent{Type: linux.DT_REG},
-		"status": vfs.Dirent{Type: linux.DT_REG},
+		"io":     {Type: linux.DT_REG},
+		"maps":   {Type: linux.DT_REG},
+		"smaps":  {Type: linux.DT_REG},
+		"stat":   {Type: linux.DT_REG},
+		"statm":  {Type: linux.DT_REG},
+		"status": {Type: linux.DT_REG},
 	}
 	return checkFiles(gots, wants)
 }
@@ -114,7 +114,9 @@ func setup() (context.Context, *vfs.VirtualFilesystem, vfs.VirtualDentry, error)
 	creds := auth.CredentialsFromContext(ctx)
 
 	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{})
+	vfsObj.MustRegisterFilesystemType("procfs", &procFSType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "procfs", &vfs.GetFilesystemOptions{})
 	if err != nil {
 		return nil, nil, vfs.VirtualDentry{}, fmt.Errorf("NewMountNamespace(): %v", err)
diff --git a/pkg/sentry/vfs/file_description_impl_util_test.go b/pkg/sentry/vfs/file_description_impl_util_test.go
index 678be07fe..9ed58512f 100644
--- a/pkg/sentry/vfs/file_description_impl_util_test.go
+++ b/pkg/sentry/vfs/file_description_impl_util_test.go
@@ -89,7 +89,7 @@ func TestGenCountFD(t *testing.T) {
 	creds := auth.CredentialsFromContext(ctx)
 
 	vfsObj := New() // vfs.New()
-	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{})
+	vfsObj.MustRegisterFilesystemType("testfs", FDTestFilesystemType{}, &RegisterFilesystemTypeOptions{})
 	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "testfs", &GetFilesystemOptions{})
 	if err != nil {
 		t.Fatalf("failed to create testfs root mount: %v", err)
diff --git a/pkg/sentry/vfs/filesystem_type.go b/pkg/sentry/vfs/filesystem_type.go
index c335e206d..023301780 100644
--- a/pkg/sentry/vfs/filesystem_type.go
+++ b/pkg/sentry/vfs/filesystem_type.go
@@ -15,6 +15,7 @@
 package vfs
 
 import (
+	"bytes"
 	"fmt"
 
 	"gvisor.dev/gvisor/pkg/sentry/context"
@@ -43,28 +44,70 @@ type GetFilesystemOptions struct {
 	InternalData interface{}
 }
 
+type registeredFilesystemType struct {
+	fsType FilesystemType
+	opts   RegisterFilesystemTypeOptions
+}
+
+// RegisterFilesystemTypeOptions contains options to
+// VirtualFilesystem.RegisterFilesystemType().
+type RegisterFilesystemTypeOptions struct {
+	// If AllowUserMount is true, allow calls to VirtualFilesystem.MountAt()
+	// for which MountOptions.InternalMount == false to use this filesystem
+	// type.
+	AllowUserMount bool
+
+	// If AllowUserList is true, make this filesystem type visible in
+	// /proc/filesystems.
+	AllowUserList bool
+
+	// If RequiresDevice is true, indicate that mounting this filesystem
+	// requires a block device as the mount source in /proc/filesystems.
+	RequiresDevice bool
+}
+
 // RegisterFilesystemType registers the given FilesystemType in vfs with the
 // given name.
-func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType) error {
+func (vfs *VirtualFilesystem) RegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) error {
 	vfs.fsTypesMu.Lock()
 	defer vfs.fsTypesMu.Unlock()
 	if existing, ok := vfs.fsTypes[name]; ok {
-		return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing)
+		return fmt.Errorf("name %q is already registered to filesystem type %T", name, existing.fsType)
+	}
+	vfs.fsTypes[name] = &registeredFilesystemType{
+		fsType: fsType,
+		opts:   *opts,
 	}
-	vfs.fsTypes[name] = fsType
 	return nil
 }
 
 // MustRegisterFilesystemType is equivalent to RegisterFilesystemType but
 // panics on failure.
-func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType) {
-	if err := vfs.RegisterFilesystemType(name, fsType); err != nil {
+func (vfs *VirtualFilesystem) MustRegisterFilesystemType(name string, fsType FilesystemType, opts *RegisterFilesystemTypeOptions) {
+	if err := vfs.RegisterFilesystemType(name, fsType, opts); err != nil {
 		panic(fmt.Sprintf("failed to register filesystem type %T: %v", fsType, err))
 	}
 }
 
-func (vfs *VirtualFilesystem) getFilesystemType(name string) FilesystemType {
+func (vfs *VirtualFilesystem) getFilesystemType(name string) *registeredFilesystemType {
 	vfs.fsTypesMu.RLock()
 	defer vfs.fsTypesMu.RUnlock()
 	return vfs.fsTypes[name]
 }
+
+// GenerateProcFilesystems emits the contents of /proc/filesystems for vfs to
+// buf.
+func (vfs *VirtualFilesystem) GenerateProcFilesystems(buf *bytes.Buffer) {
+	vfs.fsTypesMu.RLock()
+	defer vfs.fsTypesMu.RUnlock()
+	for name, rft := range vfs.fsTypes {
+		if !rft.opts.AllowUserList {
+			continue
+		}
+		var nodev string
+		if !rft.opts.RequiresDevice {
+			nodev = "nodev"
+		}
+		fmt.Fprintf(buf, "%s\t%s\n", nodev, name)
+	}
+}
diff --git a/pkg/sentry/vfs/mount.go b/pkg/sentry/vfs/mount.go
index ec23ab0dd..00177b371 100644
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@@ -112,11 +112,11 @@ type MountNamespace struct {
 // configured by the given arguments. A reference is taken on the returned
 // MountNamespace.
 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *GetFilesystemOptions) (*MountNamespace, error) {
-	fsType := vfs.getFilesystemType(fsTypeName)
-	if fsType == nil {
+	rft := vfs.getFilesystemType(fsTypeName)
+	if rft == nil {
 		return nil, syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, *opts)
 	if err != nil {
 		return nil, err
 	}
@@ -136,11 +136,14 @@ func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth
 
 // MountAt creates and mounts a Filesystem configured by the given arguments.
 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) error {
-	fsType := vfs.getFilesystemType(fsTypeName)
-	if fsType == nil {
+	rft := vfs.getFilesystemType(fsTypeName)
+	if rft == nil {
 		return syserror.ENODEV
 	}
-	fs, root, err := fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
+	if !opts.InternalMount && !rft.opts.AllowUserMount {
+		return syserror.ENODEV
+	}
+	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
 	if err != nil {
 		return err
 	}
diff --git a/pkg/sentry/vfs/options.go b/pkg/sentry/vfs/options.go
index 87d2b0d1c..b7774bf28 100644
--- a/pkg/sentry/vfs/options.go
+++ b/pkg/sentry/vfs/options.go
@@ -50,6 +50,10 @@ type MknodOptions struct {
 type MountOptions struct {
 	// GetFilesystemOptions contains options to FilesystemType.GetFilesystem().
 	GetFilesystemOptions GetFilesystemOptions
+
+	// If InternalMount is true, allow the use of filesystem types for which
+	// RegisterFilesystemTypeOptions.AllowUserMount == false.
+	InternalMount bool
 }
 
 // OpenOptions contains options to VirtualFilesystem.OpenAt() and
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index 3e4df8558..a3bdb5805 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -75,23 +75,23 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
+	// fsTypesMu.
+	fsTypesMu sync.RWMutex
+	fsTypes   map[string]*registeredFilesystemType
+
 	// filesystems contains all Filesystems. filesystems is protected by
 	// filesystemsMu.
 	filesystemsMu sync.Mutex
 	filesystems   map[*Filesystem]struct{}
-
-	// fsTypes contains all FilesystemTypes that are usable in the
-	// VirtualFilesystem. fsTypes is protected by fsTypesMu.
-	fsTypesMu sync.RWMutex
-	fsTypes   map[string]FilesystemType
 }
 
 // New returns a new VirtualFilesystem with no mounts or FilesystemTypes.
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		fsTypes:     make(map[string]*registeredFilesystemType),
 		filesystems: make(map[*Filesystem]struct{}),
-		fsTypes:     make(map[string]FilesystemType),
 	}
 	vfs.mounts.Init()
 	return vfs
-- 
cgit v1.2.3


From 1f384ac42b9ee8b52000dc2bff79d975853519ed Mon Sep 17 00:00:00 2001
From: Jamie Liu <jamieliu@google.com>
Date: Mon, 30 Dec 2019 11:35:06 -0800
Subject: Add VFS2 support for device special files.

- Add FileDescriptionOptions.UseDentryMetadata, which reduces the amount of
  boilerplate needed for device FDs and the like between filesystems.

- Switch back to having FileDescription.Init() take references on the Mount and
  Dentry; otherwise managing refcounts around failed calls to
  OpenDeviceSpecialFile() / Device.Open() is tricky.

PiperOrigin-RevId: 287575574
---
 pkg/sentry/fsimpl/ext/inode.go                 |   6 --
 pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go |   2 -
 pkg/sentry/fsimpl/kernfs/fd_impl_util.go       |   2 -
 pkg/sentry/fsimpl/memfs/filesystem.go          |   4 -
 pkg/sentry/fsimpl/memfs/named_pipe.go          |   2 -
 pkg/sentry/vfs/BUILD                           |   1 +
 pkg/sentry/vfs/device.go                       | 100 ++++++++++++++++++++++++
 pkg/sentry/vfs/file_description.go             | 101 +++++++++++++++++++++++--
 pkg/sentry/vfs/file_description_impl_util.go   |  15 ++++
 pkg/sentry/vfs/filesystem.go                   |  21 +++++
 pkg/sentry/vfs/vfs.go                          |   6 ++
 11 files changed, 236 insertions(+), 24 deletions(-)
 create mode 100644 pkg/sentry/vfs/device.go

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fsimpl/ext/inode.go b/pkg/sentry/fsimpl/ext/inode.go
index b2cc826c7..8608805bf 100644
--- a/pkg/sentry/fsimpl/ext/inode.go
+++ b/pkg/sentry/fsimpl/ext/inode.go
@@ -157,8 +157,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 	switch in.impl.(type) {
 	case *regularFile:
 		var fd regularFileFD
-		mnt.IncRef()
-		vfsd.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *directory:
@@ -168,8 +166,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		mnt.IncRef()
-		vfsd.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
@@ -178,8 +174,6 @@ func (in *inode) open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*v
 			return nil, syserror.ELOOP
 		}
 		var fd symlinkFD
-		mnt.IncRef()
-		vfsd.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	default:
diff --git a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
index c5fe65722..606ca692d 100644
--- a/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
+++ b/pkg/sentry/fsimpl/kernfs/dynamic_bytes_file.go
@@ -81,8 +81,6 @@ type DynamicBytesFD struct {
 
 // Init initializes a DynamicBytesFD.
 func (fd *DynamicBytesFD) Init(m *vfs.Mount, d *vfs.Dentry, data vfs.DynamicBytesSource, flags uint32) {
-	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	fd.inode = d.Impl().(*Dentry).inode
 	fd.SetDataSource(data)
 	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
diff --git a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
index 77975583b..bcf069b5f 100644
--- a/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
+++ b/pkg/sentry/fsimpl/kernfs/fd_impl_util.go
@@ -44,8 +44,6 @@ type GenericDirectoryFD struct {
 
 // Init initializes a GenericDirectoryFD.
 func (fd *GenericDirectoryFD) Init(m *vfs.Mount, d *vfs.Dentry, children *OrderedChildren, flags uint32) {
-	m.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
-	d.IncRef() // DecRef in vfs.FileDescription.vd.DecRef on final ref.
 	fd.children = children
 	fd.vfsfd.Init(fd, flags, m, d, &vfs.FileDescriptionOptions{})
 }
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
index 4a83f310c..b063e09a3 100644
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ b/pkg/sentry/fsimpl/memfs/filesystem.go
@@ -348,8 +348,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 			}
 			// mnt.EndWrite() is called by regularFileFD.Release().
 		}
-		mnt.IncRef()
-		d.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		if flags&linux.O_TRUNC != 0 {
 			impl.mu.Lock()
@@ -364,8 +362,6 @@ func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32,
 			return nil, syserror.EISDIR
 		}
 		var fd directoryFD
-		mnt.IncRef()
-		d.IncRef()
 		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
 		return &fd.vfsfd, nil
 	case *symlink:
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
index d5060850e..b5a204438 100644
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ b/pkg/sentry/fsimpl/memfs/named_pipe.go
@@ -55,8 +55,6 @@ func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, v
 		return nil, err
 	}
 	mnt := rp.Mount()
-	mnt.IncRef()
-	vfsd.IncRef()
 	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
 	return &fd.vfsfd, nil
 }
diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD
index e3e554b88..4c6aa04a1 100644
--- a/pkg/sentry/vfs/BUILD
+++ b/pkg/sentry/vfs/BUILD
@@ -9,6 +9,7 @@ go_library(
         "context.go",
         "debug.go",
         "dentry.go",
+        "device.go",
         "file_description.go",
         "file_description_impl_util.go",
         "filesystem.go",
diff --git a/pkg/sentry/vfs/device.go b/pkg/sentry/vfs/device.go
new file mode 100644
index 000000000..cb672e36f
--- /dev/null
+++ b/pkg/sentry/vfs/device.go
@@ -0,0 +1,100 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package vfs
+
+import (
+	"fmt"
+
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// DeviceKind indicates whether a device is a block or character device.
+type DeviceKind uint32
+
+const (
+	// BlockDevice indicates a block device.
+	BlockDevice DeviceKind = iota
+
+	// CharDevice indicates a character device.
+	CharDevice
+)
+
+// String implements fmt.Stringer.String.
+func (kind DeviceKind) String() string {
+	switch kind {
+	case BlockDevice:
+		return "block"
+	case CharDevice:
+		return "character"
+	default:
+		return fmt.Sprintf("invalid device kind %d", kind)
+	}
+}
+
+type devTuple struct {
+	kind  DeviceKind
+	major uint32
+	minor uint32
+}
+
+// A Device backs device special files.
+type Device interface {
+	// Open returns a FileDescription representing this device.
+	Open(ctx context.Context, mnt *Mount, d *Dentry, opts OpenOptions) (*FileDescription, error)
+}
+
+type registeredDevice struct {
+	dev  Device
+	opts RegisterDeviceOptions
+}
+
+// RegisterDeviceOptions contains options to
+// VirtualFilesystem.RegisterDevice().
+type RegisterDeviceOptions struct {
+	// GroupName is the name shown for this device registration in
+	// /proc/devices. If GroupName is empty, this registration will not be
+	// shown in /proc/devices.
+	GroupName string
+}
+
+// RegisterDevice registers dev in vfs under (kind, major, minor). It returns an
+// error if that tuple is already registered. opts must not be nil.
+func (vfs *VirtualFilesystem) RegisterDevice(kind DeviceKind, major, minor uint32, dev Device, opts *RegisterDeviceOptions) error {
+	tup := devTuple{kind, major, minor}
+	vfs.devicesMu.Lock()
+	defer vfs.devicesMu.Unlock()
+	if existing, ok := vfs.devices[tup]; ok {
+		return fmt.Errorf("%s device number (%d, %d) is already registered to device type %T", kind, major, minor, existing.dev)
+	}
+	vfs.devices[tup] = &registeredDevice{
+		dev:  dev,
+		opts: *opts,
+	}
+	return nil
+}
+
+// OpenDeviceSpecialFile opens the device registered as (kind, major, minor).
+// It returns ENXIO if no such device is registered. opts must not be nil.
+func (vfs *VirtualFilesystem) OpenDeviceSpecialFile(ctx context.Context, mnt *Mount, d *Dentry, kind DeviceKind, major, minor uint32, opts *OpenOptions) (*FileDescription, error) {
+	tup := devTuple{kind, major, minor}
+	vfs.devicesMu.RLock()
+	defer vfs.devicesMu.RUnlock()
+	rd, ok := vfs.devices[tup]
+	if !ok {
+		return nil, syserror.ENXIO
+	}
+	return rd.dev.Open(ctx, mnt, d, *opts)
+}
diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go
index 0b053201a..6afe280bc 100644
--- a/pkg/sentry/vfs/file_description.go
+++ b/pkg/sentry/vfs/file_description.go
@@ -61,11 +61,25 @@ type FileDescriptionOptions struct {
 	// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
 	// usually only the case if O_DIRECT would actually have an effect.
 	AllowDirectIO bool
+
+	// If UseDentryMetadata is true, calls to FileDescription methods that
+	// interact with file and filesystem metadata (Stat, SetStat, StatFS,
+	// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
+	// the corresponding FilesystemImpl methods instead of the corresponding
+	// FileDescriptionImpl methods.
+	//
+	// UseDentryMetadata is intended for file descriptions that are implemented
+	// outside of individual filesystems, such as pipes, sockets, and device
+	// special files. FileDescriptions for which UseDentryMetadata is true may
+	// embed DentryMetadataFileDescriptionImpl to obtain appropriate
+	// implementations of FileDescriptionImpl methods that should not be
+	// called.
+	UseDentryMetadata bool
 }
 
-// Init must be called before first use of fd. It takes ownership of references
-// on mnt and d held by the caller. statusFlags is the initial file description
-// status flags, which is usually the full set of flags passed to open(2).
+// Init must be called before first use of fd. It takes references on mnt and
+// d. statusFlags is the initial file description status flags, which is
+// usually the full set of flags passed to open(2).
 func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) {
 	fd.refs = 1
 	fd.statusFlags = statusFlags | linux.O_LARGEFILE
@@ -73,6 +87,7 @@ func (fd *FileDescription) Init(impl FileDescriptionImpl, statusFlags uint32, mn
 		mount:  mnt,
 		dentry: d,
 	}
+	fd.vd.IncRef()
 	fd.opts = *opts
 	fd.impl = impl
 }
@@ -140,7 +155,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
 	// sense. However, the check as actually implemented seems to be "O_APPEND
 	// cannot be changed if the file is marked as append-only".
 	if (flags^oldFlags)&linux.O_APPEND != 0 {
-		stat, err := fd.impl.Stat(ctx, StatOptions{
+		stat, err := fd.Stat(ctx, StatOptions{
 			// There is no mask bit for stx_attributes.
 			Mask: 0,
 			// Linux just reads inode::i_flags directly.
@@ -154,7 +169,7 @@ func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Crede
 		}
 	}
 	if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
-		stat, err := fd.impl.Stat(ctx, StatOptions{
+		stat, err := fd.Stat(ctx, StatOptions{
 			Mask: linux.STATX_UID,
 			// Linux's inode_owner_or_capable() just reads inode::i_uid
 			// directly.
@@ -348,17 +363,47 @@ func (fd *FileDescription) OnClose(ctx context.Context) error {
 
 // Stat returns metadata for the file represented by fd.
 func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return stat, err
+	}
 	return fd.impl.Stat(ctx, opts)
 }
 
 // SetStat updates metadata for the file represented by fd.
 func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
 	return fd.impl.SetStat(ctx, opts)
 }
 
 // StatFS returns metadata for the filesystem containing the file represented
 // by fd.
 func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
+		vfsObj.putResolvingPath(rp)
+		return statfs, err
+	}
 	return fd.impl.StatFS(ctx)
 }
 
@@ -417,6 +462,16 @@ func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.
 // Listxattr returns all extended attribute names for the file represented by
 // fd.
 func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp)
+		vfsObj.putResolvingPath(rp)
+		return names, err
+	}
 	names, err := fd.impl.Listxattr(ctx)
 	if err == syserror.ENOTSUP {
 		// Linux doesn't actually return ENOTSUP in this case; instead,
@@ -431,18 +486,48 @@ func (fd *FileDescription) Listxattr(ctx context.Context) ([]string, error) {
 // Getxattr returns the value associated with the given extended attribute for
 // the file represented by fd.
 func (fd *FileDescription) Getxattr(ctx context.Context, name string) (string, error) {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, name)
+		vfsObj.putResolvingPath(rp)
+		return val, err
+	}
 	return fd.impl.Getxattr(ctx, name)
 }
 
 // Setxattr changes the value associated with the given extended attribute for
 // the file represented by fd.
 func (fd *FileDescription) Setxattr(ctx context.Context, opts SetxattrOptions) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, opts)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
 	return fd.impl.Setxattr(ctx, opts)
 }
 
 // Removexattr removes the given extended attribute from the file represented
 // by fd.
 func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
+	if fd.opts.UseDentryMetadata {
+		vfsObj := fd.vd.mount.vfs
+		rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
+			Root:  fd.vd,
+			Start: fd.vd,
+		})
+		err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
+		vfsObj.putResolvingPath(rp)
+		return err
+	}
 	return fd.impl.Removexattr(ctx, name)
 }
 
@@ -464,7 +549,7 @@ func (fd *FileDescription) MappedName(ctx context.Context) string {
 
 // DeviceID implements memmap.MappingIdentity.DeviceID.
 func (fd *FileDescription) DeviceID() uint64 {
-	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+	stat, err := fd.Stat(context.Background(), StatOptions{
 		// There is no STATX_DEV; we assume that Stat will return it if it's
 		// available regardless of mask.
 		Mask: 0,
@@ -480,7 +565,7 @@ func (fd *FileDescription) DeviceID() uint64 {
 
 // InodeID implements memmap.MappingIdentity.InodeID.
 func (fd *FileDescription) InodeID() uint64 {
-	stat, err := fd.impl.Stat(context.Background(), StatOptions{
+	stat, err := fd.Stat(context.Background(), StatOptions{
 		Mask: linux.STATX_INO,
 		// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
 		Sync: linux.AT_STATX_DONT_SYNC,
@@ -493,5 +578,5 @@ func (fd *FileDescription) InodeID() uint64 {
 
 // Msync implements memmap.MappingIdentity.Msync.
 func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
-	return fd.impl.Sync(ctx)
+	return fd.Sync(ctx)
 }
diff --git a/pkg/sentry/vfs/file_description_impl_util.go b/pkg/sentry/vfs/file_description_impl_util.go
index de782e577..66eb57bc2 100644
--- a/pkg/sentry/vfs/file_description_impl_util.go
+++ b/pkg/sentry/vfs/file_description_impl_util.go
@@ -177,6 +177,21 @@ func (DirectoryFileDescriptionDefaultImpl) Write(ctx context.Context, src userme
 	return 0, syserror.EISDIR
 }
 
+// DentryMetadataFileDescriptionImpl may be embedded by FileDescriptionImpls
+// that set FileDescriptionOptions.UseDentryMetadata. Its Stat and SetStat
+// panic, since FileDescription.Stat and SetStat bypass the impl in that case.
+type DentryMetadataFileDescriptionImpl struct{}
+
+// Stat implements FileDescriptionImpl.Stat.
+func (DentryMetadataFileDescriptionImpl) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.Stat")
+}
+
+// SetStat implements FileDescriptionImpl.SetStat.
+func (DentryMetadataFileDescriptionImpl) SetStat(ctx context.Context, opts SetStatOptions) error {
+	panic("illegal call to DentryMetadataFileDescriptionImpl.SetStat")
+}
+
 // DynamicBytesFileDescriptionImpl may be embedded by implementations of
 // FileDescriptionImpl that represent read-only regular files whose contents
 // are backed by a bytes.Buffer that is regenerated when necessary, consistent
diff --git a/pkg/sentry/vfs/filesystem.go b/pkg/sentry/vfs/filesystem.go
index 89bd58864..ea78f555b 100644
--- a/pkg/sentry/vfs/filesystem.go
+++ b/pkg/sentry/vfs/filesystem.go
@@ -418,17 +418,38 @@ type FilesystemImpl interface {
 	UnlinkAt(ctx context.Context, rp *ResolvingPath) error
 
 	// ListxattrAt returns all extended attribute names for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// ListxattrAt returns a nil error rather than ENOTSUP. (See
+	// FileDescription.Listxattr for an explanation.)
 	ListxattrAt(ctx context.Context, rp *ResolvingPath) ([]string, error)
 
 	// GetxattrAt returns the value associated with the given extended
 	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, GetxattrAt
+	// returns ENOTSUP.
 	GetxattrAt(ctx context.Context, rp *ResolvingPath, name string) (string, error)
 
 	// SetxattrAt changes the value associated with the given extended
 	// attribute for the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem, SetxattrAt
+	// returns ENOTSUP.
 	SetxattrAt(ctx context.Context, rp *ResolvingPath, opts SetxattrOptions) error
 
 	// RemovexattrAt removes the given extended attribute from the file at rp.
+	//
+	// Errors:
+	//
+	// - If extended attributes are not supported by the filesystem,
+	// RemovexattrAt returns ENOTSUP.
 	RemovexattrAt(ctx context.Context, rp *ResolvingPath, name string) error
 
 	// PrependPath prepends a path from vd to vd.Mount().Root() to b.
diff --git a/pkg/sentry/vfs/vfs.go b/pkg/sentry/vfs/vfs.go
index a3bdb5805..ea2db7031 100644
--- a/pkg/sentry/vfs/vfs.go
+++ b/pkg/sentry/vfs/vfs.go
@@ -75,6 +75,11 @@ type VirtualFilesystem struct {
 	// mountpoints is analogous to Linux's mountpoint_hashtable.
 	mountpoints map[*Dentry]map[*Mount]struct{}
 
+	// devices contains all registered Devices. devices is protected by
+	// devicesMu.
+	devicesMu sync.RWMutex
+	devices   map[devTuple]*registeredDevice
+
 	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
 	// fsTypesMu.
 	fsTypesMu sync.RWMutex
@@ -90,6 +95,7 @@ type VirtualFilesystem struct {
 func New() *VirtualFilesystem {
 	vfs := &VirtualFilesystem{
 		mountpoints: make(map[*Dentry]map[*Mount]struct{}),
+		devices:     make(map[devTuple]*registeredDevice),
 		fsTypes:     make(map[string]*registeredFilesystemType),
 		filesystems: make(map[*Filesystem]struct{}),
 	}
-- 
cgit v1.2.3


From 51f3ab85e024fcd74c49d273ce5202a207577d31 Mon Sep 17 00:00:00 2001
From: Nicolas Lacasse <nlacasse@google.com>
Date: Mon, 6 Jan 2020 12:51:35 -0800
Subject: Convert memfs into proto-tmpfs.

- Renamed memfs to tmpfs.
- Copied fileRangeSet bits from fs/fsutil/ to fsimpl/tmpfs/.
- Changed tmpfs to be backed by filemem instead of a byte slice.
- regularFileReadWriter uses a sync.Pool, similar to the gofer client.

PiperOrigin-RevId: 288356380
---
 pkg/sentry/fs/fsutil/BUILD                   |   2 +-
 pkg/sentry/fs/fsutil/file_range_set.go       |  14 +-
 pkg/sentry/fsimpl/memfs/BUILD                |  80 ---
 pkg/sentry/fsimpl/memfs/benchmark_test.go    | 487 -------------------
 pkg/sentry/fsimpl/memfs/directory.go         | 187 -------
 pkg/sentry/fsimpl/memfs/filesystem.go        | 698 ---------------------------
 pkg/sentry/fsimpl/memfs/memfs.go             | 293 -----------
 pkg/sentry/fsimpl/memfs/named_pipe.go        |  60 ---
 pkg/sentry/fsimpl/memfs/pipe_test.go         | 235 ---------
 pkg/sentry/fsimpl/memfs/regular_file.go      | 154 ------
 pkg/sentry/fsimpl/memfs/symlink.go           |  36 --
 pkg/sentry/fsimpl/tmpfs/BUILD                |  92 ++++
 pkg/sentry/fsimpl/tmpfs/benchmark_test.go    | 487 +++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/directory.go         | 187 +++++++
 pkg/sentry/fsimpl/tmpfs/filesystem.go        | 698 +++++++++++++++++++++++++++
 pkg/sentry/fsimpl/tmpfs/named_pipe.go        |  60 +++
 pkg/sentry/fsimpl/tmpfs/pipe_test.go         | 235 +++++++++
 pkg/sentry/fsimpl/tmpfs/regular_file.go      | 357 ++++++++++++++
 pkg/sentry/fsimpl/tmpfs/regular_file_test.go | 224 +++++++++
 pkg/sentry/fsimpl/tmpfs/symlink.go           |  36 ++
 pkg/sentry/fsimpl/tmpfs/tmpfs.go             | 299 ++++++++++++
 21 files changed, 2683 insertions(+), 2238 deletions(-)
 delete mode 100644 pkg/sentry/fsimpl/memfs/BUILD
 delete mode 100644 pkg/sentry/fsimpl/memfs/benchmark_test.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/directory.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/filesystem.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/memfs.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/named_pipe.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/pipe_test.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/regular_file.go
 delete mode 100644 pkg/sentry/fsimpl/memfs/symlink.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/BUILD
 create mode 100644 pkg/sentry/fsimpl/tmpfs/benchmark_test.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/directory.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/filesystem.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/named_pipe.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/pipe_test.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/regular_file.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/regular_file_test.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/symlink.go
 create mode 100644 pkg/sentry/fsimpl/tmpfs/tmpfs.go

(limited to 'pkg/sentry/fsimpl/memfs')

diff --git a/pkg/sentry/fs/fsutil/BUILD b/pkg/sentry/fs/fsutil/BUILD
index b2e8d9c77..9ca695a95 100644
--- a/pkg/sentry/fs/fsutil/BUILD
+++ b/pkg/sentry/fs/fsutil/BUILD
@@ -53,7 +53,7 @@ go_template_instance(
         "Key": "uint64",
         "Range": "memmap.MappableRange",
         "Value": "uint64",
-        "Functions": "fileRangeSetFunctions",
+        "Functions": "FileRangeSetFunctions",
     },
 )
 
diff --git a/pkg/sentry/fs/fsutil/file_range_set.go b/pkg/sentry/fs/fsutil/file_range_set.go
index 0a5466b0a..f52d712e3 100644
--- a/pkg/sentry/fs/fsutil/file_range_set.go
+++ b/pkg/sentry/fs/fsutil/file_range_set.go
@@ -34,25 +34,25 @@ import (
 //
 // type FileRangeSet <generated by go_generics>
 
-// fileRangeSetFunctions implements segment.Functions for FileRangeSet.
-type fileRangeSetFunctions struct{}
+// FileRangeSetFunctions implements segment.Functions for FileRangeSet.
+type FileRangeSetFunctions struct{}
 
 // MinKey implements segment.Functions.MinKey.
-func (fileRangeSetFunctions) MinKey() uint64 {
+func (FileRangeSetFunctions) MinKey() uint64 {
 	return 0
 }
 
 // MaxKey implements segment.Functions.MaxKey.
-func (fileRangeSetFunctions) MaxKey() uint64 {
+func (FileRangeSetFunctions) MaxKey() uint64 {
 	return math.MaxUint64
 }
 
 // ClearValue implements segment.Functions.ClearValue.
-func (fileRangeSetFunctions) ClearValue(_ *uint64) {
+func (FileRangeSetFunctions) ClearValue(_ *uint64) {
 }
 
 // Merge implements segment.Functions.Merge.
-func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
+func (FileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _ memmap.MappableRange, frstart2 uint64) (uint64, bool) {
 	if frstart1+mr1.Length() != frstart2 {
 		return 0, false
 	}
@@ -60,7 +60,7 @@ func (fileRangeSetFunctions) Merge(mr1 memmap.MappableRange, frstart1 uint64, _
 }
 
 // Split implements segment.Functions.Split.
-func (fileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
+func (FileRangeSetFunctions) Split(mr memmap.MappableRange, frstart uint64, split uint64) (uint64, uint64) {
 	return frstart, frstart + (split - mr.Start)
 }
 
diff --git a/pkg/sentry/fsimpl/memfs/BUILD b/pkg/sentry/fsimpl/memfs/BUILD
deleted file mode 100644
index 5689bed3b..000000000
--- a/pkg/sentry/fsimpl/memfs/BUILD
+++ /dev/null
@@ -1,80 +0,0 @@
-load("//tools/go_stateify:defs.bzl", "go_library")
-load("@io_bazel_rules_go//go:def.bzl", "go_test")
-
-package(licenses = ["notice"])
-
-load("//tools/go_generics:defs.bzl", "go_template_instance")
-
-go_template_instance(
-    name = "dentry_list",
-    out = "dentry_list.go",
-    package = "memfs",
-    prefix = "dentry",
-    template = "//pkg/ilist:generic_list",
-    types = {
-        "Element": "*dentry",
-        "Linker": "*dentry",
-    },
-)
-
-go_library(
-    name = "memfs",
-    srcs = [
-        "dentry_list.go",
-        "directory.go",
-        "filesystem.go",
-        "memfs.go",
-        "named_pipe.go",
-        "regular_file.go",
-        "symlink.go",
-    ],
-    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs",
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/amutex",
-        "//pkg/fspath",
-        "//pkg/sentry/arch",
-        "//pkg/sentry/context",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/kernel/pipe",
-        "//pkg/sentry/usermem",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "benchmark_test",
-    size = "small",
-    srcs = ["benchmark_test.go"],
-    deps = [
-        ":memfs",
-        "//pkg/abi/linux",
-        "//pkg/fspath",
-        "//pkg/refs",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/fs",
-        "//pkg/sentry/fs/tmpfs",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
-
-go_test(
-    name = "memfs_test",
-    size = "small",
-    srcs = ["pipe_test.go"],
-    embed = [":memfs"],
-    deps = [
-        "//pkg/abi/linux",
-        "//pkg/fspath",
-        "//pkg/sentry/context",
-        "//pkg/sentry/context/contexttest",
-        "//pkg/sentry/kernel/auth",
-        "//pkg/sentry/usermem",
-        "//pkg/sentry/vfs",
-        "//pkg/syserror",
-    ],
-)
diff --git a/pkg/sentry/fsimpl/memfs/benchmark_test.go b/pkg/sentry/fsimpl/memfs/benchmark_test.go
deleted file mode 100644
index a27876a4e..000000000
--- a/pkg/sentry/fsimpl/memfs/benchmark_test.go
+++ /dev/null
@@ -1,487 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package benchmark_test
-
-import (
-	"fmt"
-	"runtime"
-	"strings"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/refs"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
-	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
-	"gvisor.dev/gvisor/pkg/sentry/fsimpl/memfs"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// Differences from stat_benchmark:
-//
-// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
-// not included.
-//
-// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
-// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
-// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
-// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
-// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
-// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
-var depths = []int{1, 2, 3, 8, 64, 100}
-
-const (
-	mountPointName = "tmp"
-	filename       = "gvisor_test_temp_0_1557494568"
-)
-
-// This is copied from syscalls/linux/sys_file.go, with the dependency on
-// kernel.Task stripped out.
-func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
-	var (
-		d   *fs.Dirent // The file.
-		rel *fs.Dirent // The relative directory for search (if required.)
-		err error
-	)
-
-	// Extract the working directory (maybe).
-	if len(path) > 0 && path[0] == '/' {
-		// Absolute path; rel can be nil.
-	} else if dirFD == linux.AT_FDCWD {
-		// Need to reference the working directory.
-		rel = wd
-	} else {
-		// Need to extract the given FD.
-		return syserror.EBADF
-	}
-
-	// Lookup the node.
-	remainingTraversals := uint(linux.MaxSymlinkTraversals)
-	if resolve {
-		d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals)
-	} else {
-		d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals)
-	}
-	if err != nil {
-		return err
-	}
-
-	err = fn(root, d)
-	d.DecRef()
-	return err
-}
-
-func BenchmarkVFS1TmpfsStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-
-			// Create VFS.
-			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
-			if !ok {
-				b.Fatalf("failed to find tmpfs filesystem type")
-			}
-			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			mntns, err := fs.NewMountNamespace(ctx, rootInode)
-			if err != nil {
-				b.Fatalf("failed to create mount namespace: %v", err)
-			}
-			defer mntns.DecRef()
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			root := mntns.Root()
-			defer root.DecRef()
-			d := root
-			d.IncRef()
-			defer d.DecRef()
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				next, err := d.Walk(ctx, root, name)
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				d.DecRef()
-				d = next
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Create the file that will be stat'd.
-			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			file.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			dirPath := false
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
-					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-						return syserror.ENOTDIR
-					}
-					uattr, err := d.Inode.UnstableAttr(ctx)
-					if err != nil {
-						return err
-					}
-					// Sanity check.
-					if uattr.Perms.User.Execute {
-						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
-					}
-					return nil
-				})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func BenchmarkVFS2MemfsStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-			creds := auth.CredentialsFromContext(ctx)
-
-			// Create VFS.
-			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-				AllowUserMount: true,
-			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			defer mntns.DecRef(vfsObj)
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			root := mntns.Root()
-			defer root.DecRef()
-			vd := root
-			vd.IncRef()
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				pop := vfs.PathOperation{
-					Root:  root,
-					Start: vd,
-					Path:  fspath.Parse(name),
-				}
-				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
-					Mode: 0755,
-				}); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				vd.DecRef()
-				vd = nextVD
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Create the file that will be stat'd.
-			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-				Root:               root,
-				Start:              vd,
-				Path:               fspath.Parse(filename),
-				FollowFinalSymlink: true,
-			}, &vfs.OpenOptions{
-				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
-				Mode:  0644,
-			})
-			vd.DecRef()
-			vd = vfs.VirtualDentry{}
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			defer fd.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-					Root:               root,
-					Start:              root,
-					Path:               fspath.Parse(filePath),
-					FollowFinalSymlink: true,
-				}, &vfs.StatOptions{})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-				// Sanity check.
-				if stat.Mode&^linux.S_IFMT != 0644 {
-					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func BenchmarkVFS1TmpfsMountStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-
-			// Create VFS.
-			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
-			if !ok {
-				b.Fatalf("failed to find tmpfs filesystem type")
-			}
-			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			mntns, err := fs.NewMountNamespace(ctx, rootInode)
-			if err != nil {
-				b.Fatalf("failed to create mount namespace: %v", err)
-			}
-			defer mntns.DecRef()
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create and mount the submount.
-			root := mntns.Root()
-			defer root.DecRef()
-			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
-				b.Fatalf("failed to create mount point: %v", err)
-			}
-			mountPoint, err := root.Walk(ctx, root, mountPointName)
-			if err != nil {
-				b.Fatalf("failed to walk to mount point: %v", err)
-			}
-			defer mountPoint.DecRef()
-			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
-			if err != nil {
-				b.Fatalf("failed to create tmpfs submount: %v", err)
-			}
-			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
-				b.Fatalf("failed to mount tmpfs submount: %v", err)
-			}
-			filePathBuilder.WriteString(mountPointName)
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			d, err := root.Walk(ctx, root, mountPointName)
-			if err != nil {
-				b.Fatalf("failed to walk to mount root: %v", err)
-			}
-			defer d.DecRef()
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				next, err := d.Walk(ctx, root, name)
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				d.DecRef()
-				d = next
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Create the file that will be stat'd.
-			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			file.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			dirPath := false
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
-					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
-						return syserror.ENOTDIR
-					}
-					uattr, err := d.Inode.UnstableAttr(ctx)
-					if err != nil {
-						return err
-					}
-					// Sanity check.
-					if uattr.Perms.User.Execute {
-						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
-					}
-					return nil
-				})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func BenchmarkVFS2MemfsMountStat(b *testing.B) {
-	for _, depth := range depths {
-		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
-			ctx := contexttest.Context(b)
-			creds := auth.CredentialsFromContext(ctx)
-
-			// Create VFS.
-			vfsObj := vfs.New()
-			vfsObj.MustRegisterFilesystemType("memfs", memfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-				AllowUserMount: true,
-			})
-			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
-			if err != nil {
-				b.Fatalf("failed to create tmpfs root mount: %v", err)
-			}
-			defer mntns.DecRef(vfsObj)
-
-			var filePathBuilder strings.Builder
-			filePathBuilder.WriteByte('/')
-
-			// Create the mount point.
-			root := mntns.Root()
-			defer root.DecRef()
-			pop := vfs.PathOperation{
-				Root:  root,
-				Start: root,
-				Path:  fspath.Parse(mountPointName),
-			}
-			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
-				Mode: 0755,
-			}); err != nil {
-				b.Fatalf("failed to create mount point: %v", err)
-			}
-			// Save the mount point for later use.
-			mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-			if err != nil {
-				b.Fatalf("failed to walk to mount point: %v", err)
-			}
-			defer mountPoint.DecRef()
-			// Create and mount the submount.
-			if err := vfsObj.MountAt(ctx, creds, "", &pop, "memfs", &vfs.MountOptions{}); err != nil {
-				b.Fatalf("failed to mount tmpfs submount: %v", err)
-			}
-			filePathBuilder.WriteString(mountPointName)
-			filePathBuilder.WriteByte('/')
-
-			// Create nested directories with given depth.
-			vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-			if err != nil {
-				b.Fatalf("failed to walk to mount root: %v", err)
-			}
-			for i := depth; i > 0; i-- {
-				name := fmt.Sprintf("%d", i)
-				pop := vfs.PathOperation{
-					Root:  root,
-					Start: vd,
-					Path:  fspath.Parse(name),
-				}
-				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
-					Mode: 0755,
-				}); err != nil {
-					b.Fatalf("failed to create directory %q: %v", name, err)
-				}
-				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
-				if err != nil {
-					b.Fatalf("failed to walk to directory %q: %v", name, err)
-				}
-				vd.DecRef()
-				vd = nextVD
-				filePathBuilder.WriteString(name)
-				filePathBuilder.WriteByte('/')
-			}
-
-			// Verify that we didn't create any directories under the mount
-			// point (i.e. they were all created on the submount).
-			firstDirName := fmt.Sprintf("%d", depth)
-			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
-				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
-			}
-
-			// Create the file that will be stat'd.
-			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
-				Root:               root,
-				Start:              vd,
-				Path:               fspath.Parse(filename),
-				FollowFinalSymlink: true,
-			}, &vfs.OpenOptions{
-				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
-				Mode:  0644,
-			})
-			vd.DecRef()
-			if err != nil {
-				b.Fatalf("failed to create file %q: %v", filename, err)
-			}
-			fd.DecRef()
-			filePathBuilder.WriteString(filename)
-			filePath := filePathBuilder.String()
-
-			runtime.GC()
-			b.ResetTimer()
-			for i := 0; i < b.N; i++ {
-				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-					Root:               root,
-					Start:              root,
-					Path:               fspath.Parse(filePath),
-					FollowFinalSymlink: true,
-				}, &vfs.StatOptions{})
-				if err != nil {
-					b.Fatalf("stat(%q) failed: %v", filePath, err)
-				}
-				// Sanity check.
-				if stat.Mode&^linux.S_IFMT != 0644 {
-					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
-				}
-			}
-			// Don't include deferred cleanup in benchmark time.
-			b.StopTimer()
-		})
-	}
-}
-
-func init() {
-	// Turn off reference leak checking for a fair comparison between vfs1 and
-	// vfs2.
-	refs.SetLeakMode(refs.NoLeakChecking)
-}
diff --git a/pkg/sentry/fsimpl/memfs/directory.go b/pkg/sentry/fsimpl/memfs/directory.go
deleted file mode 100644
index 0bd82e480..000000000
--- a/pkg/sentry/fsimpl/memfs/directory.go
+++ /dev/null
@@ -1,187 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-type directory struct {
-	inode inode
-
-	// childList is a list containing (1) child Dentries and (2) fake Dentries
-	// (with inode == nil) that represent the iteration position of
-	// directoryFDs. childList is used to support directoryFD.IterDirents()
-	// efficiently. childList is protected by filesystem.mu.
-	childList dentryList
-}
-
-func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode {
-	dir := &directory{}
-	dir.inode.init(dir, fs, creds, mode)
-	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
-	return &dir.inode
-}
-
-func (i *inode) isDir() bool {
-	_, ok := i.impl.(*directory)
-	return ok
-}
-
-type directoryFD struct {
-	fileDescription
-	vfs.DirectoryFileDescriptionDefaultImpl
-
-	// Protected by filesystem.mu.
-	iter *dentry
-	off  int64
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *directoryFD) Release() {
-	if fd.iter != nil {
-		fs := fd.filesystem()
-		dir := fd.inode().impl.(*directory)
-		fs.mu.Lock()
-		dir.childList.Remove(fd.iter)
-		fs.mu.Unlock()
-		fd.iter = nil
-	}
-}
-
-// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
-func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
-	fs := fd.filesystem()
-	vfsd := fd.vfsfd.VirtualDentry().Dentry()
-
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-
-	if fd.off == 0 {
-		if !cb.Handle(vfs.Dirent{
-			Name:    ".",
-			Type:    linux.DT_DIR,
-			Ino:     vfsd.Impl().(*dentry).inode.ino,
-			NextOff: 1,
-		}) {
-			return nil
-		}
-		fd.off++
-	}
-	if fd.off == 1 {
-		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
-		if !cb.Handle(vfs.Dirent{
-			Name:    "..",
-			Type:    parentInode.direntType(),
-			Ino:     parentInode.ino,
-			NextOff: 2,
-		}) {
-			return nil
-		}
-		fd.off++
-	}
-
-	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
-	var child *dentry
-	if fd.iter == nil {
-		// Start iteration at the beginning of dir.
-		child = dir.childList.Front()
-		fd.iter = &dentry{}
-	} else {
-		// Continue iteration from where we left off.
-		child = fd.iter.Next()
-		dir.childList.Remove(fd.iter)
-	}
-	for child != nil {
-		// Skip other directoryFD iterators.
-		if child.inode != nil {
-			if !cb.Handle(vfs.Dirent{
-				Name:    child.vfsd.Name(),
-				Type:    child.inode.direntType(),
-				Ino:     child.inode.ino,
-				NextOff: fd.off + 1,
-			}) {
-				dir.childList.InsertBefore(child, fd.iter)
-				return nil
-			}
-			fd.off++
-		}
-		child = child.Next()
-	}
-	dir.childList.PushBack(fd.iter)
-	return nil
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fs := fd.filesystem()
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-
-	switch whence {
-	case linux.SEEK_SET:
-		// Use offset as given.
-	case linux.SEEK_CUR:
-		offset += fd.off
-	default:
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-
-	// If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
-	// seek even if doing so might reposition the iterator due to concurrent
-	// mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
-	if fd.off == offset {
-		return offset, nil
-	}
-
-	fd.off = offset
-	// Compensate for "." and "..".
-	remChildren := int64(0)
-	if offset >= 2 {
-		remChildren = offset - 2
-	}
-
-	dir := fd.inode().impl.(*directory)
-
-	// Ensure that fd.iter exists and is not linked into dir.childList.
-	if fd.iter == nil {
-		fd.iter = &dentry{}
-	} else {
-		dir.childList.Remove(fd.iter)
-	}
-	// Insert fd.iter before the remChildren'th child, or at the end of the
-	// list if remChildren >= number of children.
-	child := dir.childList.Front()
-	for child != nil {
-		// Skip other directoryFD iterators.
-		if child.inode != nil {
-			if remChildren == 0 {
-				dir.childList.InsertBefore(child, fd.iter)
-				return offset, nil
-			}
-			remChildren--
-		}
-		child = child.Next()
-	}
-	dir.childList.PushBack(fd.iter)
-	return offset, nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/filesystem.go b/pkg/sentry/fsimpl/memfs/filesystem.go
deleted file mode 100644
index b063e09a3..000000000
--- a/pkg/sentry/fsimpl/memfs/filesystem.go
+++ /dev/null
@@ -1,698 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"fmt"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// Sync implements vfs.FilesystemImpl.Sync.
-func (fs *filesystem) Sync(ctx context.Context) error {
-	// All filesystem state is in-memory.
-	return nil
-}
-
-// stepLocked resolves rp.Component() to an existing file, starting from the
-// given directory.
-//
-// stepLocked is loosely analogous to fs/namei.c:walk_component().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
-	if !d.inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, err
-	}
-afterSymlink:
-	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
-	if err != nil {
-		return nil, err
-	}
-	if nextVFSD == nil {
-		// Since the Dentry tree is the sole source of truth for memfs, if it's
-		// not in the Dentry tree, it doesn't exist.
-		return nil, syserror.ENOENT
-	}
-	next := nextVFSD.Impl().(*dentry)
-	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
-		// TODO: symlink traversals update access time
-		if err := rp.HandleSymlink(symlink.target); err != nil {
-			return nil, err
-		}
-		goto afterSymlink // don't check the current directory again
-	}
-	rp.Advance()
-	return next, nil
-}
-
-// walkParentDirLocked resolves all but the last path component of rp to an
-// existing directory, starting from the given directory (which is usually
-// rp.Start().Impl().(*dentry)). It does not check that the returned directory
-// is searchable by the provider of rp.
-//
-// walkParentDirLocked is loosely analogous to Linux's
-// fs/namei.c:path_parentat().
-//
-// Preconditions: filesystem.mu must be locked. !rp.Done().
-func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
-	for !rp.Final() {
-		next, err := stepLocked(rp, d)
-		if err != nil {
-			return nil, err
-		}
-		d = next
-	}
-	if !d.inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	return d, nil
-}
-
-// resolveLocked resolves rp to an existing file.
-//
-// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
-//
-// Preconditions: filesystem.mu must be locked.
-func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
-	d := rp.Start().Impl().(*dentry)
-	for !rp.Done() {
-		next, err := stepLocked(rp, d)
-		if err != nil {
-			return nil, err
-		}
-		d = next
-	}
-	if rp.MustBeDir() && !d.inode.isDir() {
-		return nil, syserror.ENOTDIR
-	}
-	return d, nil
-}
-
-// doCreateAt checks that creating a file at rp is permitted, then invokes
-// create to do so.
-//
-// doCreateAt is loosely analogous to a conjunction of Linux's
-// fs/namei.c:filename_create() and done_path_create().
-//
-// Preconditions: !rp.Done(). For the final path component in rp,
-// !rp.ShouldFollowSymlink().
-func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	name := rp.Component()
-	if name == "." || name == ".." {
-		return syserror.EEXIST
-	}
-	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the child exists we want to return EEXIST immediately instead
-	// of attempting symlink/mount traversal.
-	if parent.vfsd.Child(name) != nil {
-		return syserror.EEXIST
-	}
-	if !dir && rp.MustBeDir() {
-		return syserror.ENOENT
-	}
-	// In memfs, the only way to cause a dentry to be disowned is by removing
-	// it from the filesystem, so this check is equivalent to checking if
-	// parent has been removed.
-	if parent.vfsd.IsDisowned() {
-		return syserror.ENOENT
-	}
-	mnt := rp.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	return create(parent, name)
-}
-
-// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
-func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := resolveLocked(rp)
-	if err != nil {
-		return nil, err
-	}
-	if opts.CheckSearchable {
-		if !d.inode.isDir() {
-			return nil, syserror.ENOTDIR
-		}
-		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
-			return nil, err
-		}
-	}
-	d.IncRef()
-	return &d.vfsd, nil
-}
-
-// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
-func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return nil, err
-	}
-	d.IncRef()
-	return &d.vfsd, nil
-}
-
-// LinkAt implements vfs.FilesystemImpl.LinkAt.
-func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
-		if rp.Mount() != vd.Mount() {
-			return syserror.EXDEV
-		}
-		d := vd.Dentry().Impl().(*dentry)
-		if d.inode.isDir() {
-			return syserror.EPERM
-		}
-		if d.inode.nlink == 0 {
-			return syserror.ENOENT
-		}
-		if d.inode.nlink == maxLinks {
-			return syserror.EMLINK
-		}
-		d.inode.incLinksLocked()
-		child := fs.newDentry(d.inode)
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return nil
-	})
-}
-
-// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
-func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
-	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
-		if parent.inode.nlink == maxLinks {
-			return syserror.EMLINK
-		}
-		parent.inode.incLinksLocked() // from child's ".."
-		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return nil
-	})
-}
-
-// MknodAt implements vfs.FilesystemImpl.MknodAt.
-func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
-		switch opts.Mode.FileType() {
-		case 0, linux.S_IFREG:
-			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
-			parent.vfsd.InsertChild(&child.vfsd, name)
-			parent.inode.impl.(*directory).childList.PushBack(child)
-			return nil
-		case linux.S_IFIFO:
-			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
-			parent.vfsd.InsertChild(&child.vfsd, name)
-			parent.inode.impl.(*directory).childList.PushBack(child)
-			return nil
-		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
-			// Not yet supported.
-			return syserror.EPERM
-		default:
-			return syserror.EINVAL
-		}
-	})
-}
-
-// OpenAt implements vfs.FilesystemImpl.OpenAt.
-func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
-	if opts.Flags&linux.O_TMPFILE != 0 {
-		// Not yet supported.
-		return nil, syserror.EOPNOTSUPP
-	}
-
-	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
-	// don't need fs.mu for writing.
-	if opts.Flags&linux.O_CREAT == 0 {
-		fs.mu.RLock()
-		defer fs.mu.RUnlock()
-		d, err := resolveLocked(rp)
-		if err != nil {
-			return nil, err
-		}
-		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
-	}
-
-	mustCreate := opts.Flags&linux.O_EXCL != 0
-	start := rp.Start().Impl().(*dentry)
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	if rp.Done() {
-		// Reject attempts to open directories with O_CREAT.
-		if rp.MustBeDir() {
-			return nil, syserror.EISDIR
-		}
-		if mustCreate {
-			return nil, syserror.EEXIST
-		}
-		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
-	}
-afterTrailingSymlink:
-	parent, err := walkParentDirLocked(rp, start)
-	if err != nil {
-		return nil, err
-	}
-	// Check for search permission in the parent directory.
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil {
-		return nil, err
-	}
-	// Reject attempts to open directories with O_CREAT.
-	if rp.MustBeDir() {
-		return nil, syserror.EISDIR
-	}
-	name := rp.Component()
-	if name == "." || name == ".." {
-		return nil, syserror.EISDIR
-	}
-	// Determine whether or not we need to create a file.
-	child, err := stepLocked(rp, parent)
-	if err == syserror.ENOENT {
-		// Already checked for searchability above; now check for writability.
-		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil {
-			return nil, err
-		}
-		if err := rp.Mount().CheckBeginWrite(); err != nil {
-			return nil, err
-		}
-		defer rp.Mount().EndWrite()
-		// Create and open the child.
-		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return child.open(ctx, rp, opts.Flags, true)
-	}
-	if err != nil {
-		return nil, err
-	}
-	// Do we need to resolve a trailing symlink?
-	if !rp.Done() {
-		start = parent
-		goto afterTrailingSymlink
-	}
-	// Open existing file.
-	if mustCreate {
-		return nil, syserror.EEXIST
-	}
-	return child.open(ctx, rp, opts.Flags, false)
-}
-
-func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) {
-	ats := vfs.AccessTypesForOpenFlags(flags)
-	if !afterCreate {
-		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
-			return nil, err
-		}
-	}
-	mnt := rp.Mount()
-	switch impl := d.inode.impl.(type) {
-	case *regularFile:
-		var fd regularFileFD
-		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
-		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
-		if fd.writable {
-			if err := mnt.CheckBeginWrite(); err != nil {
-				return nil, err
-			}
-			// mnt.EndWrite() is called by regularFileFD.Release().
-		}
-		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
-		if flags&linux.O_TRUNC != 0 {
-			impl.mu.Lock()
-			impl.data = impl.data[:0]
-			atomic.StoreInt64(&impl.dataLen, 0)
-			impl.mu.Unlock()
-		}
-		return &fd.vfsfd, nil
-	case *directory:
-		// Can't open directories writably.
-		if ats&vfs.MayWrite != 0 {
-			return nil, syserror.EISDIR
-		}
-		var fd directoryFD
-		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
-		return &fd.vfsfd, nil
-	case *symlink:
-		// Can't open symlinks without O_PATH (which is unimplemented).
-		return nil, syserror.ELOOP
-	case *namedPipe:
-		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
-	}
-}
-
-// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
-func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := resolveLocked(rp)
-	if err != nil {
-		return "", err
-	}
-	symlink, ok := d.inode.impl.(*symlink)
-	if !ok {
-		return "", syserror.EINVAL
-	}
-	return symlink.target, nil
-}
-
-// RenameAt implements vfs.FilesystemImpl.RenameAt.
-func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
-	if opts.Flags != 0 {
-		// TODO(b/145974740): Support renameat2 flags.
-		return syserror.EINVAL
-	}
-
-	// Resolve newParent first to verify that it's on this Mount.
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	newName := rp.Component()
-	if newName == "." || newName == ".." {
-		return syserror.EBUSY
-	}
-	mnt := rp.Mount()
-	if mnt != oldParentVD.Mount() {
-		return syserror.EXDEV
-	}
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-
-	oldParent := oldParentVD.Dentry().Impl().(*dentry)
-	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
-	// because if the existing child is a symlink or mount point then we want
-	// to rename over it rather than follow it.
-	renamedVFSD := oldParent.vfsd.Child(oldName)
-	if renamedVFSD == nil {
-		return syserror.ENOENT
-	}
-	renamed := renamedVFSD.Impl().(*dentry)
-	if renamed.inode.isDir() {
-		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
-			return syserror.EINVAL
-		}
-		if oldParent != newParent {
-			// Writability is needed to change renamed's "..".
-			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
-				return err
-			}
-		}
-	} else {
-		if opts.MustBeDir || rp.MustBeDir() {
-			return syserror.ENOTDIR
-		}
-	}
-
-	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	replacedVFSD := newParent.vfsd.Child(newName)
-	var replaced *dentry
-	if replacedVFSD != nil {
-		replaced = replacedVFSD.Impl().(*dentry)
-		if replaced.inode.isDir() {
-			if !renamed.inode.isDir() {
-				return syserror.EISDIR
-			}
-			if replaced.vfsd.HasChildren() {
-				return syserror.ENOTEMPTY
-			}
-		} else {
-			if rp.MustBeDir() {
-				return syserror.ENOTDIR
-			}
-			if renamed.inode.isDir() {
-				return syserror.ENOTDIR
-			}
-		}
-	} else {
-		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
-			return syserror.EMLINK
-		}
-	}
-	if newParent.vfsd.IsDisowned() {
-		return syserror.ENOENT
-	}
-
-	// Linux places this check before some of those above; we do it here for
-	// simplicity, under the assumption that applications are not intentionally
-	// doing noop renames expecting them to succeed where non-noop renames
-	// would fail.
-	if renamedVFSD == replacedVFSD {
-		return nil
-	}
-	vfsObj := rp.VirtualFilesystem()
-	oldParentDir := oldParent.inode.impl.(*directory)
-	newParentDir := newParent.inode.impl.(*directory)
-	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
-		return err
-	}
-	if replaced != nil {
-		newParentDir.childList.Remove(replaced)
-		if replaced.inode.isDir() {
-			newParent.inode.decLinksLocked() // from replaced's ".."
-		}
-		replaced.inode.decLinksLocked()
-	}
-	oldParentDir.childList.Remove(renamed)
-	newParentDir.childList.PushBack(renamed)
-	if renamed.inode.isDir() {
-		oldParent.inode.decLinksLocked()
-		newParent.inode.incLinksLocked()
-	}
-	// TODO: update timestamps and parent directory sizes
-	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
-	return nil
-}
-
-// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
-func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	name := rp.Component()
-	if name == "." {
-		return syserror.EINVAL
-	}
-	if name == ".." {
-		return syserror.ENOTEMPTY
-	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
-		return syserror.ENOENT
-	}
-	child := childVFSD.Impl().(*dentry)
-	if !child.inode.isDir() {
-		return syserror.ENOTDIR
-	}
-	if childVFSD.HasChildren() {
-		return syserror.ENOTEMPTY
-	}
-	mnt := rp.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
-		return err
-	}
-	parent.inode.impl.(*directory).childList.Remove(child)
-	parent.inode.decLinksLocked() // from child's ".."
-	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
-	return nil
-}
-
-// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
-func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return err
-	}
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
-}
-
-// StatAt implements vfs.FilesystemImpl.StatAt.
-func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	d, err := resolveLocked(rp)
-	if err != nil {
-		return linux.Statx{}, err
-	}
-	var stat linux.Statx
-	d.inode.statTo(&stat)
-	return stat, nil
-}
-
-// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
-func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return linux.Statfs{}, err
-	}
-	// TODO: actually implement statfs
-	return linux.Statfs{}, syserror.ENOSYS
-}
-
-// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
-func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
-	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
-		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
-		parent.vfsd.InsertChild(&child.vfsd, name)
-		parent.inode.impl.(*directory).childList.PushBack(child)
-		return nil
-	})
-}
-
-// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
-func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
-	fs.mu.Lock()
-	defer fs.mu.Unlock()
-	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
-	if err != nil {
-		return err
-	}
-	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
-		return err
-	}
-	name := rp.Component()
-	if name == "." || name == ".." {
-		return syserror.EISDIR
-	}
-	childVFSD := parent.vfsd.Child(name)
-	if childVFSD == nil {
-		return syserror.ENOENT
-	}
-	child := childVFSD.Impl().(*dentry)
-	if child.inode.isDir() {
-		return syserror.EISDIR
-	}
-	if !rp.MustBeDir() {
-		return syserror.ENOTDIR
-	}
-	mnt := rp.Mount()
-	if err := mnt.CheckBeginWrite(); err != nil {
-		return err
-	}
-	defer mnt.EndWrite()
-	vfsObj := rp.VirtualFilesystem()
-	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
-		return err
-	}
-	parent.inode.impl.(*directory).childList.Remove(child)
-	child.inode.decLinksLocked()
-	vfsObj.CommitDeleteDentry(childVFSD)
-	return nil
-}
-
-// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
-func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return nil, err
-	}
-	// TODO(b/127675828): support extended attributes
-	return nil, syserror.ENOTSUP
-}
-
-// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
-func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return "", err
-	}
-	// TODO(b/127675828): support extended attributes
-	return "", syserror.ENOTSUP
-}
-
-// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
-func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return err
-	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
-}
-
-// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
-func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	_, err := resolveLocked(rp)
-	if err != nil {
-		return err
-	}
-	// TODO(b/127675828): support extended attributes
-	return syserror.ENOTSUP
-}
-
-// PrependPath implements vfs.FilesystemImpl.PrependPath.
-func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
-	fs.mu.RLock()
-	defer fs.mu.RUnlock()
-	return vfs.GenericPrependPath(vfsroot, vd, b)
-}
diff --git a/pkg/sentry/fsimpl/memfs/memfs.go b/pkg/sentry/fsimpl/memfs/memfs.go
deleted file mode 100644
index 8d0167c93..000000000
--- a/pkg/sentry/fsimpl/memfs/memfs.go
+++ /dev/null
@@ -1,293 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Package memfs provides a filesystem implementation that behaves like tmpfs:
-// the Dentry tree is the sole source of truth for the state of the filesystem.
-//
-// memfs is intended primarily to demonstrate filesystem implementation
-// patterns. Real uses cases for an in-memory filesystem should use tmpfs
-// instead.
-//
-// Lock order:
-//
-// filesystem.mu
-//   regularFileFD.offMu
-//     regularFile.mu
-//   inode.mu
-package memfs
-
-import (
-	"fmt"
-	"math"
-	"sync"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-// FilesystemType implements vfs.FilesystemType.
-type FilesystemType struct{}
-
-// filesystem implements vfs.FilesystemImpl.
-type filesystem struct {
-	vfsfs vfs.Filesystem
-
-	// mu serializes changes to the Dentry tree.
-	mu sync.RWMutex
-
-	nextInoMinusOne uint64 // accessed using atomic memory operations
-}
-
-// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
-func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
-	var fs filesystem
-	fs.vfsfs.Init(vfsObj, &fs)
-	root := fs.newDentry(fs.newDirectory(creds, 01777))
-	return &fs.vfsfs, &root.vfsd, nil
-}
-
-// Release implements vfs.FilesystemImpl.Release.
-func (fs *filesystem) Release() {
-}
-
-// dentry implements vfs.DentryImpl.
-type dentry struct {
-	vfsd vfs.Dentry
-
-	// inode is the inode represented by this dentry. Multiple Dentries may
-	// share a single non-directory inode (with hard links). inode is
-	// immutable.
-	inode *inode
-
-	// memfs doesn't count references on dentries; because the dentry tree is
-	// the sole source of truth, it is by definition always consistent with the
-	// state of the filesystem. However, it does count references on inodes,
-	// because inode resources are released when all references are dropped.
-	// (memfs doesn't really have resources to release, but we implement
-	// reference counting because tmpfs regular files will.)
-
-	// dentryEntry (ugh) links dentries into their parent directory.childList.
-	dentryEntry
-}
-
-func (fs *filesystem) newDentry(inode *inode) *dentry {
-	d := &dentry{
-		inode: inode,
-	}
-	d.vfsd.Init(d)
-	return d
-}
-
-// IncRef implements vfs.DentryImpl.IncRef.
-func (d *dentry) IncRef() {
-	d.inode.incRef()
-}
-
-// TryIncRef implements vfs.DentryImpl.TryIncRef.
-func (d *dentry) TryIncRef() bool {
-	return d.inode.tryIncRef()
-}
-
-// DecRef implements vfs.DentryImpl.DecRef.
-func (d *dentry) DecRef() {
-	d.inode.decRef()
-}
-
-// inode represents a filesystem object.
-type inode struct {
-	// refs is a reference count. refs is accessed using atomic memory
-	// operations.
-	//
-	// A reference is held on all inodes that are reachable in the filesystem
-	// tree. For non-directories (which may have multiple hard links), this
-	// means that a reference is dropped when nlink reaches 0. For directories,
-	// nlink never reaches 0 due to the "." entry; instead,
-	// filesystem.RmdirAt() drops the reference.
-	refs int64
-
-	// Inode metadata; protected by mu and accessed using atomic memory
-	// operations unless otherwise specified.
-	mu    sync.RWMutex
-	mode  uint32 // excluding file type bits, which are based on impl
-	nlink uint32 // protected by filesystem.mu instead of inode.mu
-	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
-	gid   uint32 // auth.KGID, but ...
-	ino   uint64 // immutable
-
-	impl interface{} // immutable
-}
-
-const maxLinks = math.MaxUint32
-
-func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
-	i.refs = 1
-	i.mode = uint32(mode)
-	i.uid = uint32(creds.EffectiveKUID)
-	i.gid = uint32(creds.EffectiveKGID)
-	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
-	// i.nlink initialized by caller
-	i.impl = impl
-}
-
-// incLinksLocked increments i's link count.
-//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-// i.nlink < maxLinks.
-func (i *inode) incLinksLocked() {
-	if i.nlink == 0 {
-		panic("memfs.inode.incLinksLocked() called with no existing links")
-	}
-	if i.nlink == maxLinks {
-		panic("memfs.inode.incLinksLocked() called with maximum link count")
-	}
-	atomic.AddUint32(&i.nlink, 1)
-}
-
-// decLinksLocked decrements i's link count.
-//
-// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
-func (i *inode) decLinksLocked() {
-	if i.nlink == 0 {
-		panic("memfs.inode.decLinksLocked() called with no existing links")
-	}
-	atomic.AddUint32(&i.nlink, ^uint32(0))
-}
-
-func (i *inode) incRef() {
-	if atomic.AddInt64(&i.refs, 1) <= 1 {
-		panic("memfs.inode.incRef() called without holding a reference")
-	}
-}
-
-func (i *inode) tryIncRef() bool {
-	for {
-		refs := atomic.LoadInt64(&i.refs)
-		if refs == 0 {
-			return false
-		}
-		if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
-			return true
-		}
-	}
-}
-
-func (i *inode) decRef() {
-	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
-		// This is unnecessary; it's mostly to simulate what tmpfs would do.
-		if regfile, ok := i.impl.(*regularFile); ok {
-			regfile.mu.Lock()
-			regfile.data = nil
-			atomic.StoreInt64(&regfile.dataLen, 0)
-			regfile.mu.Unlock()
-		}
-	} else if refs < 0 {
-		panic("memfs.inode.decRef() called without holding a reference")
-	}
-}
-
-func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
-	return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
-}
-
-// Go won't inline this function, and returning linux.Statx (which is quite
-// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
-// output parameter.
-func (i *inode) statTo(stat *linux.Statx) {
-	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
-	stat.Blksize = 1 // usermem.PageSize in tmpfs
-	stat.Nlink = atomic.LoadUint32(&i.nlink)
-	stat.UID = atomic.LoadUint32(&i.uid)
-	stat.GID = atomic.LoadUint32(&i.gid)
-	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
-	stat.Ino = i.ino
-	// TODO: device number
-	switch impl := i.impl.(type) {
-	case *regularFile:
-		stat.Mode |= linux.S_IFREG
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
-		stat.Size = uint64(atomic.LoadInt64(&impl.dataLen))
-		// In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in
-		// a uint64 accessed using atomic memory operations to avoid taking
-		// locks).
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *directory:
-		stat.Mode |= linux.S_IFDIR
-	case *symlink:
-		stat.Mode |= linux.S_IFLNK
-		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
-		stat.Size = uint64(len(impl.target))
-		stat.Blocks = allocatedBlocksForSize(stat.Size)
-	case *namedPipe:
-		stat.Mode |= linux.S_IFIFO
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// allocatedBlocksForSize returns the number of 512B blocks needed to
-// accommodate the given size in bytes, as appropriate for struct
-// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
-// size is independent of the "preferred block size for I/O", struct
-// stat::st_blksize and struct statx::stx_blksize.)
-func allocatedBlocksForSize(size uint64) uint64 {
-	return (size + 511) / 512
-}
-
-func (i *inode) direntType() uint8 {
-	switch i.impl.(type) {
-	case *regularFile:
-		return linux.DT_REG
-	case *directory:
-		return linux.DT_DIR
-	case *symlink:
-		return linux.DT_LNK
-	default:
-		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
-	}
-}
-
-// fileDescription is embedded by memfs implementations of
-// vfs.FileDescriptionImpl.
-type fileDescription struct {
-	vfsfd vfs.FileDescription
-	vfs.FileDescriptionDefaultImpl
-}
-
-func (fd *fileDescription) filesystem() *filesystem {
-	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
-}
-
-func (fd *fileDescription) inode() *inode {
-	return fd.vfsfd.Dentry().Impl().(*dentry).inode
-}
-
-// Stat implements vfs.FileDescriptionImpl.Stat.
-func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
-	var stat linux.Statx
-	fd.inode().statTo(&stat)
-	return stat, nil
-}
-
-// SetStat implements vfs.FileDescriptionImpl.SetStat.
-func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
-	if opts.Stat.Mask == 0 {
-		return nil
-	}
-	// TODO: implement inode.setStat
-	return syserror.EPERM
-}
diff --git a/pkg/sentry/fsimpl/memfs/named_pipe.go b/pkg/sentry/fsimpl/memfs/named_pipe.go
deleted file mode 100644
index b5a204438..000000000
--- a/pkg/sentry/fsimpl/memfs/named_pipe.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-)
-
-type namedPipe struct {
-	inode inode
-
-	pipe *pipe.VFSPipe
-}
-
-// Preconditions:
-//   * fs.mu must be locked.
-//   * rp.Mount().CheckBeginWrite() has been called successfully.
-func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
-	file.inode.init(file, fs, creds, mode)
-	file.inode.nlink = 1 // Only the parent has a link.
-	return &file.inode
-}
-
-// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
-// entirely via struct embedding.
-type namedPipeFD struct {
-	fileDescription
-
-	*pipe.VFSPipeFD
-}
-
-func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
-	var err error
-	var fd namedPipeFD
-	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
-	if err != nil {
-		return nil, err
-	}
-	mnt := rp.Mount()
-	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
-	return &fd.vfsfd, nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/pipe_test.go b/pkg/sentry/fsimpl/memfs/pipe_test.go
deleted file mode 100644
index 807c1af7a..000000000
--- a/pkg/sentry/fsimpl/memfs/pipe_test.go
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"bytes"
-	"testing"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/fspath"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-const fileName = "mypipe"
-
-func TestSeparateFDs(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the read side. This is done in a concurrently because opening
-	// One end the pipe blocks until the other end is opened.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	rfdchan := make(chan *vfs.FileDescription)
-	go func() {
-		openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY}
-		rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-		rfdchan <- rfd
-	}()
-
-	// Open the write side.
-	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY}
-	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
-	}
-	defer wfd.DecRef()
-
-	rfd, ok := <-rfdchan
-	if !ok {
-		t.Fatalf("failed to open pipe for reading %q", fileName)
-	}
-	defer rfd.DecRef()
-
-	const msg = "vamos azul"
-	checkEmpty(ctx, t, rfd)
-	checkWrite(ctx, t, wfd, msg)
-	checkRead(ctx, t, rfd, msg)
-}
-
-func TestNonblockingRead(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the read side as nonblocking.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
-	rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
-	}
-	defer rfd.DecRef()
-
-	// Open the write side.
-	openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
-	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
-	}
-	defer wfd.DecRef()
-
-	const msg = "geh blau"
-	checkEmpty(ctx, t, rfd)
-	checkWrite(ctx, t, wfd, msg)
-	checkRead(ctx, t, rfd, msg)
-}
-
-func TestNonblockingWriteError(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the write side as nonblocking, which should return ENXIO.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
-	_, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != syserror.ENXIO {
-		t.Fatalf("expected ENXIO, but got error: %v", err)
-	}
-}
-
-func TestSingleFD(t *testing.T) {
-	ctx, creds, vfsObj, root := setup(t)
-	defer root.DecRef()
-
-	// Open the pipe as readable and writable.
-	pop := vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}
-	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
-	fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
-	if err != nil {
-		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
-	}
-	defer fd.DecRef()
-
-	const msg = "forza blu"
-	checkEmpty(ctx, t, fd)
-	checkWrite(ctx, t, fd, msg)
-	checkRead(ctx, t, fd, msg)
-}
-
-// setup creates a VFS with a pipe in the root directory at path fileName. The
-// returned VirtualDentry must be DecRef()'d be the caller. It calls t.Fatal
-// upon failure.
-func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) {
-	ctx := contexttest.Context(t)
-	creds := auth.CredentialsFromContext(ctx)
-
-	// Create VFS.
-	vfsObj := vfs.New()
-	vfsObj.MustRegisterFilesystemType("memfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
-		AllowUserMount: true,
-	})
-	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "memfs", &vfs.GetFilesystemOptions{})
-	if err != nil {
-		t.Fatalf("failed to create tmpfs root mount: %v", err)
-	}
-
-	// Create the pipe.
-	root := mntns.Root()
-	pop := vfs.PathOperation{
-		Root:  root,
-		Start: root,
-		Path:  fspath.Parse(fileName),
-	}
-	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
-	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
-		t.Fatalf("failed to create file %q: %v", fileName, err)
-	}
-
-	// Sanity check: the file pipe exists and has the correct mode.
-	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
-		Root:               root,
-		Start:              root,
-		Path:               fspath.Parse(fileName),
-		FollowFinalSymlink: true,
-	}, &vfs.StatOptions{})
-	if err != nil {
-		t.Fatalf("stat(%q) failed: %v", fileName, err)
-	}
-	if stat.Mode&^linux.S_IFMT != 0644 {
-		t.Errorf("got wrong permissions (%0o)", stat.Mode)
-	}
-	if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe {
-		t.Errorf("got wrong file type (%0o)", stat.Mode)
-	}
-
-	return ctx, creds, vfsObj, root
-}
-
-// checkEmpty calls t.Fatal if the pipe in fd is not empty.
-func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
-	readData := make([]byte, 1)
-	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
-	if err != syserror.ErrWouldBlock {
-		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
-	}
-	if bytesRead != 0 {
-		t.Fatalf("expected to read 0 bytes, but got %d", bytesRead)
-	}
-}
-
-// checkWrite calls t.Fatal if it fails to write all of msg to fd.
-func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
-	writeData := []byte(msg)
-	src := usermem.BytesIOSequence(writeData)
-	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
-	if err != nil {
-		t.Fatalf("error writing to pipe %q: %v", fileName, err)
-	}
-	if bytesWritten != int64(len(writeData)) {
-		t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten)
-	}
-}
-
-// checkRead calls t.Fatal if it fails to read msg from fd.
-func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
-	readData := make([]byte, len(msg))
-	dst := usermem.BytesIOSequence(readData)
-	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
-	if err != nil {
-		t.Fatalf("error reading from pipe %q: %v", fileName, err)
-	}
-	if bytesRead != int64(len(msg)) {
-		t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead)
-	}
-	if !bytes.Equal(readData, []byte(msg)) {
-		t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData))
-	}
-}
diff --git a/pkg/sentry/fsimpl/memfs/regular_file.go b/pkg/sentry/fsimpl/memfs/regular_file.go
deleted file mode 100644
index b7f4853b3..000000000
--- a/pkg/sentry/fsimpl/memfs/regular_file.go
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"io"
-	"sync"
-	"sync/atomic"
-
-	"gvisor.dev/gvisor/pkg/abi/linux"
-	"gvisor.dev/gvisor/pkg/sentry/context"
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-	"gvisor.dev/gvisor/pkg/sentry/usermem"
-	"gvisor.dev/gvisor/pkg/sentry/vfs"
-	"gvisor.dev/gvisor/pkg/syserror"
-)
-
-type regularFile struct {
-	inode inode
-
-	mu   sync.RWMutex
-	data []byte
-	// dataLen is len(data), but accessed using atomic memory operations to
-	// avoid locking in inode.stat().
-	dataLen int64
-}
-
-func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
-	file := &regularFile{}
-	file.inode.init(file, fs, creds, mode)
-	file.inode.nlink = 1 // from parent directory
-	return &file.inode
-}
-
-type regularFileFD struct {
-	fileDescription
-
-	// These are immutable.
-	readable bool
-	writable bool
-
-	// off is the file offset. off is accessed using atomic memory operations.
-	// offMu serializes operations that may mutate off.
-	off   int64
-	offMu sync.Mutex
-}
-
-// Release implements vfs.FileDescriptionImpl.Release.
-func (fd *regularFileFD) Release() {
-	if fd.writable {
-		fd.vfsfd.VirtualDentry().Mount().EndWrite()
-	}
-}
-
-// PRead implements vfs.FileDescriptionImpl.PRead.
-func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
-	if !fd.readable {
-		return 0, syserror.EINVAL
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.RLock()
-	if offset >= int64(len(f.data)) {
-		f.mu.RUnlock()
-		return 0, io.EOF
-	}
-	n, err := dst.CopyOut(ctx, f.data[offset:])
-	f.mu.RUnlock()
-	return int64(n), err
-}
-
-// Read implements vfs.FileDescriptionImpl.Read.
-func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PRead(ctx, dst, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// PWrite implements vfs.FileDescriptionImpl.PWrite.
-func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
-	if !fd.writable {
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	srclen := src.NumBytes()
-	if srclen == 0 {
-		return 0, nil
-	}
-	f := fd.inode().impl.(*regularFile)
-	f.mu.Lock()
-	end := offset + srclen
-	if end < offset {
-		// Overflow.
-		f.mu.Unlock()
-		return 0, syserror.EFBIG
-	}
-	if end > f.dataLen {
-		f.data = append(f.data, make([]byte, end-f.dataLen)...)
-		atomic.StoreInt64(&f.dataLen, end)
-	}
-	n, err := src.CopyIn(ctx, f.data[offset:end])
-	f.mu.Unlock()
-	return int64(n), err
-}
-
-// Write implements vfs.FileDescriptionImpl.Write.
-func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
-	fd.offMu.Lock()
-	n, err := fd.PWrite(ctx, src, fd.off, opts)
-	fd.off += n
-	fd.offMu.Unlock()
-	return n, err
-}
-
-// Seek implements vfs.FileDescriptionImpl.Seek.
-func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
-	fd.offMu.Lock()
-	defer fd.offMu.Unlock()
-	switch whence {
-	case linux.SEEK_SET:
-		// use offset as specified
-	case linux.SEEK_CUR:
-		offset += fd.off
-	case linux.SEEK_END:
-		offset += atomic.LoadInt64(&fd.inode().impl.(*regularFile).dataLen)
-	default:
-		return 0, syserror.EINVAL
-	}
-	if offset < 0 {
-		return 0, syserror.EINVAL
-	}
-	fd.off = offset
-	return offset, nil
-}
-
-// Sync implements vfs.FileDescriptionImpl.Sync.
-func (fd *regularFileFD) Sync(ctx context.Context) error {
-	return nil
-}
diff --git a/pkg/sentry/fsimpl/memfs/symlink.go b/pkg/sentry/fsimpl/memfs/symlink.go
deleted file mode 100644
index b2ac2cbeb..000000000
--- a/pkg/sentry/fsimpl/memfs/symlink.go
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package memfs
-
-import (
-	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
-)
-
-type symlink struct {
-	inode  inode
-	target string // immutable
-}
-
-func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
-	link := &symlink{
-		target: target,
-	}
-	link.inode.init(link, fs, creds, 0777)
-	link.inode.nlink = 1 // from parent directory
-	return &link.inode
-}
-
-// O_PATH is unimplemented, so there's no way to get a FileDescription
-// representing a symlink yet.
diff --git a/pkg/sentry/fsimpl/tmpfs/BUILD b/pkg/sentry/fsimpl/tmpfs/BUILD
new file mode 100644
index 000000000..a5b285987
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/BUILD
@@ -0,0 +1,92 @@
+load("//tools/go_stateify:defs.bzl", "go_library")
+load("@io_bazel_rules_go//go:def.bzl", "go_test")
+load("//tools/go_generics:defs.bzl", "go_template_instance")
+
+package(licenses = ["notice"])
+
+go_template_instance(
+    name = "dentry_list",
+    out = "dentry_list.go",
+    package = "tmpfs",
+    prefix = "dentry",
+    template = "//pkg/ilist:generic_list",
+    types = {
+        "Element": "*dentry",
+        "Linker": "*dentry",
+    },
+)
+
+go_library(
+    name = "tmpfs",
+    srcs = [
+        "dentry_list.go",
+        "directory.go",
+        "filesystem.go",
+        "named_pipe.go",
+        "regular_file.go",
+        "symlink.go",
+        "tmpfs.go",
+    ],
+    importpath = "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs",
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/amutex",
+        "//pkg/fspath",
+        "//pkg/log",
+        "//pkg/sentry/arch",
+        "//pkg/sentry/context",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/fsutil",
+        "//pkg/sentry/kernel",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/pipe",
+        "//pkg/sentry/memmap",
+        "//pkg/sentry/pgalloc",
+        "//pkg/sentry/platform",
+        "//pkg/sentry/safemem",
+        "//pkg/sentry/usage",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "benchmark_test",
+    size = "small",
+    srcs = ["benchmark_test.go"],
+    deps = [
+        ":tmpfs",
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/refs",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/fs",
+        "//pkg/sentry/fs/tmpfs",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
+
+go_test(
+    name = "tmpfs_test",
+    size = "small",
+    srcs = [
+        "pipe_test.go",
+        "regular_file_test.go",
+    ],
+    embed = [":tmpfs"],
+    deps = [
+        "//pkg/abi/linux",
+        "//pkg/fspath",
+        "//pkg/sentry/context",
+        "//pkg/sentry/context/contexttest",
+        "//pkg/sentry/kernel/auth",
+        "//pkg/sentry/kernel/contexttest",
+        "//pkg/sentry/usermem",
+        "//pkg/sentry/vfs",
+        "//pkg/syserror",
+    ],
+)
diff --git a/pkg/sentry/fsimpl/tmpfs/benchmark_test.go b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
new file mode 100644
index 000000000..d88c83499
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/benchmark_test.go
@@ -0,0 +1,487 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package benchmark_test
+
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/refs"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/fs"
+	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Differences from stat_benchmark:
+//
+// - Syscall interception, CopyInPath, copyOutStat, and overlayfs overheads are
+// not included.
+//
+// - *MountStat benchmarks use a tmpfs root mount and a tmpfs submount at /tmp.
+// Non-MountStat benchmarks use a tmpfs root mount and no submounts.
+// stat_benchmark uses a varying root mount, a tmpfs submount at /tmp, and a
+// subdirectory /tmp/<top_dir> (assuming TEST_TMPDIR == "/tmp"). Thus
+// stat_benchmark at depth 1 does a comparable amount of work to *MountStat
+// benchmarks at depth 2, and non-MountStat benchmarks at depth 3.
+var depths = []int{1, 2, 3, 8, 64, 100} // directory nesting depths; each one is run as its own sub-benchmark
+
+const (
+	mountPointName = "tmp"                           // directory under the root on which the *MountStat benchmarks mount the submount
+	filename       = "gvisor_test_temp_0_1557494568" // name of the file created in the deepest directory and then stat'd
+)
+
+// fileOpOn looks up path in mntns and calls fn on the result. It is copied from
+// syscalls/linux/sys_file.go, with the dependency on kernel.Task stripped out.
+func fileOpOn(ctx context.Context, mntns *fs.MountNamespace, root, wd *fs.Dirent, dirFD int32, path string, resolve bool, fn func(root *fs.Dirent, d *fs.Dirent) error) error {
+	var (
+		d   *fs.Dirent // The file.
+		rel *fs.Dirent // The relative directory for search (if required.)
+		err error
+	)
+
+	// Extract the working directory (maybe).
+	if len(path) > 0 && path[0] == '/' {
+		// Absolute path; rel can be nil.
+	} else if dirFD == linux.AT_FDCWD {
+		// Need to reference the working directory.
+		rel = wd
+	} else {
+		// Lookups relative to any other FD are not supported by this copy.
+		return syserror.EBADF
+	}
+
+	// Lookup the node.
+	remainingTraversals := uint(linux.MaxSymlinkTraversals)
+	if resolve { // resolve selects FindInode; otherwise FindLink is used
+		d, err = mntns.FindInode(ctx, root, rel, path, &remainingTraversals)
+	} else {
+		d, err = mntns.FindLink(ctx, root, rel, path, &remainingTraversals)
+	}
+	if err != nil {
+		return err
+	}
+
+	err = fn(root, d)
+	d.DecRef() // dropped whether or not fn succeeded
+	return err
+}
+
+func BenchmarkVFS1TmpfsStat(b *testing.B) { // times a vfs1 lookup plus attribute fetch of a file nested depth directories deep
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			d := root
+			d.IncRef()
+			defer func() { d.DecRef() }() // d is reassigned below; the closure releases its final value instead of root
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef() // done with the parent; keep only the child
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false // never set to true in this benchmark, so the ENOTDIR branch below is not taken
+			runtime.GC()
+			b.ResetTimer() // exclude all of the setup above from the measurement
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check.
+					if uattr.Perms.User.Execute { // the file was created with mode 0644, so owner-execute must be clear
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func BenchmarkVFS2MemfsStat(b *testing.B) { // times vfs2 StatAt on a file nested depth directories deep in a tmpfs root mount
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			defer mntns.DecRef(vfsObj)
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			root := mntns.Root()
+			defer root.DecRef()
+			vd := root
+			vd.IncRef() // extra reference on root, dropped by the first vd.DecRef() below
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				vd.DecRef() // done with the parent; keep only the child
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Path:               fspath.Parse(filename),
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			vd.DecRef() // released before the error check below
+			vd = vfs.VirtualDentry{} // clear the released handle
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			defer fd.DecRef() // the file description is held until the sub-benchmark returns
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			runtime.GC()
+			b.ResetTimer() // exclude all of the setup above from the measurement
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Path:               fspath.Parse(filePath),
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check.
+				if stat.Mode&^linux.S_IFMT != 0644 { // mask off the file-type bits before comparing with the creation mode
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func BenchmarkVFS1TmpfsMountStat(b *testing.B) { // like BenchmarkVFS1TmpfsStat, but the directories live on a tmpfs submount at /tmp
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+
+			// Create VFS.
+			tmpfsFS, ok := fs.FindFilesystem("tmpfs")
+			if !ok {
+				b.Fatalf("failed to find tmpfs filesystem type")
+			}
+			rootInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			mntns, err := fs.NewMountNamespace(ctx, rootInode)
+			if err != nil {
+				b.Fatalf("failed to create mount namespace: %v", err)
+			}
+			defer mntns.DecRef()
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create and mount the submount.
+			root := mntns.Root()
+			defer root.DecRef()
+			if err := root.Inode.CreateDirectory(ctx, root, mountPointName, fs.FilePermsFromMode(0755)); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			mountPoint, err := root.Walk(ctx, root, mountPointName)
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			submountInode, err := tmpfsFS.Mount(ctx, "tmpfs", fs.MountSourceFlags{}, "", nil)
+			if err != nil {
+				b.Fatalf("failed to create tmpfs submount: %v", err)
+			}
+			if err := mntns.Mount(ctx, mountPoint, submountInode); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			d, err := root.Walk(ctx, root, mountPointName) // walked again after the mount, to start from the mount root
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			defer func() { d.DecRef() }() // d is reassigned below; the closure releases its final value instead of the mount root
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				if err := d.Inode.CreateDirectory(ctx, d, name, fs.FilePermsFromMode(0755)); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				next, err := d.Walk(ctx, root, name)
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				d.DecRef() // done with the parent; keep only the child
+				d = next
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Create the file that will be stat'd.
+			file, err := d.Inode.Create(ctx, d, filename, fs.FileFlags{Read: true, Write: true}, fs.FilePermsFromMode(0644))
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			file.DecRef()
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			dirPath := false // never set to true in this benchmark, so the ENOTDIR branch below is not taken
+			runtime.GC()
+			b.ResetTimer() // exclude all of the setup above from the measurement
+			for i := 0; i < b.N; i++ {
+				err := fileOpOn(ctx, mntns, root, root, linux.AT_FDCWD, filePath, true /* resolve */, func(root *fs.Dirent, d *fs.Dirent) error {
+					if dirPath && !fs.IsDir(d.Inode.StableAttr) {
+						return syserror.ENOTDIR
+					}
+					uattr, err := d.Inode.UnstableAttr(ctx)
+					if err != nil {
+						return err
+					}
+					// Sanity check.
+					if uattr.Perms.User.Execute { // the file was created with mode 0644, so owner-execute must be clear
+						b.Fatalf("got wrong permissions (%0o)", uattr.Perms.LinuxMode())
+					}
+					return nil
+				})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func BenchmarkVFS2MemfsMountStat(b *testing.B) { // like BenchmarkVFS2MemfsStat, but the directories live on a tmpfs submount at /tmp
+	for _, depth := range depths {
+		b.Run(fmt.Sprintf("%d", depth), func(b *testing.B) {
+			ctx := contexttest.Context(b)
+			creds := auth.CredentialsFromContext(ctx)
+
+			// Create VFS.
+			vfsObj := vfs.New()
+			vfsObj.MustRegisterFilesystemType("tmpfs", tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+				AllowUserMount: true,
+			})
+			mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+			if err != nil {
+				b.Fatalf("failed to create tmpfs root mount: %v", err)
+			}
+			defer mntns.DecRef(vfsObj)
+
+			var filePathBuilder strings.Builder
+			filePathBuilder.WriteByte('/')
+
+			// Create the mount point.
+			root := mntns.Root()
+			defer root.DecRef()
+			pop := vfs.PathOperation{ // reused for MkdirAt, both GetDentryAt calls and MountAt below
+				Root:  root,
+				Start: root,
+				Path:  fspath.Parse(mountPointName),
+			}
+			if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+				Mode: 0755,
+			}); err != nil {
+				b.Fatalf("failed to create mount point: %v", err)
+			}
+			// Save the mount point for later use.
+			mountPoint, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+			if err != nil {
+				b.Fatalf("failed to walk to mount point: %v", err)
+			}
+			defer mountPoint.DecRef()
+			// Create and mount the submount.
+			if err := vfsObj.MountAt(ctx, creds, "", &pop, "tmpfs", &vfs.MountOptions{}); err != nil {
+				b.Fatalf("failed to mount tmpfs submount: %v", err)
+			}
+			filePathBuilder.WriteString(mountPointName)
+			filePathBuilder.WriteByte('/')
+
+			// Create nested directories with given depth.
+			vd, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{}) // same path as mountPoint, looked up after the mount
+			if err != nil {
+				b.Fatalf("failed to walk to mount root: %v", err)
+			}
+			for i := depth; i > 0; i-- {
+				name := fmt.Sprintf("%d", i)
+				pop := vfs.PathOperation{ // shadows the outer pop for the rest of this iteration
+					Root:  root,
+					Start: vd,
+					Path:  fspath.Parse(name),
+				}
+				if err := vfsObj.MkdirAt(ctx, creds, &pop, &vfs.MkdirOptions{
+					Mode: 0755,
+				}); err != nil {
+					b.Fatalf("failed to create directory %q: %v", name, err)
+				}
+				nextVD, err := vfsObj.GetDentryAt(ctx, creds, &pop, &vfs.GetDentryOptions{})
+				if err != nil {
+					b.Fatalf("failed to walk to directory %q: %v", name, err)
+				}
+				vd.DecRef() // done with the parent; keep only the child
+				vd = nextVD
+				filePathBuilder.WriteString(name)
+				filePathBuilder.WriteByte('/')
+			}
+
+			// Verify that we didn't create any directories under the mount
+			// point (i.e. they were all created on the submount).
+			firstDirName := fmt.Sprintf("%d", depth) // the loop above counts down, so the outermost directory is named depth
+			if child := mountPoint.Dentry().Child(firstDirName); child != nil {
+				b.Fatalf("created directory %q under root mount, not submount", firstDirName)
+			}
+
+			// Create the file that will be stat'd.
+			fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+				Root:               root,
+				Start:              vd,
+				Path:               fspath.Parse(filename),
+				FollowFinalSymlink: true,
+			}, &vfs.OpenOptions{
+				Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+				Mode:  0644,
+			})
+			vd.DecRef() // released before the error check below
+			if err != nil {
+				b.Fatalf("failed to create file %q: %v", filename, err)
+			}
+			fd.DecRef() // released right away, before the timed loop starts
+			filePathBuilder.WriteString(filename)
+			filePath := filePathBuilder.String()
+
+			runtime.GC()
+			b.ResetTimer() // exclude all of the setup above from the measurement
+			for i := 0; i < b.N; i++ {
+				stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+					Root:               root,
+					Start:              root,
+					Path:               fspath.Parse(filePath),
+					FollowFinalSymlink: true,
+				}, &vfs.StatOptions{})
+				if err != nil {
+					b.Fatalf("stat(%q) failed: %v", filePath, err)
+				}
+				// Sanity check.
+				if stat.Mode&^linux.S_IFMT != 0644 { // mask off the file-type bits before comparing with the creation mode
+					b.Fatalf("got wrong permissions (%0o)", stat.Mode)
+				}
+			}
+			// Don't include deferred cleanup in benchmark time.
+			b.StopTimer()
+		})
+	}
+}
+
+func init() {
+	// Turn off reference leak checking for a fair comparison between vfs1 and
+	// vfs2. As an init function, this runs before any benchmark starts.
+	refs.SetLeakMode(refs.NoLeakChecking)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/directory.go b/pkg/sentry/fsimpl/tmpfs/directory.go
new file mode 100644
index 000000000..887ca2619
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/directory.go
@@ -0,0 +1,187 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type directory struct {
+	inode inode // held by value; newDirectory hands out a pointer to this field
+
+	// childList is a list containing (1) child Dentries and (2) fake Dentries
+	// (with inode == nil) that represent the iteration position of
+	// directoryFDs. childList is used to support directoryFD.IterDirents()
+	// efficiently. childList is protected by filesystem.mu.
+	childList dentryList
+}
+
+func (fs *filesystem) newDirectory(creds *auth.Credentials, mode linux.FileMode) *inode { // returns the inode of a new directory with no children
+	dir := &directory{}
+	dir.inode.init(dir, fs, creds, mode) // NOTE(review): init presumably records dir as inode.impl (isDir type-asserts on it) — confirm
+	dir.inode.nlink = 2 // from "." and parent directory or ".." for root
+	return &dir.inode
+}
+
+func (i *inode) isDir() bool { // reports whether i's implementation is a *directory
+	_, ok := i.impl.(*directory)
+	return ok
+}
+
+type directoryFD struct {
+	fileDescription
+	vfs.DirectoryFileDescriptionDefaultImpl
+
+	// Protected by filesystem.mu.
+	iter *dentry // placeholder dentry (nil inode) marking this FD's position in directory.childList; nil until IterDirents or Seek creates it
+	off  int64   // next offset to report: 0 is ".", 1 is "..", 2 and up are children
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *directoryFD) Release() {
+	if fd.iter != nil { // nothing to unlink if neither IterDirents nor Seek ever created a placeholder
+		fs := fd.filesystem()
+		dir := fd.inode().impl.(*directory)
+		fs.mu.Lock() // childList is protected by filesystem.mu
+		dir.childList.Remove(fd.iter)
+		fs.mu.Unlock()
+		fd.iter = nil
+	}
+}
+
+// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
+func (fd *directoryFD) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
+	fs := fd.filesystem()
+	vfsd := fd.vfsfd.VirtualDentry().Dentry()
+
+	fs.mu.Lock() // fd.iter, fd.off and childList are all modified below
+	defer fs.mu.Unlock()
+
+	if fd.off == 0 { // offset 0 is the "." entry
+		if !cb.Handle(vfs.Dirent{
+			Name:    ".",
+			Type:    linux.DT_DIR,
+			Ino:     vfsd.Impl().(*dentry).inode.ino,
+			NextOff: 1,
+		}) {
+			return nil // callback declined the entry; fd.off is unchanged, so it is offered again next time
+		}
+		fd.off++
+	}
+	if fd.off == 1 { // offset 1 is the ".." entry
+		parentInode := vfsd.ParentOrSelf().Impl().(*dentry).inode
+		if !cb.Handle(vfs.Dirent{
+			Name:    "..",
+			Type:    parentInode.direntType(),
+			Ino:     parentInode.ino,
+			NextOff: 2,
+		}) {
+			return nil // callback declined the entry; fd.off is unchanged, so it is offered again next time
+		}
+		fd.off++
+	}
+
+	dir := vfsd.Impl().(*dentry).inode.impl.(*directory)
+	var child *dentry
+	if fd.iter == nil {
+		// Start iteration at the beginning of dir.
+		child = dir.childList.Front()
+		fd.iter = &dentry{} // placeholder with a nil inode; linked into childList before returning
+	} else {
+		// Continue iteration from where we left off.
+		child = fd.iter.Next()
+		dir.childList.Remove(fd.iter) // unlinked while scanning; re-linked at the new position below
+	}
+	for child != nil {
+		// Skip other directoryFD iterators.
+		if child.inode != nil {
+			if !cb.Handle(vfs.Dirent{
+				Name:    child.vfsd.Name(),
+				Type:    child.inode.direntType(),
+				Ino:     child.inode.ino,
+				NextOff: fd.off + 1,
+			}) {
+				dir.childList.InsertBefore(child, fd.iter) // the next call resumes at this child
+				return nil
+			}
+			fd.off++
+		}
+		child = child.Next()
+	}
+	dir.childList.PushBack(fd.iter) // every child was reported; park the placeholder at the end of the list
+	return nil
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek.
+func (fd *directoryFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fs := fd.filesystem()
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+
+	switch whence {
+	case linux.SEEK_SET:
+		// Use offset as given.
+	case linux.SEEK_CUR:
+		offset += fd.off
+	default: // any other whence value is rejected
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+
+	// If the offset isn't changing (e.g. due to lseek(0, SEEK_CUR)), don't
+	// seek even if doing so might reposition the iterator due to concurrent
+	// mutation of the directory. Compare fs/libfs.c:dcache_dir_lseek().
+	if fd.off == offset {
+		return offset, nil
+	}
+
+	fd.off = offset
+	// Compensate for "." and "..".
+	remChildren := int64(0) // number of real children to skip before placing fd.iter
+	if offset >= 2 {
+		remChildren = offset - 2
+	}
+
+	dir := fd.inode().impl.(*directory)
+
+	// Ensure that fd.iter exists and is not linked into dir.childList.
+	if fd.iter == nil {
+		fd.iter = &dentry{}
+	} else {
+		dir.childList.Remove(fd.iter)
+	}
+	// Insert fd.iter before the remChildren'th child, or at the end of the
+	// list if remChildren >= number of children.
+	child := dir.childList.Front()
+	for child != nil {
+		// Skip other directoryFD iterators.
+		if child.inode != nil {
+			if remChildren == 0 {
+				dir.childList.InsertBefore(child, fd.iter)
+				return offset, nil
+			}
+			remChildren--
+		}
+		child = child.Next()
+	}
+	dir.childList.PushBack(fd.iter) // fewer than remChildren+1 real children: park at the end
+	return offset, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/filesystem.go b/pkg/sentry/fsimpl/tmpfs/filesystem.go
new file mode 100644
index 000000000..26979729e
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/filesystem.go
@@ -0,0 +1,698 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"fmt"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// Sync implements vfs.FilesystemImpl.Sync.
+func (fs *filesystem) Sync(ctx context.Context) error {
+	// All filesystem state is in-memory, so there is nothing to write back.
+	return nil
+}
+
+// stepLocked resolves rp.Component() to an existing file, starting from the
+// given directory.
+//
+// stepLocked is loosely analogous to fs/namei.c:walk_component().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func stepLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	if !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { // final argument is isDir
+		return nil, err
+	}
+afterSymlink:
+	nextVFSD, err := rp.ResolveComponent(&d.vfsd)
+	if err != nil {
+		return nil, err
+	}
+	if nextVFSD == nil {
+		// Since the Dentry tree is the sole source of truth for tmpfs, if it's
+		// not in the Dentry tree, it doesn't exist.
+		return nil, syserror.ENOENT
+	}
+	next := nextVFSD.Impl().(*dentry)
+	if symlink, ok := next.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() {
+		// TODO: symlink traversals update access time
+		if err := rp.HandleSymlink(symlink.target); err != nil {
+			return nil, err
+		}
+		goto afterSymlink // don't check the current directory again
+	}
+	rp.Advance() // the component was resolved without following a symlink; move rp past it
+	return next, nil
+}
+
+// walkParentDirLocked resolves all but the last path component of rp to an
+// existing directory, starting from the given directory (which is usually
+// rp.Start().Impl().(*dentry)). It does not check that the returned directory
+// is searchable by the provider of rp.
+//
+// walkParentDirLocked is loosely analogous to Linux's
+// fs/namei.c:path_parentat().
+//
+// Preconditions: filesystem.mu must be locked. !rp.Done().
+func walkParentDirLocked(rp *vfs.ResolvingPath, d *dentry) (*dentry, error) {
+	for !rp.Final() { // stop once only the last component remains
+		next, err := stepLocked(rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if !d.inode.isDir() { // also covers the case where the loop body never ran
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil
+}
+
+// resolveLocked resolves rp to an existing file.
+//
+// resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
+//
+// Preconditions: filesystem.mu must be locked.
+func resolveLocked(rp *vfs.ResolvingPath) (*dentry, error) {
+	d := rp.Start().Impl().(*dentry)
+	for !rp.Done() { // consume every remaining component
+		next, err := stepLocked(rp, d)
+		if err != nil {
+			return nil, err
+		}
+		d = next
+	}
+	if rp.MustBeDir() && !d.inode.isDir() {
+		return nil, syserror.ENOTDIR
+	}
+	return d, nil // no reference is taken here; callers that return d call IncRef themselves
+}
+
+// doCreateAt checks that creating a file at rp is permitted, then invokes
+// create to do so.
+//
+// doCreateAt is loosely analogous to a conjunction of Linux's
+// fs/namei.c:filename_create() and done_path_create().
+//
+// Preconditions: !rp.Done(). For the final path component in rp,
+// !rp.ShouldFollowSymlink().
+func (fs *filesystem) doCreateAt(rp *vfs.ResolvingPath, dir bool, create func(parent *dentry, name string) error) error {
+	fs.mu.Lock() // held for writing until create returns
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EEXIST
+	}
+	// Call parent.vfsd.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the child exists we want to return EEXIST immediately instead
+	// of attempting symlink/mount traversal.
+	if parent.vfsd.Child(name) != nil {
+		return syserror.EEXIST
+	}
+	if !dir && rp.MustBeDir() { // a non-directory was requested at a path that must name a directory
+		return syserror.ENOENT
+	}
+	// In tmpfs, the only way to cause a dentry to be disowned is by removing
+	// it from the filesystem, so this check is equivalent to checking if
+	// parent has been removed.
+	if parent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite() // paired with the successful CheckBeginWrite above
+	return create(parent, name)
+}
+
+// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
+func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
+	fs.mu.RLock() // lookup only; the read lock is taken
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	if opts.CheckSearchable { // caller wants a directory it has execute permission on
+		if !d.inode.isDir() {
+			return nil, syserror.ENOTDIR
+		}
+		if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true /* isDir */); err != nil {
+			return nil, err
+		}
+	}
+	d.IncRef() // the returned Dentry carries the reference taken here
+	return &d.vfsd, nil
+}
+
+// GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
+func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
+	fs.mu.RLock() // lookup only; the read lock is taken
+	defer fs.mu.RUnlock()
+	d, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return nil, err
+	}
+	d.IncRef() // the returned Dentry carries the reference taken here
+	return &d.vfsd, nil
+}
+
+// LinkAt implements vfs.FilesystemImpl.LinkAt.
+func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		if rp.Mount() != vd.Mount() { // target and new name must be on the same mount
+			return syserror.EXDEV
+		}
+		d := vd.Dentry().Impl().(*dentry)
+		if d.inode.isDir() { // directories cannot be hard-linked
+			return syserror.EPERM
+		}
+		if d.inode.nlink == 0 { // the target has no links left
+			return syserror.ENOENT
+		}
+		if d.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+		d.inode.incLinksLocked()
+		child := fs.newDentry(d.inode) // a new dentry that shares the target's inode
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
+func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
+	return fs.doCreateAt(rp, true /* dir */, func(parent *dentry, name string) error {
+		if parent.inode.nlink == maxLinks { // the parent cannot take the extra link from the child's ".."
+			return syserror.EMLINK
+		}
+		parent.inode.incLinksLocked() // from child's ".."
+		child := fs.newDentry(fs.newDirectory(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child) // keep childList in step with the vfs child map
+		return nil
+	})
+}
+
+// MknodAt implements vfs.FilesystemImpl.MknodAt.
+func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		switch opts.Mode.FileType() {
+		case 0, linux.S_IFREG: // a file type of 0 is treated the same as a regular file
+			child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFIFO:
+			child := fs.newDentry(fs.newNamedPipe(rp.Credentials(), opts.Mode))
+			parent.vfsd.InsertChild(&child.vfsd, name)
+			parent.inode.impl.(*directory).childList.PushBack(child)
+			return nil
+		case linux.S_IFBLK, linux.S_IFCHR, linux.S_IFSOCK:
+			// Not yet supported.
+			return syserror.EPERM
+		default: // any other file type is rejected as invalid
+			return syserror.EINVAL
+		}
+	})
+}
+
+// OpenAt implements vfs.FilesystemImpl.OpenAt.
+func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
+	if opts.Flags&linux.O_TMPFILE != 0 {
+		// Not yet supported.
+		return nil, syserror.EOPNOTSUPP
+	}
+
+	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
+	// don't need fs.mu for writing.
+	if opts.Flags&linux.O_CREAT == 0 {
+		fs.mu.RLock()
+		defer fs.mu.RUnlock()
+		d, err := resolveLocked(rp)
+		if err != nil {
+			return nil, err
+		}
+		return d.open(ctx, rp, opts.Flags, false /* afterCreate */)
+	}
+
+	mustCreate := opts.Flags&linux.O_EXCL != 0 // O_CREAT|O_EXCL: opening an existing file is an error
+	start := rp.Start().Impl().(*dentry)
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	if rp.Done() { // no components left: the path names the start dentry itself
+		// Reject attempts to open directories with O_CREAT.
+		if rp.MustBeDir() {
+			return nil, syserror.EISDIR
+		}
+		if mustCreate {
+			return nil, syserror.EEXIST
+		}
+		return start.open(ctx, rp, opts.Flags, false /* afterCreate */)
+	}
+afterTrailingSymlink:
+	parent, err := walkParentDirLocked(rp, start)
+	if err != nil {
+		return nil, err
+	}
+	// Check for search permission in the parent directory.
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayExec, true); err != nil { // final argument is isDir
+		return nil, err
+	}
+	// Reject attempts to open directories with O_CREAT.
+	if rp.MustBeDir() {
+		return nil, syserror.EISDIR
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return nil, syserror.EISDIR
+	}
+	// Determine whether or not we need to create a file.
+	child, err := stepLocked(rp, parent)
+	if err == syserror.ENOENT {
+		// Already checked for searchability above; now check for writability.
+		if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true); err != nil { // final argument is isDir
+			return nil, err
+		}
+		if err := rp.Mount().CheckBeginWrite(); err != nil {
+			return nil, err
+		}
+		defer rp.Mount().EndWrite() // paired with the successful CheckBeginWrite above
+		// Create and open the child.
+		child := fs.newDentry(fs.newRegularFile(rp.Credentials(), opts.Mode))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return child.open(ctx, rp, opts.Flags, true) // final argument is afterCreate
+	}
+	if err != nil {
+		return nil, err
+	}
+	// Do we need to resolve a trailing symlink?
+	if !rp.Done() {
+		start = parent // restart the parent walk from the directory containing the symlink
+		goto afterTrailingSymlink
+	}
+	// Open existing file.
+	if mustCreate {
+		return nil, syserror.EEXIST
+	}
+	return child.open(ctx, rp, opts.Flags, false) // final argument is afterCreate
+}
+
+func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, flags uint32, afterCreate bool) (*vfs.FileDescription, error) { // builds the FileDescription matching d's inode type
+	ats := vfs.AccessTypesForOpenFlags(flags)
+	if !afterCreate { // the permission check is skipped when the caller has just created the file
+		if err := d.inode.checkPermissions(rp.Credentials(), ats, d.inode.isDir()); err != nil {
+			return nil, err
+		}
+	}
+	mnt := rp.Mount()
+	switch impl := d.inode.impl.(type) {
+	case *regularFile:
+		var fd regularFileFD
+		fd.readable = vfs.MayReadFileWithOpenFlags(flags)
+		fd.writable = vfs.MayWriteFileWithOpenFlags(flags)
+		if fd.writable {
+			if err := mnt.CheckBeginWrite(); err != nil {
+				return nil, err
+			}
+			// mnt.EndWrite() is called by regularFileFD.Release().
+		}
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+		if flags&linux.O_TRUNC != 0 { // drop the file's data and reset its size to 0, under impl.mu
+			impl.mu.Lock()
+			impl.data.Truncate(0, impl.memFile)
+			atomic.StoreUint64(&impl.size, 0)
+			impl.mu.Unlock()
+		}
+		return &fd.vfsfd, nil
+	case *directory:
+		// Can't open directories writably.
+		if ats&vfs.MayWrite != 0 {
+			return nil, syserror.EISDIR
+		}
+		var fd directoryFD
+		fd.vfsfd.Init(&fd, flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{})
+		return &fd.vfsfd, nil
+	case *symlink:
+		// Can't open symlinks without O_PATH (which is unimplemented).
+		return nil, syserror.ELOOP
+	case *namedPipe:
+		return newNamedPipeFD(ctx, impl, rp, &d.vfsd, flags)
+	default: // every inode type created by this package is handled above
+		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
+	}
+}
+
+// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
+func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	symlink, ok := d.inode.impl.(*symlink)
+	if !ok {
+		return "", syserror.EINVAL
+	}
+	return symlink.target, nil
+}
+
+// RenameAt implements vfs.FilesystemImpl.RenameAt.
+func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
+	if opts.Flags != 0 {
+		// TODO(b/145974740): Support renameat2 flags.
+		return syserror.EINVAL
+	}
+
+	// Resolve newParent first to verify that it's on this Mount.
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	newParent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	newName := rp.Component()
+	if newName == "." || newName == ".." {
+		return syserror.EBUSY
+	}
+	mnt := rp.Mount()
+	if mnt != oldParentVD.Mount() {
+		return syserror.EXDEV
+	}
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+
+	oldParent := oldParentVD.Dentry().Impl().(*dentry)
+	if err := oldParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	// Call vfs.Dentry.Child() instead of stepLocked() or rp.ResolveChild(),
+	// because if the existing child is a symlink or mount point then we want
+	// to rename over it rather than follow it.
+	renamedVFSD := oldParent.vfsd.Child(oldName)
+	if renamedVFSD == nil {
+		return syserror.ENOENT
+	}
+	renamed := renamedVFSD.Impl().(*dentry)
+	if renamed.inode.isDir() {
+		if renamed == newParent || renamedVFSD.IsAncestorOf(&newParent.vfsd) {
+			return syserror.EINVAL
+		}
+		if oldParent != newParent {
+			// Writability is needed to change renamed's "..".
+			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite, true /* isDir */); err != nil {
+				return err
+			}
+		}
+	} else {
+		if opts.MustBeDir || rp.MustBeDir() {
+			return syserror.ENOTDIR
+		}
+	}
+
+	if err := newParent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	replacedVFSD := newParent.vfsd.Child(newName)
+	var replaced *dentry
+	if replacedVFSD != nil {
+		replaced = replacedVFSD.Impl().(*dentry)
+		if replaced.inode.isDir() {
+			if !renamed.inode.isDir() {
+				return syserror.EISDIR
+			}
+			if replaced.vfsd.HasChildren() {
+				return syserror.ENOTEMPTY
+			}
+		} else {
+			if rp.MustBeDir() {
+				return syserror.ENOTDIR
+			}
+			if renamed.inode.isDir() {
+				return syserror.ENOTDIR
+			}
+		}
+	} else {
+		if renamed.inode.isDir() && newParent.inode.nlink == maxLinks {
+			return syserror.EMLINK
+		}
+	}
+	if newParent.vfsd.IsDisowned() {
+		return syserror.ENOENT
+	}
+
+	// Linux places this check before some of those above; we do it here for
+	// simplicity, under the assumption that applications are not intentionally
+	// doing noop renames expecting them to succeed where non-noop renames
+	// would fail.
+	if renamedVFSD == replacedVFSD {
+		return nil
+	}
+	vfsObj := rp.VirtualFilesystem()
+	oldParentDir := oldParent.inode.impl.(*directory)
+	newParentDir := newParent.inode.impl.(*directory)
+	if err := vfsObj.PrepareRenameDentry(vfs.MountNamespaceFromContext(ctx), renamedVFSD, replacedVFSD); err != nil {
+		return err
+	}
+	if replaced != nil {
+		newParentDir.childList.Remove(replaced)
+		if replaced.inode.isDir() {
+			newParent.inode.decLinksLocked() // from replaced's ".."
+		}
+		replaced.inode.decLinksLocked()
+	}
+	oldParentDir.childList.Remove(renamed)
+	newParentDir.childList.PushBack(renamed)
+	if renamed.inode.isDir() {
+		oldParent.inode.decLinksLocked()
+		newParent.inode.incLinksLocked()
+	}
+	// TODO: update timestamps and parent directory sizes
+	vfsObj.CommitRenameReplaceDentry(renamedVFSD, &newParent.vfsd, newName, replacedVFSD)
+	return nil
+}
+
+// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
+func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." {
+		return syserror.EINVAL
+	}
+	if name == ".." {
+		return syserror.ENOTEMPTY
+	}
+	childVFSD := parent.vfsd.Child(name)
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if !child.inode.isDir() {
+		return syserror.ENOTDIR
+	}
+	if childVFSD.HasChildren() {
+		return syserror.ENOTEMPTY
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+		return err
+	}
+	parent.inode.impl.(*directory).childList.Remove(child)
+	parent.inode.decLinksLocked() // from child's ".."
+	child.inode.decLinksLocked()
+	vfsObj.CommitDeleteDentry(childVFSD)
+	return nil
+}
+
+// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
+func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	// TODO: implement inode.setStat
+	return syserror.EPERM
+}
+
+// StatAt implements vfs.FilesystemImpl.StatAt.
+func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	d, err := resolveLocked(rp)
+	if err != nil {
+		return linux.Statx{}, err
+	}
+	var stat linux.Statx
+	d.inode.statTo(&stat)
+	return stat, nil
+}
+
+// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
+func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return linux.Statfs{}, err
+	}
+	// TODO: actually implement statfs
+	return linux.Statfs{}, syserror.ENOSYS
+}
+
+// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
+func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
+	return fs.doCreateAt(rp, false /* dir */, func(parent *dentry, name string) error {
+		child := fs.newDentry(fs.newSymlink(rp.Credentials(), target))
+		parent.vfsd.InsertChild(&child.vfsd, name)
+		parent.inode.impl.(*directory).childList.PushBack(child)
+		return nil
+	})
+}
+
+// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. It removes the non-directory child named by rp's final component.
+func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
+	fs.mu.Lock()
+	defer fs.mu.Unlock()
+	parent, err := walkParentDirLocked(rp, rp.Start().Impl().(*dentry))
+	if err != nil {
+		return err
+	}
+	if err := parent.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec, true /* isDir */); err != nil {
+		return err
+	}
+	name := rp.Component()
+	if name == "." || name == ".." {
+		return syserror.EISDIR
+	}
+	childVFSD := parent.vfsd.Child(name) // looked up directly, so a symlink child is unlinked rather than followed
+	if childVFSD == nil {
+		return syserror.ENOENT
+	}
+	child := childVFSD.Impl().(*dentry)
+	if child.inode.isDir() {
+		return syserror.EISDIR
+	}
+	if rp.MustBeDir() { // the path demands a directory, but child is known not to be one (matches RenameAt)
+		return syserror.ENOTDIR
+	}
+	mnt := rp.Mount()
+	if err := mnt.CheckBeginWrite(); err != nil {
+		return err
+	}
+	defer mnt.EndWrite()
+	vfsObj := rp.VirtualFilesystem()
+	if err := vfsObj.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), childVFSD); err != nil {
+		return err
+	}
+	parent.inode.impl.(*directory).childList.Remove(child)
+	child.inode.decLinksLocked() // drop the link held by the parent directory entry
+	vfsObj.CommitDeleteDentry(childVFSD)
+	return nil
+}
+
+// ListxattrAt implements vfs.FilesystemImpl.ListxattrAt.
+func (fs *filesystem) ListxattrAt(ctx context.Context, rp *vfs.ResolvingPath) ([]string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return nil, err
+	}
+	// TODO(b/127675828): support extended attributes
+	return nil, syserror.ENOTSUP
+}
+
+// GetxattrAt implements vfs.FilesystemImpl.GetxattrAt.
+func (fs *filesystem) GetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) (string, error) {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return "", err
+	}
+	// TODO(b/127675828): support extended attributes
+	return "", syserror.ENOTSUP
+}
+
+// SetxattrAt implements vfs.FilesystemImpl.SetxattrAt.
+func (fs *filesystem) SetxattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetxattrOptions) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// RemovexattrAt implements vfs.FilesystemImpl.RemovexattrAt.
+func (fs *filesystem) RemovexattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	_, err := resolveLocked(rp)
+	if err != nil {
+		return err
+	}
+	// TODO(b/127675828): support extended attributes
+	return syserror.ENOTSUP
+}
+
+// PrependPath implements vfs.FilesystemImpl.PrependPath.
+func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
+	fs.mu.RLock()
+	defer fs.mu.RUnlock()
+	return vfs.GenericPrependPath(vfsroot, vd, b)
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/named_pipe.go b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
new file mode 100644
index 000000000..40bde54de
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/named_pipe.go
@@ -0,0 +1,60 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+type namedPipe struct {
+	inode inode
+
+	pipe *pipe.VFSPipe
+}
+
+// Preconditions:
+//   * fs.mu must be locked.
+//   * rp.Mount().CheckBeginWrite() has been called successfully.
+func (fs *filesystem) newNamedPipe(creds *auth.Credentials, mode linux.FileMode) *inode {
+	file := &namedPipe{pipe: pipe.NewVFSPipe(pipe.DefaultPipeSize, usermem.PageSize)}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // Only the parent has a link.
+	return &file.inode
+}
+
+// namedPipeFD implements vfs.FileDescriptionImpl. Methods are implemented
+// entirely via struct embedding.
+type namedPipeFD struct {
+	fileDescription
+
+	*pipe.VFSPipeFD
+}
+
+func newNamedPipeFD(ctx context.Context, np *namedPipe, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, flags uint32) (*vfs.FileDescription, error) {
+	var err error
+	var fd namedPipeFD
+	fd.VFSPipeFD, err = np.pipe.NewVFSPipeFD(ctx, rp, vfsd, &fd.vfsfd, flags)
+	if err != nil {
+		return nil, err
+	}
+	mnt := rp.Mount()
+	fd.vfsfd.Init(&fd, flags, mnt, vfsd, &vfs.FileDescriptionOptions{})
+	return &fd.vfsfd, nil
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/pipe_test.go b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
new file mode 100644
index 000000000..70b42a6ec
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/pipe_test.go
@@ -0,0 +1,235 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"bytes"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/context/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+const fileName = "mypipe"
+
+func TestSeparateFDs(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the read side. This is done concurrently because opening one
+	// end of the pipe blocks until the other end is opened.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	rfdchan := make(chan *vfs.FileDescription)
+	go func() {
+		openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY}
+		rfd, _ := vfsObj.OpenAt(ctx, creds, &pop, &openOpts) // on failure rfd is expected to be nil; the receiver checks for that
+		rfdchan <- rfd
+	}()
+
+	// Open the write side.
+	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	rfd := <-rfdchan // rfdchan is never closed, so test the received value rather than the receive status
+	if rfd == nil {
+		t.Fatalf("failed to open pipe for reading %q", fileName)
+	}
+	defer rfd.DecRef()
+
+	const msg = "vamos azul"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, msg)
+	checkRead(ctx, t, rfd, msg)
+}
+
+func TestNonblockingRead(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the read side as nonblocking.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_NONBLOCK}
+	rfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for reading %q: %v", fileName, err)
+	}
+	defer rfd.DecRef()
+
+	// Open the write side.
+	openOpts = vfs.OpenOptions{Flags: linux.O_WRONLY}
+	wfd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer wfd.DecRef()
+
+	const msg = "geh blau"
+	checkEmpty(ctx, t, rfd)
+	checkWrite(ctx, t, wfd, msg)
+	checkRead(ctx, t, rfd, msg)
+}
+
+func TestNonblockingWriteError(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the write side as nonblocking, which should return ENXIO.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_WRONLY | linux.O_NONBLOCK}
+	_, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != syserror.ENXIO {
+		t.Fatalf("expected ENXIO, but got error: %v", err)
+	}
+}
+
+func TestSingleFD(t *testing.T) {
+	ctx, creds, vfsObj, root := setup(t)
+	defer root.DecRef()
+
+	// Open the pipe as readable and writable.
+	pop := vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}
+	openOpts := vfs.OpenOptions{Flags: linux.O_RDWR}
+	fd, err := vfsObj.OpenAt(ctx, creds, &pop, &openOpts)
+	if err != nil {
+		t.Fatalf("failed to open pipe for writing %q: %v", fileName, err)
+	}
+	defer fd.DecRef()
+
+	const msg = "forza blu"
+	checkEmpty(ctx, t, fd)
+	checkWrite(ctx, t, fd, msg)
+	checkRead(ctx, t, fd, msg)
+}
+
+// setup creates a VFS with a pipe in the root directory at path fileName. The
+// returned VirtualDentry must be DecRef()'d by the caller. It calls t.Fatal
+// upon failure.
+func setup(t *testing.T) (context.Context, *auth.Credentials, *vfs.VirtualFilesystem, vfs.VirtualDentry) {
+	ctx := contexttest.Context(t)
+	creds := auth.CredentialsFromContext(ctx)
+
+	// Create VFS.
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		t.Fatalf("failed to create tmpfs root mount: %v", err)
+	}
+
+	// Create the pipe.
+	root := mntns.Root()
+	pop := vfs.PathOperation{
+		Root:  root,
+		Start: root,
+		Path:  fspath.Parse(fileName),
+	}
+	mknodOpts := vfs.MknodOptions{Mode: linux.ModeNamedPipe | 0644}
+	if err := vfsObj.MknodAt(ctx, creds, &pop, &mknodOpts); err != nil {
+		t.Fatalf("failed to create file %q: %v", fileName, err)
+	}
+
+	// Sanity check: the pipe exists and has the correct mode.
+	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(fileName),
+		FollowFinalSymlink: true,
+	}, &vfs.StatOptions{})
+	if err != nil {
+		t.Fatalf("stat(%q) failed: %v", fileName, err)
+	}
+	if stat.Mode&^linux.S_IFMT != 0644 {
+		t.Errorf("got wrong permissions (%0o)", stat.Mode)
+	}
+	if stat.Mode&linux.S_IFMT != linux.ModeNamedPipe {
+		t.Errorf("got wrong file type (%0o)", stat.Mode)
+	}
+
+	return ctx, creds, vfsObj, root
+}
+
+// checkEmpty calls t.Fatal if the pipe in fd is not empty.
+func checkEmpty(ctx context.Context, t *testing.T, fd *vfs.FileDescription) {
+	readData := make([]byte, 1)
+	dst := usermem.BytesIOSequence(readData)
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
+	if err != syserror.ErrWouldBlock {
+		t.Fatalf("expected ErrWouldBlock reading from empty pipe %q, but got: %v", fileName, err)
+	}
+	if bytesRead != 0 {
+		t.Fatalf("expected to read 0 bytes, but got %d", bytesRead)
+	}
+}
+
+// checkWrite calls t.Fatal if it fails to write all of msg to fd.
+func checkWrite(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	writeData := []byte(msg)
+	src := usermem.BytesIOSequence(writeData)
+	bytesWritten, err := fd.Write(ctx, src, vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("error writing to pipe %q: %v", fileName, err)
+	}
+	if bytesWritten != int64(len(writeData)) {
+		t.Fatalf("expected to write %d bytes, but wrote %d", len(writeData), bytesWritten)
+	}
+}
+
+// checkRead calls t.Fatal if it fails to read msg from fd.
+func checkRead(ctx context.Context, t *testing.T, fd *vfs.FileDescription, msg string) {
+	readData := make([]byte, len(msg))
+	dst := usermem.BytesIOSequence(readData)
+	bytesRead, err := fd.Read(ctx, dst, vfs.ReadOptions{})
+	if err != nil {
+		t.Fatalf("error reading from pipe %q: %v", fileName, err)
+	}
+	if bytesRead != int64(len(msg)) {
+		t.Fatalf("expected to read %d bytes, but got %d", len(msg), bytesRead)
+	}
+	if !bytes.Equal(readData, []byte(msg)) {
+		t.Fatalf("expected to read %q from pipe, but got %q", msg, string(readData))
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file.go b/pkg/sentry/fsimpl/tmpfs/regular_file.go
new file mode 100644
index 000000000..f51e247a7
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file.go
@@ -0,0 +1,357 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"io"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/memmap"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/safemem"
+	"gvisor.dev/gvisor/pkg/sentry/usage"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+type regularFile struct {
+	inode inode
+
+	// memFile is a platform.File used to allocate pages to this regularFile.
+	memFile *pgalloc.MemoryFile
+
+	// mu protects the fields below.
+	mu sync.RWMutex
+
+	// data maps offsets into the file to offsets into memFile that store
+	// the file's data.
+	data fsutil.FileRangeSet
+
+	// size is the size of data, but accessed using atomic memory
+	// operations to avoid locking in inode.stat().
+	size uint64
+
+	// seals represents file seals on this inode.
+	seals uint32
+}
+
+func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
+	file := &regularFile{
+		memFile: fs.memFile,
+	}
+	file.inode.init(file, fs, creds, mode)
+	file.inode.nlink = 1 // from parent directory
+	return &file.inode
+}
+
+type regularFileFD struct {
+	fileDescription
+
+	// These are immutable.
+	readable bool
+	writable bool
+
+	// off is the file offset. off is accessed using atomic memory operations.
+	// offMu serializes operations that may mutate off.
+	off   int64
+	offMu sync.Mutex
+}
+
+// Release implements vfs.FileDescriptionImpl.Release.
+func (fd *regularFileFD) Release() {
+	if fd.writable {
+		fd.vfsfd.VirtualDentry().Mount().EndWrite()
+	}
+}
+
+// PRead implements vfs.FileDescriptionImpl.PRead.
+func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
+	if !fd.readable {
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	if dst.NumBytes() == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := dst.CopyOutFrom(ctx, rw)
+	putRegularFileReadWriter(rw)
+	return int64(n), err
+}
+
+// Read implements vfs.FileDescriptionImpl.Read. It reads at the FD's current offset via PRead and then advances that offset.
+func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
+	fd.offMu.Lock() // held across the read so the offset is read and updated as one step
+	n, err := fd.PRead(ctx, dst, fd.off, opts)
+	fd.off += n // advance by the byte count PRead reported, whether or not err is nil
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// PWrite implements vfs.FileDescriptionImpl.PWrite. It writes src at the given offset and does not touch fd.off.
+func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
+	if !fd.writable {
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	srclen := src.NumBytes()
+	if srclen == 0 {
+		return 0, nil
+	}
+	f := fd.inode().impl.(*regularFile)
+	end := offset + srclen // end is only used for the overflow check below
+	if end < offset {
+		// Overflow.
+		return 0, syserror.EFBIG
+	}
+	rw := getRegularFileReadWriter(f, offset)
+	n, err := src.CopyInTo(ctx, rw)
+	putRegularFileReadWriter(rw) // hand rw back to the pool once the copy has finished
+	return n, err
+}
+
+// Write implements vfs.FileDescriptionImpl.Write. It writes at the FD's current offset via PWrite and then advances that offset.
+func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
+	fd.offMu.Lock() // held across the write so the offset is read and updated as one step
+	n, err := fd.PWrite(ctx, src, fd.off, opts)
+	fd.off += n // advance by the byte count PWrite reported, whether or not err is nil
+	fd.offMu.Unlock()
+	return n, err
+}
+
+// Seek implements vfs.FileDescriptionImpl.Seek. It returns EINVAL for an unknown whence or a negative resulting offset.
+func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
+	fd.offMu.Lock()
+	defer fd.offMu.Unlock()
+	switch whence {
+	case linux.SEEK_SET:
+		// use offset as specified
+	case linux.SEEK_CUR:
+		offset += fd.off // relative to the current FD offset
+	case linux.SEEK_END:
+		offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size)) // relative to the file size, loaded atomically
+	default:
+		return 0, syserror.EINVAL
+	}
+	if offset < 0 {
+		return 0, syserror.EINVAL
+	}
+	fd.off = offset // offsets beyond the current file size are accepted
+	return offset, nil
+}
+
+// Sync implements vfs.FileDescriptionImpl.Sync.
+func (fd *regularFileFD) Sync(ctx context.Context) error {
+	return nil
+}
+
+// regularFileReadWriter implements safemem.Reader and safemem.Writer.
+type regularFileReadWriter struct {
+	file *regularFile
+
+	// Offset into the file to read/write at. Note that this may be
+	// different from the FD offset if PRead/PWrite is used.
+	off uint64
+}
+
+var regularFileReadWriterPool = sync.Pool{
+	New: func() interface{} {
+		return &regularFileReadWriter{}
+	},
+}
+
+func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
+	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
+	rw.file = file
+	rw.off = uint64(offset)
+	return rw
+}
+
+func putRegularFileReadWriter(rw *regularFileReadWriter) { // returns rw to regularFileReadWriterPool for reuse
+	rw.file = nil // clear the file reference so the pooled object does not keep pointing at it
+	regularFileReadWriterPool.Put(rw)
+}
+
+// ReadToBlocks implements safemem.Reader.ReadToBlocks.
+func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
+	rw.file.mu.RLock()
+
+	// Compute the range to read (limited by file size and overflow-checked).
+	if rw.off >= rw.file.size {
+		rw.file.mu.RUnlock()
+		return 0, io.EOF
+	}
+	end := rw.file.size
+	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
+		end = rend
+	}
+
+	var done uint64
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Copy from internal mappings.
+			n, err := safemem.CopySeq(dsts, ims)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Tmpfs holes are zero-filled.
+			gapmr := gap.Range().Intersect(mr)
+			dst := dsts.TakeFirst64(gapmr.Length())
+			n, err := safemem.ZeroSeq(dst)
+			done += n
+			rw.off += uint64(n)
+			dsts = dsts.DropFirst64(n)
+			if err != nil {
+				rw.file.mu.RUnlock()
+				return done, err
+			}
+
+			// Continue.
+			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
+		}
+	}
+	rw.file.mu.RUnlock()
+	return done, nil
+}
+
+// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
+func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
+	rw.file.mu.Lock()
+
+	// Compute the range to write (overflow-checked).
+	end := rw.off + srcs.NumBytes()
+	if end <= rw.off {
+		end = math.MaxInt64
+	}
+
+	// Check if seals prevent either file growth or all writes.
+	switch {
+	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
+		rw.file.mu.Unlock()
+		return 0, syserror.EPERM
+	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
+		// When growth is sealed, Linux effectively allows writes which would
+		// normally grow the file to partially succeed up to the current EOF,
+		// rounded down to the page boundary before the EOF.
+		//
+		// This happens because writes (and thus the growth check) for tmpfs
+		// files proceed page-by-page on Linux, and the final write to the page
+		// containing EOF fails, resulting in a partial write up to the start of
+		// that page.
+		//
+		// To emulate this behaviour, artificially truncate the write to the
+		// start of the page containing the current EOF.
+		//
+		// See Linux, mm/filemap.c:generic_perform_write() and
+		// mm/shmem.c:shmem_write_begin().
+		if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart {
+			end = pgstart
+		}
+		if end <= rw.off {
+			// Truncation would result in no data being written.
+			rw.file.mu.Unlock()
+			return 0, syserror.EPERM
+		}
+	}
+
+	// Page-aligned mr for when we need to allocate memory. RoundUp can't
+	// overflow since end is an int64.
+	pgstartaddr := usermem.Addr(rw.off).RoundDown()
+	pgendaddr, _ := usermem.Addr(end).RoundUp()
+	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}
+
+	var (
+		done   uint64
+		retErr error
+	)
+	seg, gap := rw.file.data.Find(uint64(rw.off))
+	for rw.off < end {
+		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
+		switch {
+		case seg.Ok():
+			// Get internal mappings.
+			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Copy to internal mappings.
+			n, err := safemem.CopySeq(ims, srcs)
+			done += n
+			rw.off += uint64(n)
+			srcs = srcs.DropFirst64(n)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Continue.
+			seg, gap = seg.NextNonEmpty()
+
+		case gap.Ok():
+			// Allocate memory for the write.
+			gapMR := gap.Range().Intersect(pgMR)
+			fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
+			if err != nil {
+				retErr = err
+				goto exitLoop
+			}
+
+			// Write to that memory as usual.
+			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
+		}
+	}
+exitLoop:
+	// If the write ends beyond the file's previous size, it causes the
+	// file to grow.
+	if rw.off > rw.file.size {
+		atomic.StoreUint64(&rw.file.size, rw.off)
+	}
+
+	rw.file.mu.Unlock()
+	return done, retErr
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/regular_file_test.go b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
new file mode 100644
index 000000000..3731c5b6f
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/regular_file_test.go
@@ -0,0 +1,224 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"testing"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/fspath"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/contexttest"
+	"gvisor.dev/gvisor/pkg/sentry/usermem"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+)
+
+// newFileFD creates a new file named filename in a new tmpfs mount, and
+// returns an FD for it. If the returned err is nil, then cleanup should be
+// called when the FD is no longer needed.
+func newFileFD(ctx context.Context, filename string) (*vfs.FileDescription, func(), error) {
+	creds := auth.CredentialsFromContext(ctx)
+
+	vfsObj := vfs.New()
+	vfsObj.MustRegisterFilesystemType("tmpfs", FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
+		AllowUserMount: true,
+	})
+	mntns, err := vfsObj.NewMountNamespace(ctx, creds, "", "tmpfs", &vfs.GetFilesystemOptions{})
+	if err != nil {
+		return nil, nil, fmt.Errorf("failed to create tmpfs root mount: %v", err)
+	}
+	root := mntns.Root()
+
+	// Create the file that will be written to and read from.
+	fd, err := vfsObj.OpenAt(ctx, creds, &vfs.PathOperation{
+		Root:               root,
+		Start:              root,
+		Path:               fspath.Parse(filename),
+		FollowFinalSymlink: true,
+	}, &vfs.OpenOptions{
+		Flags: linux.O_RDWR | linux.O_CREAT | linux.O_EXCL,
+		Mode:  0644,
+	})
+	if err != nil {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+		return nil, nil, fmt.Errorf("failed to create file %q: %v", filename, err)
+	}
+
+	return fd, func() {
+		root.DecRef()
+		mntns.DecRef(vfsObj)
+	}, nil
+}
+
+// Test that we can write some data to a file and read it back.
+func TestSimpleWriteRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "simpleReadWrite")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write.
+	data := []byte("foobarbaz")
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Write left offset at %d, want %d", got, want)
+	}
+
+	// Seek back to beginning.
+	if _, err := fd.Seek(ctx, 0, linux.SEEK_SET); err != nil {
+		t.Fatalf("fd.Seek failed: %v", err)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(0); got != want {
+		t.Errorf("fd.Seek(0) left offset at %d, want %d", got, want)
+	}
+
+	// Read. io.EOF is tolerated since the read ends exactly at end of file.
+	buf := make([]byte, len(data))
+	n, err = fd.Read(ctx, usermem.BytesIOSequence(buf), vfs.ReadOptions{})
+	if err != nil && err != io.EOF {
+		t.Fatalf("fd.Read failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Read got short read length %d, want %d", n, len(data))
+	}
+	if got, want := string(buf), string(data); got != want {
+		t.Errorf("Read got %q want %q", got, want)
+	}
+	if got, want := fd.Impl().(*regularFileFD).off, int64(len(data)); got != want {
+		t.Errorf("fd.Read left offset at %d, want %d", got, want)
+	}
+}
+
+// Test that PWrite writes at the given offset, extending the file if needed.
+func TestPWrite(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PWrite")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Fill file with 1k 'a's.
+	data := bytes.Repeat([]byte{'a'}, 1000)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Write "gVisor is awesome" at various offsets. The subtests share fd and
+	// data, so each one builds on the file contents left by the previous one.
+	buf := []byte("gVisor is awesome")
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+	for _, offset := range offsets {
+		name := fmt.Sprintf("PWrite offset=%d", offset)
+		t.Run(name, func(t *testing.T) {
+			n, err := fd.PWrite(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.WriteOptions{})
+			if err != nil {
+				t.Errorf("fd.PWrite got err %v want nil", err)
+			}
+			if n != int64(len(buf)) {
+				t.Errorf("fd.PWrite got %d bytes want %d", n, len(buf))
+			}
+
+			// Update data to reflect expected file contents. Any gap between
+			// the old end of file and offset is expected to read as zeros.
+			if len(data) < offset+len(buf) {
+				data = append(data, make([]byte, (offset+len(buf))-len(data))...)
+			}
+			copy(data[offset:], buf)
+
+			// Read the whole file and compare with data.
+			readBuf := make([]byte, len(data))
+			n, err = fd.PRead(ctx, usermem.BytesIOSequence(readBuf), 0, vfs.ReadOptions{})
+			if err != nil {
+				t.Fatalf("fd.PRead failed: %v", err)
+			}
+			if n != int64(len(data)) {
+				t.Errorf("fd.PRead got short read length %d, want %d", n, len(data))
+			}
+			if got, want := string(readBuf), string(data); got != want {
+				t.Errorf("PRead got %q want %q", got, want)
+			}
+		})
+	}
+}
+
+// Test that PRead returns the expected data or io.EOF at various offsets.
+func TestPRead(t *testing.T) {
+	ctx := contexttest.Context(t)
+	fd, cleanup, err := newFileFD(ctx, "PRead")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer cleanup()
+
+	// Write 100 sequences of 'gVisor is awesome'.
+	data := bytes.Repeat([]byte("gVisor is awesome"), 100)
+	n, err := fd.Write(ctx, usermem.BytesIOSequence(data), vfs.WriteOptions{})
+	if err != nil {
+		t.Fatalf("fd.Write failed: %v", err)
+	}
+	if n != int64(len(data)) {
+		t.Errorf("fd.Write got short write length %d, want %d", n, len(data))
+	}
+
+	// Read various sizes from various offsets.
+	sizes := []int{0, 1, 2, 10, 20, 50, 100, 1000}
+	offsets := []int{0, 1, 2, 10, 20, 50, 100, 1000, len(data) - 100, len(data) - 1, len(data), len(data) + 1}
+
+	for _, size := range sizes {
+		for _, offset := range offsets {
+			name := fmt.Sprintf("PRead offset=%d size=%d", offset, size)
+			t.Run(name, func(t *testing.T) {
+				var (
+					wantRead []byte
+					wantErr  error
+				)
+				// A read starting inside the file returns data up to end of
+				// file; a non-empty read starting at or past end of file is
+				// expected to return io.EOF.
+				if offset < len(data) {
+					wantRead = data[offset:]
+				} else if size > 0 {
+					wantErr = io.EOF
+				}
+				// Reads that end before end of file are limited by size.
+				if offset+size < len(data) {
+					wantRead = wantRead[:size]
+				}
+				buf := make([]byte, size)
+				n, err := fd.PRead(ctx, usermem.BytesIOSequence(buf), int64(offset), vfs.ReadOptions{})
+				if err != wantErr {
+					t.Errorf("fd.PRead got err %v want %v", err, wantErr)
+				}
+				if n != int64(len(wantRead)) {
+					t.Errorf("fd.PRead got %d bytes want %d", n, len(wantRead))
+				}
+				if got := string(buf[:n]); got != string(wantRead) {
+					t.Errorf("fd.PRead got %q want %q", got, string(wantRead))
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/sentry/fsimpl/tmpfs/symlink.go b/pkg/sentry/fsimpl/tmpfs/symlink.go
new file mode 100644
index 000000000..5246aca84
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/symlink.go
@@ -0,0 +1,36 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tmpfs
+
+import (
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+)
+
+// symlink is the inode implementation for symbolic links. It embeds its
+// inode by value; newSymlink passes the symlink itself as inode.impl.
+type symlink struct {
+	inode  inode
+	target string // immutable
+}
+
+// newSymlink creates a symlink to target owned by creds and returns its
+// inode, which has mode 0777 and a link count of 1.
+func (fs *filesystem) newSymlink(creds *auth.Credentials, target string) *inode {
+	sl := new(symlink)
+	sl.target = target
+	sl.inode.init(sl, fs, creds, 0777)
+	// The single link is the one from the parent directory.
+	sl.inode.nlink = 1
+	return &sl.inode
+}
+
+// O_PATH is unimplemented, so there's no way to get a FileDescription
+// representing a symlink yet.
diff --git a/pkg/sentry/fsimpl/tmpfs/tmpfs.go b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
new file mode 100644
index 000000000..7be6faa5b
--- /dev/null
+++ b/pkg/sentry/fsimpl/tmpfs/tmpfs.go
@@ -0,0 +1,299 @@
+// Copyright 2019 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tmpfs provides a filesystem implementation that behaves like tmpfs:
+// the Dentry tree is the sole source of truth for the state of the filesystem.
+//
+// Lock order:
+//
+// filesystem.mu
+//   regularFileFD.offMu
+//     regularFile.mu
+//   inode.mu
+package tmpfs
+
+import (
+	"fmt"
+	"math"
+	"sync"
+	"sync/atomic"
+
+	"gvisor.dev/gvisor/pkg/abi/linux"
+	"gvisor.dev/gvisor/pkg/sentry/context"
+	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
+	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
+	"gvisor.dev/gvisor/pkg/sentry/vfs"
+	"gvisor.dev/gvisor/pkg/syserror"
+)
+
+// FilesystemType implements vfs.FilesystemType. It carries no state of its
+// own; each GetFilesystem call builds a new filesystem.
+type FilesystemType struct{}
+
+// filesystem implements vfs.FilesystemImpl.
+type filesystem struct {
+	vfsfs vfs.Filesystem
+
+	// memFile is used to allocate pages for regular files.
+	memFile *pgalloc.MemoryFile
+
+	// mu serializes changes to the Dentry tree.
+	mu sync.RWMutex
+
+	// nextInoMinusOne is one less than the next inode number to be assigned;
+	// inode.init atomically increments it and uses the result as the new
+	// inode's number. It is accessed using atomic memory operations.
+	nextInoMinusOne uint64
+}
+
+// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
+//
+// It builds a new filesystem backed by the MemoryFile obtained from ctx and
+// returns it together with a newly created root directory of mode 01777
+// owned by creds. It panics if ctx provides no MemoryFileProvider.
+func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
+	provider := pgalloc.MemoryFileProviderFromContext(ctx)
+	if provider == nil {
+		panic("MemoryFileProviderFromContext returned nil")
+	}
+	fs := &filesystem{memFile: provider.MemoryFile()}
+	fs.vfsfs.Init(vfsObj, fs)
+	rootInode := fs.newDirectory(creds, 01777)
+	root := fs.newDentry(rootInode)
+	return &fs.vfsfs, &root.vfsd, nil
+}
+
+// Release implements vfs.FilesystemImpl.Release. It is currently a no-op.
+func (fs *filesystem) Release() {
+}
+
+// dentry implements vfs.DentryImpl.
+type dentry struct {
+	vfsd vfs.Dentry
+
+	// inode is the inode represented by this dentry. Multiple Dentries may
+	// share a single non-directory inode (with hard links). inode is
+	// immutable.
+	inode *inode
+
+	// tmpfs doesn't count references on dentries; because the dentry tree is
+	// the sole source of truth, it is by definition always consistent with the
+	// state of the filesystem. However, it does count references on inodes,
+	// because inode resources are released when all references are dropped
+	// (inode.decRef drops a regular file's data on the last reference), so
+	// the dentry's reference methods forward to its inode.
+
+	// dentryEntry links dentries into their parent directory.childList.
+	dentryEntry
+}
+
+// newDentry returns a new dentry that represents inode, with its embedded
+// vfs.Dentry initialized to use the new dentry as its implementation.
+func (fs *filesystem) newDentry(inode *inode) *dentry {
+	d := new(dentry)
+	d.inode = inode
+	d.vfsd.Init(d)
+	return d
+}
+
+// IncRef implements vfs.DentryImpl.IncRef. The reference is taken on d's
+// inode, since tmpfs does not count references on dentries themselves.
+func (d *dentry) IncRef() {
+	d.inode.incRef()
+}
+
+// TryIncRef implements vfs.DentryImpl.TryIncRef. It reports whether a
+// reference could be taken on d's inode.
+func (d *dentry) TryIncRef() bool {
+	return d.inode.tryIncRef()
+}
+
+// DecRef implements vfs.DentryImpl.DecRef. The reference is dropped from d's
+// inode.
+func (d *dentry) DecRef() {
+	d.inode.decRef()
+}
+
+// inode represents a filesystem object.
+type inode struct {
+	// refs is a reference count. refs is accessed using atomic memory
+	// operations.
+	//
+	// A reference is held on all inodes that are reachable in the filesystem
+	// tree. For non-directories (which may have multiple hard links), this
+	// means that a reference is dropped when nlink reaches 0. For directories,
+	// nlink never reaches 0 due to the "." entry; instead,
+	// filesystem.RmdirAt() drops the reference.
+	refs int64
+
+	// Inode metadata; protected by mu and accessed using atomic memory
+	// operations unless otherwise specified.
+	mu    sync.RWMutex
+	mode  uint32 // excluding file type bits, which are based on impl
+	nlink uint32 // protected by filesystem.mu instead of inode.mu
+	uid   uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
+	gid   uint32 // auth.KGID, but stored as raw uint32 for sync/atomic
+	ino   uint64 // immutable
+
+	// impl is the type-specific implementation of this inode. statTo
+	// recognizes *regularFile, *directory, *symlink and *namedPipe. impl is
+	// immutable.
+	impl interface{}
+}
+
+// maxLinks is the largest link count an inode may have; it is the maximum
+// value of the uint32 inode.nlink field.
+const maxLinks = math.MaxUint32
+
+// init initializes i as an inode of fs with implementation impl, permission
+// bits mode, and owner taken from creds' effective KUID and KGID. i starts
+// with one reference and a newly assigned inode number. i.nlink is not set
+// here; the caller must initialize it.
+func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) {
+	i.refs = 1
+	i.mode = uint32(mode)
+	i.uid = uint32(creds.EffectiveKUID)
+	i.gid = uint32(creds.EffectiveKGID)
+	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
+	// i.nlink initialized by caller
+	i.impl = impl
+}
+
+// incLinksLocked increments i's link count. It panics if either
+// precondition on i.nlink is violated.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+// i.nlink < maxLinks.
+func (i *inode) incLinksLocked() {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.incLinksLocked() called with no existing links")
+	}
+	if i.nlink == maxLinks {
+		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
+	}
+	// The store is atomic so that lock-free readers such as statTo observe a
+	// consistent value.
+	atomic.AddUint32(&i.nlink, 1)
+}
+
+// decLinksLocked decrements i's link count. It panics if the link count is
+// already zero.
+//
+// Preconditions: filesystem.mu must be locked for writing. i.nlink != 0.
+func (i *inode) decLinksLocked() {
+	if i.nlink == 0 {
+		panic("tmpfs.inode.decLinksLocked() called with no existing links")
+	}
+	// Adding ^uint32(0) is the sync/atomic idiom for subtracting 1 from an
+	// unsigned value.
+	atomic.AddUint32(&i.nlink, ^uint32(0))
+}
+
+// incRef increments i's reference count. It panics if the count was not
+// already positive, i.e. if the caller did not hold a reference.
+func (i *inode) incRef() {
+	if atomic.AddInt64(&i.refs, 1) <= 1 {
+		panic("tmpfs.inode.incRef() called without holding a reference")
+	}
+}
+
+// tryIncRef increments i's reference count unless it is zero, and reports
+// whether a reference was taken.
+func (i *inode) tryIncRef() bool {
+	for {
+		refs := atomic.LoadInt64(&i.refs)
+		if refs == 0 {
+			return false
+		}
+		// Retry if the count changed between the load and the swap.
+		if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) {
+			return true
+		}
+	}
+}
+
+// decRef drops a reference on i. When the last reference is dropped and i is
+// a regular file, the file's data is dropped and its size is reset to 0.
+// decRef panics if the reference count becomes negative.
+func (i *inode) decRef() {
+	if refs := atomic.AddInt64(&i.refs, -1); refs == 0 {
+		// Hand the file's data ranges back to memFile (presumably freeing the
+		// backing memory; confirm against FileRangeSet.DropAll).
+		if regFile, ok := i.impl.(*regularFile); ok {
+			regFile.mu.Lock()
+			regFile.data.DropAll(regFile.memFile)
+			atomic.StoreUint64(&regFile.size, 0)
+			regFile.mu.Unlock()
+		}
+	} else if refs < 0 {
+		panic("tmpfs.inode.decRef() called without holding a reference")
+	}
+}
+
+// checkPermissions returns the result of vfs.GenericCheckPermissions for
+// creds, ats and isDir, applied to atomically loaded copies of i's mode, uid
+// and gid.
+func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error {
+	mode := uint16(atomic.LoadUint32(&i.mode))
+	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
+	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
+	return vfs.GenericCheckPermissions(creds, ats, isDir, mode, kuid, kgid)
+}
+
+// statTo fills in *stat with i's metadata. The file type bits of stat.Mode,
+// and for regular files and symlinks the size and block count, are derived
+// from i.impl. statTo panics if i.impl is of an unknown type.
+//
+// Go won't inline this function, and returning linux.Statx (which is quite
+// big) means spending a lot of time in runtime.duffcopy(), so instead it's an
+// output parameter.
+func (i *inode) statTo(stat *linux.Statx) {
+	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
+	stat.Blksize = 1 // NOTE(review): should this be usermem.PageSize? confirm
+	stat.Nlink = atomic.LoadUint32(&i.nlink)
+	stat.UID = atomic.LoadUint32(&i.uid)
+	stat.GID = atomic.LoadUint32(&i.gid)
+	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
+	stat.Ino = i.ino
+	// TODO: device number
+	switch impl := i.impl.(type) {
+	case *regularFile:
+		stat.Mode |= linux.S_IFREG
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(atomic.LoadUint64(&impl.size))
+		// NOTE(review): Blocks is derived from the file size rather than from
+		// the memory actually allocated. It could instead be
+		// FileRangeSet.Span() / 512 (cached in a uint64 accessed using atomic
+		// memory operations to avoid taking locks).
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *directory:
+		stat.Mode |= linux.S_IFDIR
+	case *symlink:
+		stat.Mode |= linux.S_IFLNK
+		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
+		stat.Size = uint64(len(impl.target))
+		stat.Blocks = allocatedBlocksForSize(stat.Size)
+	case *namedPipe:
+		stat.Mode |= linux.S_IFIFO
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// allocatedBlocksForSize returns the number of 512B blocks needed to
+// accommodate the given size in bytes, as appropriate for struct
+// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
+// size is independent of the "preferred block size for I/O", struct
+// stat::st_blksize and struct statx::stx_blksize.)
+func allocatedBlocksForSize(size uint64) uint64 {
+	// Divide before rounding up: computing (size + 511) / 512 would wrap
+	// around for sizes within 511 bytes of the uint64 maximum.
+	blocks := size / 512
+	if size%512 != 0 {
+		blocks++
+	}
+	return blocks
+}
+
+// direntType returns the linux.DT_* directory entry type that corresponds to
+// the type of i.impl. It panics if i.impl is of an unknown type.
+func (i *inode) direntType() uint8 {
+	switch i.impl.(type) {
+	case *regularFile:
+		return linux.DT_REG
+	case *directory:
+		return linux.DT_DIR
+	case *symlink:
+		return linux.DT_LNK
+	case *namedPipe:
+		// statTo reports named pipes as S_IFIFO; report the matching dirent
+		// type here rather than panicking.
+		return linux.DT_FIFO
+	default:
+		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
+	}
+}
+
+// fileDescription is embedded by tmpfs implementations of
+// vfs.FileDescriptionImpl. It supplies Stat and SetStat, plus accessors for
+// the filesystem and inode behind the file description.
+type fileDescription struct {
+	vfsfd vfs.FileDescription
+	vfs.FileDescriptionDefaultImpl
+}
+
+// filesystem returns the tmpfs filesystem of the mount that fd was opened on.
+func (fd *fileDescription) filesystem() *filesystem {
+	vfsfs := fd.vfsfd.Mount().Filesystem()
+	return vfsfs.Impl().(*filesystem)
+}
+
+// inode returns the inode represented by fd's dentry.
+func (fd *fileDescription) inode() *inode {
+	d := fd.vfsfd.Dentry().Impl().(*dentry)
+	return d.inode
+}
+
+// Stat implements vfs.FileDescriptionImpl.Stat. opts is not consulted; the
+// result is whatever inode.statTo fills in, and the error is always nil.
+func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
+	var stat linux.Statx
+	fd.inode().statTo(&stat)
+	return stat, nil
+}
+
+// SetStat implements vfs.FileDescriptionImpl.SetStat. A request with an empty
+// mask succeeds without doing anything; any other request fails with EPERM
+// because changing inode metadata is not implemented yet.
+func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
+	if opts.Stat.Mask == 0 {
+		return nil
+	}
+	// TODO: implement inode.setStat
+	return syserror.EPERM
+}
-- 
cgit v1.2.3